diff --git a/.gitattributes b/.gitattributes index 7df904c30b3d0c304b5a11e4428e9bffc5a4cdcc..d3576c47121d0441a1161f6d848060e400590a53 100644 --- a/.gitattributes +++ b/.gitattributes @@ -2242,3 +2242,12 @@ gemma-2b-it_int4_flare-fiqasa_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_g gemma-2b-it_int4_flare-fiqasa_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-525-sd-10000/checkpoint-72/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2b-it_int4_flare-fiqasa_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-525-sd-10000/checkpoint-96/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2b-it_int4_flare-fiqasa_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-525-sd-10000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b9cd9d237e2902cda3565e5ce93b96f803222587 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:366052a3942a4a6232fad69aba123e05e8fa724863182bed58bcc2bcc5938c4c +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0ead780387cdf6c0f7e2ec6e2dbd4016ab0d03bb --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1848f3cb8d10a09370675a4cadd866ec33209db7666b36b2dc1fcb5a0ec31627 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..837fcfc98e6983cadb10c38b9a1afb966f8655a7 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34a886dd6194b364ba5d9943917bb91218b13a26a6a5691ba17eb64f11521c4a +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4d039a4ebc65efdb335af3ce4ce8ade8d8350a98 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0f6bb64462c859a168ba6d7529c4775e9263227bc8150110b370455994f1d29 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9e34e4ef40424ffde20f9f3d3aea54458a01159 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebc2e8236b762b992d340199aa8465d7f043c8b186b252cf2d189b5682fd23b7 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0afcc4e007a7535e229ae9f1f9764566deecafbd --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/trainer_state.json @@ -0,0 +1,1002 @@ +{ + "best_metric": 1.8243104219436646, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902", + "epoch": 3.0, + "eval_steps": 10, + "global_step": 1353, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.022172949002217297, + "grad_norm": 0.6454975008964539, + "learning_rate": 0.0002, + "loss": 2.684, + "step": 10 + }, + { + "epoch": 0.04434589800443459, + "grad_norm": 0.5452715158462524, + "learning_rate": 0.0002, + "loss": 2.2694, + "step": 20 + }, + { + "epoch": 0.06651884700665188, + "grad_norm": 0.5502195358276367, + "learning_rate": 0.0002, + "loss": 2.0796, + "step": 30 + }, + { + "epoch": 0.08869179600886919, + "grad_norm": 0.48551198840141296, + "learning_rate": 0.0002, + "loss": 1.9132, + "step": 40 + }, + { + "epoch": 0.11086474501108648, + "grad_norm": 0.47822514176368713, + "learning_rate": 0.0002, + "loss": 2.016, + "step": 50 + }, + { + "epoch": 0.13303769401330376, + "grad_norm": 0.5125395655632019, + "learning_rate": 0.0002, + "loss": 1.9455, + "step": 60 + }, + { + "epoch": 0.15521064301552107, + "grad_norm": 0.4600693881511688, + "learning_rate": 0.0002, + "loss": 1.9017, + "step": 70 + }, + { + "epoch": 0.17738359201773837, + "grad_norm": 0.7180814743041992, + "learning_rate": 0.0002, + "loss": 2.0057, + "step": 80 + }, + { + "epoch": 0.19955654101995565, + "grad_norm": 0.4712974429130554, + "learning_rate": 0.0002, + "loss": 1.8925, + "step": 90 + }, + { + "epoch": 0.22172949002217296, + "grad_norm": 0.4673261344432831, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 100 + }, + { + "epoch": 0.24390243902439024, + "grad_norm": 0.4129070043563843, + "learning_rate": 0.0002, + "loss": 1.9346, + "step": 110 + }, + { + "epoch": 0.2660753880266075, + "grad_norm": 0.3859104812145233, + "learning_rate": 0.0002, + "loss": 1.9018, + "step": 120 + }, + { + "epoch": 0.28824833702882485, + "grad_norm": 0.40966713428497314, + "learning_rate": 0.0002, + "loss": 1.8922, + "step": 130 + }, + { + "epoch": 0.31042128603104213, + "grad_norm": 0.3685867488384247, + "learning_rate": 0.0002, + "loss": 1.846, + "step": 140 + }, + { + "epoch": 0.3325942350332594, + "grad_norm": 0.39279988408088684, + "learning_rate": 0.0002, + "loss": 1.9017, + "step": 150 + }, + { + "epoch": 0.35476718403547675, + "grad_norm": 0.4195398986339569, + "learning_rate": 0.0002, + "loss": 1.8556, + "step": 160 + }, + { + "epoch": 0.376940133037694, + "grad_norm": 0.469802588224411, + "learning_rate": 0.0002, + "loss": 1.7883, + "step": 170 + }, + { + "epoch": 0.3991130820399113, + "grad_norm": 0.4069509208202362, + "learning_rate": 0.0002, + "loss": 1.8135, + "step": 180 + }, + { + "epoch": 0.4212860310421286, + "grad_norm": 0.47832027077674866, + "learning_rate": 0.0002, + "loss": 1.8429, + "step": 190 + }, + { + "epoch": 0.4434589800443459, + "grad_norm": 0.3376411199569702, + "learning_rate": 0.0002, + "loss": 1.781, + "step": 200 + }, + { + "epoch": 0.4656319290465632, + "grad_norm": 0.3787185847759247, + "learning_rate": 0.0002, + "loss": 1.8562, + "step": 210 + }, + { + "epoch": 0.4878048780487805, + "grad_norm": 0.40322697162628174, + "learning_rate": 0.0002, + "loss": 1.8352, + "step": 220 + }, + { + "epoch": 0.5099778270509978, + "grad_norm": 0.3710436522960663, + "learning_rate": 0.0002, + "loss": 1.8007, + "step": 230 + }, + { + "epoch": 0.532150776053215, + "grad_norm": 0.3723200261592865, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 240 + }, + { + "epoch": 0.5543237250554324, + "grad_norm": 0.3457179069519043, + "learning_rate": 0.0002, + "loss": 1.852, + "step": 250 + }, + { + "epoch": 0.5764966740576497, + "grad_norm": 0.35369473695755005, + "learning_rate": 0.0002, + "loss": 1.8175, + "step": 260 + }, + { + "epoch": 0.5986696230598669, + "grad_norm": 0.3667483329772949, + "learning_rate": 0.0002, + "loss": 1.7742, + "step": 270 + }, + { + "epoch": 0.6208425720620843, + "grad_norm": 0.4023273289203644, + "learning_rate": 0.0002, + "loss": 1.8152, + "step": 280 + }, + { + "epoch": 0.6430155210643016, + "grad_norm": 0.3601929843425751, + "learning_rate": 0.0002, + "loss": 1.8382, + "step": 290 + }, + { + "epoch": 0.6651884700665188, + "grad_norm": 0.32610392570495605, + "learning_rate": 0.0002, + "loss": 1.8152, + "step": 300 + }, + { + "epoch": 0.6873614190687362, + "grad_norm": 0.40528756380081177, + "learning_rate": 0.0002, + "loss": 1.8412, + "step": 310 + }, + { + "epoch": 0.7095343680709535, + "grad_norm": 0.34639739990234375, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 320 + }, + { + "epoch": 0.7317073170731707, + "grad_norm": 0.3794991374015808, + "learning_rate": 0.0002, + "loss": 1.807, + "step": 330 + }, + { + "epoch": 0.753880266075388, + "grad_norm": 0.34203875064849854, + "learning_rate": 0.0002, + "loss": 1.8678, + "step": 340 + }, + { + "epoch": 0.7760532150776053, + "grad_norm": 0.36692821979522705, + "learning_rate": 0.0002, + "loss": 1.7605, + "step": 350 + }, + { + "epoch": 0.7982261640798226, + "grad_norm": 0.3701125979423523, + "learning_rate": 0.0002, + "loss": 1.7535, + "step": 360 + }, + { + "epoch": 0.8203991130820399, + "grad_norm": 0.3971416652202606, + "learning_rate": 0.0002, + "loss": 1.7574, + "step": 370 + }, + { + "epoch": 0.8425720620842572, + "grad_norm": 0.3751989006996155, + "learning_rate": 0.0002, + "loss": 1.8476, + "step": 380 + }, + { + "epoch": 0.8647450110864745, + "grad_norm": 0.35116496682167053, + "learning_rate": 0.0002, + "loss": 1.8281, + "step": 390 + }, + { + "epoch": 0.8869179600886918, + "grad_norm": 0.3672674894332886, + "learning_rate": 0.0002, + "loss": 1.7459, + "step": 400 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.34648260474205017, + "learning_rate": 0.0002, + "loss": 1.8293, + "step": 410 + }, + { + "epoch": 0.9312638580931264, + "grad_norm": 0.4497389793395996, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 420 + }, + { + "epoch": 0.9534368070953437, + "grad_norm": 0.33595147728919983, + "learning_rate": 0.0002, + "loss": 1.8405, + "step": 430 + }, + { + "epoch": 0.975609756097561, + "grad_norm": 0.3130456805229187, + "learning_rate": 0.0002, + "loss": 1.8137, + "step": 440 + }, + { + "epoch": 0.9977827050997783, + "grad_norm": 0.36480239033699036, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 450 + }, + { + "epoch": 1.0, + "eval_loss": 1.8323718309402466, + "eval_runtime": 79.9603, + "eval_samples_per_second": 6.441, + "eval_steps_per_second": 0.813, + "step": 451 + }, + { + "epoch": 1.0199556541019956, + "grad_norm": 0.3840029835700989, + "learning_rate": 0.0002, + "loss": 1.7921, + "step": 460 + }, + { + "epoch": 1.042128603104213, + "grad_norm": 0.33457425236701965, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 470 + }, + { + "epoch": 1.06430155210643, + "grad_norm": 0.35766592621803284, + "learning_rate": 0.0002, + "loss": 1.7028, + "step": 480 + }, + { + "epoch": 1.0864745011086474, + "grad_norm": 0.38070937991142273, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 490 + }, + { + "epoch": 1.1086474501108647, + "grad_norm": 0.38546547293663025, + "learning_rate": 0.0002, + "loss": 1.72, + "step": 500 + }, + { + "epoch": 1.130820399113082, + "grad_norm": 0.384104460477829, + "learning_rate": 0.0002, + "loss": 1.7088, + "step": 510 + }, + { + "epoch": 1.1529933481152994, + "grad_norm": 0.3556116819381714, + "learning_rate": 0.0002, + "loss": 1.7779, + "step": 520 + }, + { + "epoch": 1.1751662971175167, + "grad_norm": 0.4110541343688965, + "learning_rate": 0.0002, + "loss": 1.7538, + "step": 530 + }, + { + "epoch": 1.1973392461197339, + "grad_norm": 0.46503177285194397, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 540 + }, + { + "epoch": 1.2195121951219512, + "grad_norm": 0.4366816580295563, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 550 + }, + { + "epoch": 1.2416851441241685, + "grad_norm": 0.379986047744751, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 560 + }, + { + "epoch": 1.2638580931263859, + "grad_norm": 0.3920869529247284, + "learning_rate": 0.0002, + "loss": 1.6852, + "step": 570 + }, + { + "epoch": 1.2860310421286032, + "grad_norm": 0.4013986587524414, + "learning_rate": 0.0002, + "loss": 1.8265, + "step": 580 + }, + { + "epoch": 1.3082039911308203, + "grad_norm": 0.39104390144348145, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 590 + }, + { + "epoch": 1.3303769401330376, + "grad_norm": 0.40515613555908203, + "learning_rate": 0.0002, + "loss": 1.7822, + "step": 600 + }, + { + "epoch": 1.352549889135255, + "grad_norm": 0.4212331473827362, + "learning_rate": 0.0002, + "loss": 1.7614, + "step": 610 + }, + { + "epoch": 1.3747228381374723, + "grad_norm": 0.36040815711021423, + "learning_rate": 0.0002, + "loss": 1.7883, + "step": 620 + }, + { + "epoch": 1.3968957871396896, + "grad_norm": 0.3950865864753723, + "learning_rate": 0.0002, + "loss": 1.7467, + "step": 630 + }, + { + "epoch": 1.4190687361419068, + "grad_norm": 0.3934709131717682, + "learning_rate": 0.0002, + "loss": 1.7242, + "step": 640 + }, + { + "epoch": 1.441241685144124, + "grad_norm": 0.3905350863933563, + "learning_rate": 0.0002, + "loss": 1.7783, + "step": 650 + }, + { + "epoch": 1.4634146341463414, + "grad_norm": 0.4322686493396759, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 660 + }, + { + "epoch": 1.4855875831485588, + "grad_norm": 0.35697034001350403, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 670 + }, + { + "epoch": 1.507760532150776, + "grad_norm": 0.38570451736450195, + "learning_rate": 0.0002, + "loss": 1.6764, + "step": 680 + }, + { + "epoch": 1.5299334811529932, + "grad_norm": 0.3804517090320587, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 690 + }, + { + "epoch": 1.5521064301552108, + "grad_norm": 0.4938165247440338, + "learning_rate": 0.0002, + "loss": 1.7725, + "step": 700 + }, + { + "epoch": 1.5742793791574279, + "grad_norm": 0.43075236678123474, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 710 + }, + { + "epoch": 1.5964523281596452, + "grad_norm": 0.40434643626213074, + "learning_rate": 0.0002, + "loss": 1.7741, + "step": 720 + }, + { + "epoch": 1.6186252771618626, + "grad_norm": 0.3874157667160034, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 730 + }, + { + "epoch": 1.6407982261640797, + "grad_norm": 0.3645969331264496, + "learning_rate": 0.0002, + "loss": 1.8141, + "step": 740 + }, + { + "epoch": 1.6629711751662972, + "grad_norm": 0.38588255643844604, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 750 + }, + { + "epoch": 1.6851441241685143, + "grad_norm": 0.39252519607543945, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 760 + }, + { + "epoch": 1.7073170731707317, + "grad_norm": 0.36165162920951843, + "learning_rate": 0.0002, + "loss": 1.7424, + "step": 770 + }, + { + "epoch": 1.729490022172949, + "grad_norm": 0.36677947640419006, + "learning_rate": 0.0002, + "loss": 1.7172, + "step": 780 + }, + { + "epoch": 1.7516629711751663, + "grad_norm": 0.374881774187088, + "learning_rate": 0.0002, + "loss": 1.7737, + "step": 790 + }, + { + "epoch": 1.7738359201773837, + "grad_norm": 0.4530802369117737, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 800 + }, + { + "epoch": 1.7960088691796008, + "grad_norm": 0.3879568576812744, + "learning_rate": 0.0002, + "loss": 1.7799, + "step": 810 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.3710079789161682, + "learning_rate": 0.0002, + "loss": 1.7562, + "step": 820 + }, + { + "epoch": 1.8403547671840355, + "grad_norm": 0.3831799030303955, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 830 + }, + { + "epoch": 1.8625277161862528, + "grad_norm": 0.3958432376384735, + "learning_rate": 0.0002, + "loss": 1.7605, + "step": 840 + }, + { + "epoch": 1.8847006651884701, + "grad_norm": 0.4129294157028198, + "learning_rate": 0.0002, + "loss": 1.8187, + "step": 850 + }, + { + "epoch": 1.9068736141906872, + "grad_norm": 0.3714745044708252, + "learning_rate": 0.0002, + "loss": 1.6506, + "step": 860 + }, + { + "epoch": 1.9290465631929048, + "grad_norm": 0.40176868438720703, + "learning_rate": 0.0002, + "loss": 1.707, + "step": 870 + }, + { + "epoch": 1.951219512195122, + "grad_norm": 0.36937767267227173, + "learning_rate": 0.0002, + "loss": 1.7557, + "step": 880 + }, + { + "epoch": 1.9733924611973392, + "grad_norm": 0.40242597460746765, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 890 + }, + { + "epoch": 1.9955654101995566, + "grad_norm": 0.3515510559082031, + "learning_rate": 0.0002, + "loss": 1.7402, + "step": 900 + }, + { + "epoch": 2.0, + "eval_loss": 1.8243104219436646, + "eval_runtime": 107.8856, + "eval_samples_per_second": 4.774, + "eval_steps_per_second": 0.602, + "step": 902 + }, + { + "epoch": 2.0177383592017737, + "grad_norm": 0.4145216643810272, + "learning_rate": 0.0002, + "loss": 1.6631, + "step": 910 + }, + { + "epoch": 2.0399113082039912, + "grad_norm": 0.42423519492149353, + "learning_rate": 0.0002, + "loss": 1.7227, + "step": 920 + }, + { + "epoch": 2.0620842572062084, + "grad_norm": 0.4773229956626892, + "learning_rate": 0.0002, + "loss": 1.6357, + "step": 930 + }, + { + "epoch": 2.084257206208426, + "grad_norm": 0.4144791066646576, + "learning_rate": 0.0002, + "loss": 1.671, + "step": 940 + }, + { + "epoch": 2.106430155210643, + "grad_norm": 0.42704132199287415, + "learning_rate": 0.0002, + "loss": 1.6433, + "step": 950 + }, + { + "epoch": 2.12860310421286, + "grad_norm": 0.4479042589664459, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 960 + }, + { + "epoch": 2.1507760532150777, + "grad_norm": 0.4810638129711151, + "learning_rate": 0.0002, + "loss": 1.6122, + "step": 970 + }, + { + "epoch": 2.172949002217295, + "grad_norm": 0.48669910430908203, + "learning_rate": 0.0002, + "loss": 1.6613, + "step": 980 + }, + { + "epoch": 2.1951219512195124, + "grad_norm": 0.4252761900424957, + "learning_rate": 0.0002, + "loss": 1.6274, + "step": 990 + }, + { + "epoch": 2.2172949002217295, + "grad_norm": 0.42342790961265564, + "learning_rate": 0.0002, + "loss": 1.6514, + "step": 1000 + }, + { + "epoch": 2.2394678492239466, + "grad_norm": 0.43432456254959106, + "learning_rate": 0.0002, + "loss": 1.637, + "step": 1010 + }, + { + "epoch": 2.261640798226164, + "grad_norm": 0.45556965470314026, + "learning_rate": 0.0002, + "loss": 1.6856, + "step": 1020 + }, + { + "epoch": 2.2838137472283813, + "grad_norm": 0.48035719990730286, + "learning_rate": 0.0002, + "loss": 1.6554, + "step": 1030 + }, + { + "epoch": 2.305986696230599, + "grad_norm": 0.4233241081237793, + "learning_rate": 0.0002, + "loss": 1.6947, + "step": 1040 + }, + { + "epoch": 2.328159645232816, + "grad_norm": 0.3918434679508209, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 1050 + }, + { + "epoch": 2.3503325942350335, + "grad_norm": 0.44049757719039917, + "learning_rate": 0.0002, + "loss": 1.6903, + "step": 1060 + }, + { + "epoch": 2.3725055432372506, + "grad_norm": 0.4730056822299957, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 1070 + }, + { + "epoch": 2.3946784922394677, + "grad_norm": 0.4354589581489563, + "learning_rate": 0.0002, + "loss": 1.6104, + "step": 1080 + }, + { + "epoch": 2.4168514412416853, + "grad_norm": 0.4837590456008911, + "learning_rate": 0.0002, + "loss": 1.6496, + "step": 1090 + }, + { + "epoch": 2.4390243902439024, + "grad_norm": 0.4842571020126343, + "learning_rate": 0.0002, + "loss": 1.6672, + "step": 1100 + }, + { + "epoch": 2.4611973392461195, + "grad_norm": 0.46398279070854187, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 1110 + }, + { + "epoch": 2.483370288248337, + "grad_norm": 0.4587327539920807, + "learning_rate": 0.0002, + "loss": 1.6393, + "step": 1120 + }, + { + "epoch": 2.505543237250554, + "grad_norm": 0.4336528480052948, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 1130 + }, + { + "epoch": 2.5277161862527717, + "grad_norm": 0.6162153482437134, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 1140 + }, + { + "epoch": 2.549889135254989, + "grad_norm": 0.48175573348999023, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 1150 + }, + { + "epoch": 2.5720620842572064, + "grad_norm": 0.448272705078125, + "learning_rate": 0.0002, + "loss": 1.6098, + "step": 1160 + }, + { + "epoch": 2.5942350332594235, + "grad_norm": 0.5189200639724731, + "learning_rate": 0.0002, + "loss": 1.6987, + "step": 1170 + }, + { + "epoch": 2.6164079822616406, + "grad_norm": 0.45032963156700134, + "learning_rate": 0.0002, + "loss": 1.6503, + "step": 1180 + }, + { + "epoch": 2.638580931263858, + "grad_norm": 0.4417729377746582, + "learning_rate": 0.0002, + "loss": 1.6508, + "step": 1190 + }, + { + "epoch": 2.6607538802660753, + "grad_norm": 0.5219636559486389, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 1200 + }, + { + "epoch": 2.682926829268293, + "grad_norm": 0.47702011466026306, + "learning_rate": 0.0002, + "loss": 1.6121, + "step": 1210 + }, + { + "epoch": 2.70509977827051, + "grad_norm": 0.4328458607196808, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 1220 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 0.46762076020240784, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 1230 + }, + { + "epoch": 2.7494456762749446, + "grad_norm": 0.4592697322368622, + "learning_rate": 0.0002, + "loss": 1.7287, + "step": 1240 + }, + { + "epoch": 2.7716186252771617, + "grad_norm": 0.5519265532493591, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 1250 + }, + { + "epoch": 2.7937915742793793, + "grad_norm": 0.47169506549835205, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 1260 + }, + { + "epoch": 2.8159645232815964, + "grad_norm": 0.47231653332710266, + "learning_rate": 0.0002, + "loss": 1.658, + "step": 1270 + }, + { + "epoch": 2.8381374722838135, + "grad_norm": 0.49081969261169434, + "learning_rate": 0.0002, + "loss": 1.6738, + "step": 1280 + }, + { + "epoch": 2.860310421286031, + "grad_norm": 0.4483231008052826, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 1290 + }, + { + "epoch": 2.882483370288248, + "grad_norm": 0.5310035943984985, + "learning_rate": 0.0002, + "loss": 1.6428, + "step": 1300 + }, + { + "epoch": 2.9046563192904657, + "grad_norm": 0.4419795572757721, + "learning_rate": 0.0002, + "loss": 1.6515, + "step": 1310 + }, + { + "epoch": 2.926829268292683, + "grad_norm": 0.44630762934684753, + "learning_rate": 0.0002, + "loss": 1.6956, + "step": 1320 + }, + { + "epoch": 2.9490022172949004, + "grad_norm": 0.39774850010871887, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 1330 + }, + { + "epoch": 2.9711751662971175, + "grad_norm": 0.441727876663208, + "learning_rate": 0.0002, + "loss": 1.7007, + "step": 1340 + }, + { + "epoch": 2.9933481152993346, + "grad_norm": 0.43773892521858215, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 1350 + }, + { + "epoch": 3.0, + "eval_loss": 1.8514760732650757, + "eval_runtime": 131.9812, + "eval_samples_per_second": 3.902, + "eval_steps_per_second": 0.492, + "step": 1353 + } + ], + "logging_steps": 10, + "max_steps": 3608, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.261381051029914e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bcdc3b48752889d03e0bc8f748c28d3a3eeb1026 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1353/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4002b3199d379e5c6c8494417ad0ab221d2f73797ae8bbbb3b91f811953dbe5c +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4dd929e76d108f65afcd9407d33ea11ba4923107 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e042fa4f6660d10adb195d69f96a79dddc7d0323418703790e5d2b168c58cacd +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..895ac1646716fd751ee35046eed1360a59dcc939 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f03fe1da905004f22f2b294183057df97fbadfd144f47adb3396ebf3d0854ab0 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..aa4c9fa47f3177142d04a0497126c1f5dd9827f8 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2926e67951a431b5fcc49a840fcae54f7eb556f0a1d2503f85fbed781dcba623 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..7caac2c7bc413abab6814785ce15ea6f5e4d9f6b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f653fb5cef4c0e4265d115e5286ed87e9e5d683bb9526b5af74fa81c3c3a000 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..41de084975b7432041ba80ae2269253560e7a15c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/trainer_state.json @@ -0,0 +1,1325 @@ +{ + "best_metric": 1.8243104219436646, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 1804, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.022172949002217297, + "grad_norm": 0.6454975008964539, + "learning_rate": 0.0002, + "loss": 2.684, + "step": 10 + }, + { + "epoch": 0.04434589800443459, + "grad_norm": 0.5452715158462524, + "learning_rate": 0.0002, + "loss": 2.2694, + "step": 20 + }, + { + "epoch": 0.06651884700665188, + "grad_norm": 0.5502195358276367, + "learning_rate": 0.0002, + "loss": 2.0796, + "step": 30 + }, + { + "epoch": 0.08869179600886919, + "grad_norm": 0.48551198840141296, + "learning_rate": 0.0002, + "loss": 1.9132, + "step": 40 + }, + { + "epoch": 0.11086474501108648, + "grad_norm": 0.47822514176368713, + "learning_rate": 0.0002, + "loss": 2.016, + "step": 50 + }, + { + "epoch": 0.13303769401330376, + "grad_norm": 0.5125395655632019, + "learning_rate": 0.0002, + "loss": 1.9455, + "step": 60 + }, + { + "epoch": 0.15521064301552107, + "grad_norm": 0.4600693881511688, + "learning_rate": 0.0002, + "loss": 1.9017, + "step": 70 + }, + { + "epoch": 0.17738359201773837, + "grad_norm": 0.7180814743041992, + "learning_rate": 0.0002, + "loss": 2.0057, + "step": 80 + }, + { + "epoch": 0.19955654101995565, + "grad_norm": 0.4712974429130554, + "learning_rate": 0.0002, + "loss": 1.8925, + "step": 90 + }, + { + "epoch": 0.22172949002217296, + "grad_norm": 0.4673261344432831, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 100 + }, + { + "epoch": 0.24390243902439024, + "grad_norm": 0.4129070043563843, + "learning_rate": 0.0002, + "loss": 1.9346, + "step": 110 + }, + { + "epoch": 0.2660753880266075, + "grad_norm": 0.3859104812145233, + "learning_rate": 0.0002, + "loss": 1.9018, + "step": 120 + }, + { + "epoch": 0.28824833702882485, + "grad_norm": 0.40966713428497314, + "learning_rate": 0.0002, + "loss": 1.8922, + "step": 130 + }, + { + "epoch": 0.31042128603104213, + "grad_norm": 0.3685867488384247, + "learning_rate": 0.0002, + "loss": 1.846, + "step": 140 + }, + { + "epoch": 0.3325942350332594, + "grad_norm": 0.39279988408088684, + "learning_rate": 0.0002, + "loss": 1.9017, + "step": 150 + }, + { + "epoch": 0.35476718403547675, + "grad_norm": 0.4195398986339569, + "learning_rate": 0.0002, + "loss": 1.8556, + "step": 160 + }, + { + "epoch": 0.376940133037694, + "grad_norm": 0.469802588224411, + "learning_rate": 0.0002, + "loss": 1.7883, + "step": 170 + }, + { + "epoch": 0.3991130820399113, + "grad_norm": 0.4069509208202362, + "learning_rate": 0.0002, + "loss": 1.8135, + "step": 180 + }, + { + "epoch": 0.4212860310421286, + "grad_norm": 0.47832027077674866, + "learning_rate": 0.0002, + "loss": 1.8429, + "step": 190 + }, + { + "epoch": 0.4434589800443459, + "grad_norm": 0.3376411199569702, + "learning_rate": 0.0002, + "loss": 1.781, + "step": 200 + }, + { + "epoch": 0.4656319290465632, + "grad_norm": 0.3787185847759247, + "learning_rate": 0.0002, + "loss": 1.8562, + "step": 210 + }, + { + "epoch": 0.4878048780487805, + "grad_norm": 0.40322697162628174, + "learning_rate": 0.0002, + "loss": 1.8352, + "step": 220 + }, + { + "epoch": 0.5099778270509978, + "grad_norm": 0.3710436522960663, + "learning_rate": 0.0002, + "loss": 1.8007, + "step": 230 + }, + { + "epoch": 0.532150776053215, + "grad_norm": 0.3723200261592865, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 240 + }, + { + "epoch": 0.5543237250554324, + "grad_norm": 0.3457179069519043, + "learning_rate": 0.0002, + "loss": 1.852, + "step": 250 + }, + { + "epoch": 0.5764966740576497, + "grad_norm": 0.35369473695755005, + "learning_rate": 0.0002, + "loss": 1.8175, + "step": 260 + }, + { + "epoch": 0.5986696230598669, + "grad_norm": 0.3667483329772949, + "learning_rate": 0.0002, + "loss": 1.7742, + "step": 270 + }, + { + "epoch": 0.6208425720620843, + "grad_norm": 0.4023273289203644, + "learning_rate": 0.0002, + "loss": 1.8152, + "step": 280 + }, + { + "epoch": 0.6430155210643016, + "grad_norm": 0.3601929843425751, + "learning_rate": 0.0002, + "loss": 1.8382, + "step": 290 + }, + { + "epoch": 0.6651884700665188, + "grad_norm": 0.32610392570495605, + "learning_rate": 0.0002, + "loss": 1.8152, + "step": 300 + }, + { + "epoch": 0.6873614190687362, + "grad_norm": 0.40528756380081177, + "learning_rate": 0.0002, + "loss": 1.8412, + "step": 310 + }, + { + "epoch": 0.7095343680709535, + "grad_norm": 0.34639739990234375, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 320 + }, + { + "epoch": 0.7317073170731707, + "grad_norm": 0.3794991374015808, + "learning_rate": 0.0002, + "loss": 1.807, + "step": 330 + }, + { + "epoch": 0.753880266075388, + "grad_norm": 0.34203875064849854, + "learning_rate": 0.0002, + "loss": 1.8678, + "step": 340 + }, + { + "epoch": 0.7760532150776053, + "grad_norm": 0.36692821979522705, + "learning_rate": 0.0002, + "loss": 1.7605, + "step": 350 + }, + { + "epoch": 0.7982261640798226, + "grad_norm": 0.3701125979423523, + "learning_rate": 0.0002, + "loss": 1.7535, + "step": 360 + }, + { + "epoch": 0.8203991130820399, + "grad_norm": 0.3971416652202606, + "learning_rate": 0.0002, + "loss": 1.7574, + "step": 370 + }, + { + "epoch": 0.8425720620842572, + "grad_norm": 0.3751989006996155, + "learning_rate": 0.0002, + "loss": 1.8476, + "step": 380 + }, + { + "epoch": 0.8647450110864745, + "grad_norm": 0.35116496682167053, + "learning_rate": 0.0002, + "loss": 1.8281, + "step": 390 + }, + { + "epoch": 0.8869179600886918, + "grad_norm": 0.3672674894332886, + "learning_rate": 0.0002, + "loss": 1.7459, + "step": 400 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.34648260474205017, + "learning_rate": 0.0002, + "loss": 1.8293, + "step": 410 + }, + { + "epoch": 0.9312638580931264, + "grad_norm": 0.4497389793395996, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 420 + }, + { + "epoch": 0.9534368070953437, + "grad_norm": 0.33595147728919983, + "learning_rate": 0.0002, + "loss": 1.8405, + "step": 430 + }, + { + "epoch": 0.975609756097561, + "grad_norm": 0.3130456805229187, + "learning_rate": 0.0002, + "loss": 1.8137, + "step": 440 + }, + { + "epoch": 0.9977827050997783, + "grad_norm": 0.36480239033699036, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 450 + }, + { + "epoch": 1.0, + "eval_loss": 1.8323718309402466, + "eval_runtime": 79.9603, + "eval_samples_per_second": 6.441, + "eval_steps_per_second": 0.813, + "step": 451 + }, + { + "epoch": 1.0199556541019956, + "grad_norm": 0.3840029835700989, + "learning_rate": 0.0002, + "loss": 1.7921, + "step": 460 + }, + { + "epoch": 1.042128603104213, + "grad_norm": 0.33457425236701965, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 470 + }, + { + "epoch": 1.06430155210643, + "grad_norm": 0.35766592621803284, + "learning_rate": 0.0002, + "loss": 1.7028, + "step": 480 + }, + { + "epoch": 1.0864745011086474, + "grad_norm": 0.38070937991142273, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 490 + }, + { + "epoch": 1.1086474501108647, + "grad_norm": 0.38546547293663025, + "learning_rate": 0.0002, + "loss": 1.72, + "step": 500 + }, + { + "epoch": 1.130820399113082, + "grad_norm": 0.384104460477829, + "learning_rate": 0.0002, + "loss": 1.7088, + "step": 510 + }, + { + "epoch": 1.1529933481152994, + "grad_norm": 0.3556116819381714, + "learning_rate": 0.0002, + "loss": 1.7779, + "step": 520 + }, + { + "epoch": 1.1751662971175167, + "grad_norm": 0.4110541343688965, + "learning_rate": 0.0002, + "loss": 1.7538, + "step": 530 + }, + { + "epoch": 1.1973392461197339, + "grad_norm": 0.46503177285194397, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 540 + }, + { + "epoch": 1.2195121951219512, + "grad_norm": 0.4366816580295563, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 550 + }, + { + "epoch": 1.2416851441241685, + "grad_norm": 0.379986047744751, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 560 + }, + { + "epoch": 1.2638580931263859, + "grad_norm": 0.3920869529247284, + "learning_rate": 0.0002, + "loss": 1.6852, + "step": 570 + }, + { + "epoch": 1.2860310421286032, + "grad_norm": 0.4013986587524414, + "learning_rate": 0.0002, + "loss": 1.8265, + "step": 580 + }, + { + "epoch": 1.3082039911308203, + "grad_norm": 0.39104390144348145, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 590 + }, + { + "epoch": 1.3303769401330376, + "grad_norm": 0.40515613555908203, + "learning_rate": 0.0002, + "loss": 1.7822, + "step": 600 + }, + { + "epoch": 1.352549889135255, + "grad_norm": 0.4212331473827362, + "learning_rate": 0.0002, + "loss": 1.7614, + "step": 610 + }, + { + "epoch": 1.3747228381374723, + "grad_norm": 0.36040815711021423, + "learning_rate": 0.0002, + "loss": 1.7883, + "step": 620 + }, + { + "epoch": 1.3968957871396896, + "grad_norm": 0.3950865864753723, + "learning_rate": 0.0002, + "loss": 1.7467, + "step": 630 + }, + { + "epoch": 1.4190687361419068, + "grad_norm": 0.3934709131717682, + "learning_rate": 0.0002, + "loss": 1.7242, + "step": 640 + }, + { + "epoch": 1.441241685144124, + "grad_norm": 0.3905350863933563, + "learning_rate": 0.0002, + "loss": 1.7783, + "step": 650 + }, + { + "epoch": 1.4634146341463414, + "grad_norm": 0.4322686493396759, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 660 + }, + { + "epoch": 1.4855875831485588, + "grad_norm": 0.35697034001350403, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 670 + }, + { + "epoch": 1.507760532150776, + "grad_norm": 0.38570451736450195, + "learning_rate": 0.0002, + "loss": 1.6764, + "step": 680 + }, + { + "epoch": 1.5299334811529932, + "grad_norm": 0.3804517090320587, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 690 + }, + { + "epoch": 1.5521064301552108, + "grad_norm": 0.4938165247440338, + "learning_rate": 0.0002, + "loss": 1.7725, + "step": 700 + }, + { + "epoch": 1.5742793791574279, + "grad_norm": 0.43075236678123474, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 710 + }, + { + "epoch": 1.5964523281596452, + "grad_norm": 0.40434643626213074, + "learning_rate": 0.0002, + "loss": 1.7741, + "step": 720 + }, + { + "epoch": 1.6186252771618626, + "grad_norm": 0.3874157667160034, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 730 + }, + { + "epoch": 1.6407982261640797, + "grad_norm": 0.3645969331264496, + "learning_rate": 0.0002, + "loss": 1.8141, + "step": 740 + }, + { + "epoch": 1.6629711751662972, + "grad_norm": 0.38588255643844604, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 750 + }, + { + "epoch": 1.6851441241685143, + "grad_norm": 0.39252519607543945, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 760 + }, + { + "epoch": 1.7073170731707317, + "grad_norm": 0.36165162920951843, + "learning_rate": 0.0002, + "loss": 1.7424, + "step": 770 + }, + { + "epoch": 1.729490022172949, + "grad_norm": 0.36677947640419006, + "learning_rate": 0.0002, + "loss": 1.7172, + "step": 780 + }, + { + "epoch": 1.7516629711751663, + "grad_norm": 0.374881774187088, + "learning_rate": 0.0002, + "loss": 1.7737, + "step": 790 + }, + { + "epoch": 1.7738359201773837, + "grad_norm": 0.4530802369117737, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 800 + }, + { + "epoch": 1.7960088691796008, + "grad_norm": 0.3879568576812744, + "learning_rate": 0.0002, + "loss": 1.7799, + "step": 810 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.3710079789161682, + "learning_rate": 0.0002, + "loss": 1.7562, + "step": 820 + }, + { + "epoch": 1.8403547671840355, + "grad_norm": 0.3831799030303955, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 830 + }, + { + "epoch": 1.8625277161862528, + "grad_norm": 0.3958432376384735, + "learning_rate": 0.0002, + "loss": 1.7605, + "step": 840 + }, + { + "epoch": 1.8847006651884701, + "grad_norm": 0.4129294157028198, + "learning_rate": 0.0002, + "loss": 1.8187, + "step": 850 + }, + { + "epoch": 1.9068736141906872, + "grad_norm": 0.3714745044708252, + "learning_rate": 0.0002, + "loss": 1.6506, + "step": 860 + }, + { + "epoch": 1.9290465631929048, + "grad_norm": 0.40176868438720703, + "learning_rate": 0.0002, + "loss": 1.707, + "step": 870 + }, + { + "epoch": 1.951219512195122, + "grad_norm": 0.36937767267227173, + "learning_rate": 0.0002, + "loss": 1.7557, + "step": 880 + }, + { + "epoch": 1.9733924611973392, + "grad_norm": 0.40242597460746765, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 890 + }, + { + "epoch": 1.9955654101995566, + "grad_norm": 0.3515510559082031, + "learning_rate": 0.0002, + "loss": 1.7402, + "step": 900 + }, + { + "epoch": 2.0, + "eval_loss": 1.8243104219436646, + "eval_runtime": 107.8856, + "eval_samples_per_second": 4.774, + "eval_steps_per_second": 0.602, + "step": 902 + }, + { + "epoch": 2.0177383592017737, + "grad_norm": 0.4145216643810272, + "learning_rate": 0.0002, + "loss": 1.6631, + "step": 910 + }, + { + "epoch": 2.0399113082039912, + "grad_norm": 0.42423519492149353, + "learning_rate": 0.0002, + "loss": 1.7227, + "step": 920 + }, + { + "epoch": 2.0620842572062084, + "grad_norm": 0.4773229956626892, + "learning_rate": 0.0002, + "loss": 1.6357, + "step": 930 + }, + { + "epoch": 2.084257206208426, + "grad_norm": 0.4144791066646576, + "learning_rate": 0.0002, + "loss": 1.671, + "step": 940 + }, + { + "epoch": 2.106430155210643, + "grad_norm": 0.42704132199287415, + "learning_rate": 0.0002, + "loss": 1.6433, + "step": 950 + }, + { + "epoch": 2.12860310421286, + "grad_norm": 0.4479042589664459, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 960 + }, + { + "epoch": 2.1507760532150777, + "grad_norm": 0.4810638129711151, + "learning_rate": 0.0002, + "loss": 1.6122, + "step": 970 + }, + { + "epoch": 2.172949002217295, + "grad_norm": 0.48669910430908203, + "learning_rate": 0.0002, + "loss": 1.6613, + "step": 980 + }, + { + "epoch": 2.1951219512195124, + "grad_norm": 0.4252761900424957, + "learning_rate": 0.0002, + "loss": 1.6274, + "step": 990 + }, + { + "epoch": 2.2172949002217295, + "grad_norm": 0.42342790961265564, + "learning_rate": 0.0002, + "loss": 1.6514, + "step": 1000 + }, + { + "epoch": 2.2394678492239466, + "grad_norm": 0.43432456254959106, + "learning_rate": 0.0002, + "loss": 1.637, + "step": 1010 + }, + { + "epoch": 2.261640798226164, + "grad_norm": 0.45556965470314026, + "learning_rate": 0.0002, + "loss": 1.6856, + "step": 1020 + }, + { + "epoch": 2.2838137472283813, + "grad_norm": 0.48035719990730286, + "learning_rate": 0.0002, + "loss": 1.6554, + "step": 1030 + }, + { + "epoch": 2.305986696230599, + "grad_norm": 0.4233241081237793, + "learning_rate": 0.0002, + "loss": 1.6947, + "step": 1040 + }, + { + "epoch": 2.328159645232816, + "grad_norm": 0.3918434679508209, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 1050 + }, + { + "epoch": 2.3503325942350335, + "grad_norm": 0.44049757719039917, + "learning_rate": 0.0002, + "loss": 1.6903, + "step": 1060 + }, + { + "epoch": 2.3725055432372506, + "grad_norm": 0.4730056822299957, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 1070 + }, + { + "epoch": 2.3946784922394677, + "grad_norm": 0.4354589581489563, + "learning_rate": 0.0002, + "loss": 1.6104, + "step": 1080 + }, + { + "epoch": 2.4168514412416853, + "grad_norm": 0.4837590456008911, + "learning_rate": 0.0002, + "loss": 1.6496, + "step": 1090 + }, + { + "epoch": 2.4390243902439024, + "grad_norm": 0.4842571020126343, + "learning_rate": 0.0002, + "loss": 1.6672, + "step": 1100 + }, + { + "epoch": 2.4611973392461195, + "grad_norm": 0.46398279070854187, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 1110 + }, + { + "epoch": 2.483370288248337, + "grad_norm": 0.4587327539920807, + "learning_rate": 0.0002, + "loss": 1.6393, + "step": 1120 + }, + { + "epoch": 2.505543237250554, + "grad_norm": 0.4336528480052948, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 1130 + }, + { + "epoch": 2.5277161862527717, + "grad_norm": 0.6162153482437134, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 1140 + }, + { + "epoch": 2.549889135254989, + "grad_norm": 0.48175573348999023, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 1150 + }, + { + "epoch": 2.5720620842572064, + "grad_norm": 0.448272705078125, + "learning_rate": 0.0002, + "loss": 1.6098, + "step": 1160 + }, + { + "epoch": 2.5942350332594235, + "grad_norm": 0.5189200639724731, + "learning_rate": 0.0002, + "loss": 1.6987, + "step": 1170 + }, + { + "epoch": 2.6164079822616406, + "grad_norm": 0.45032963156700134, + "learning_rate": 0.0002, + "loss": 1.6503, + "step": 1180 + }, + { + "epoch": 2.638580931263858, + "grad_norm": 0.4417729377746582, + "learning_rate": 0.0002, + "loss": 1.6508, + "step": 1190 + }, + { + "epoch": 2.6607538802660753, + "grad_norm": 0.5219636559486389, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 1200 + }, + { + "epoch": 2.682926829268293, + "grad_norm": 0.47702011466026306, + "learning_rate": 0.0002, + "loss": 1.6121, + "step": 1210 + }, + { + "epoch": 2.70509977827051, + "grad_norm": 0.4328458607196808, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 1220 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 0.46762076020240784, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 1230 + }, + { + "epoch": 2.7494456762749446, + "grad_norm": 0.4592697322368622, + "learning_rate": 0.0002, + "loss": 1.7287, + "step": 1240 + }, + { + "epoch": 2.7716186252771617, + "grad_norm": 0.5519265532493591, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 1250 + }, + { + "epoch": 2.7937915742793793, + "grad_norm": 0.47169506549835205, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 1260 + }, + { + "epoch": 2.8159645232815964, + "grad_norm": 0.47231653332710266, + "learning_rate": 0.0002, + "loss": 1.658, + "step": 1270 + }, + { + "epoch": 2.8381374722838135, + "grad_norm": 0.49081969261169434, + "learning_rate": 0.0002, + "loss": 1.6738, + "step": 1280 + }, + { + "epoch": 2.860310421286031, + "grad_norm": 0.4483231008052826, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 1290 + }, + { + "epoch": 2.882483370288248, + "grad_norm": 0.5310035943984985, + "learning_rate": 0.0002, + "loss": 1.6428, + "step": 1300 + }, + { + "epoch": 2.9046563192904657, + "grad_norm": 0.4419795572757721, + "learning_rate": 0.0002, + "loss": 1.6515, + "step": 1310 + }, + { + "epoch": 2.926829268292683, + "grad_norm": 0.44630762934684753, + "learning_rate": 0.0002, + "loss": 1.6956, + "step": 1320 + }, + { + "epoch": 2.9490022172949004, + "grad_norm": 0.39774850010871887, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 1330 + }, + { + "epoch": 2.9711751662971175, + "grad_norm": 0.441727876663208, + "learning_rate": 0.0002, + "loss": 1.7007, + "step": 1340 + }, + { + "epoch": 2.9933481152993346, + "grad_norm": 0.43773892521858215, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 1350 + }, + { + "epoch": 3.0, + "eval_loss": 1.8514760732650757, + "eval_runtime": 131.9812, + "eval_samples_per_second": 3.902, + "eval_steps_per_second": 0.492, + "step": 1353 + }, + { + "epoch": 3.015521064301552, + "grad_norm": 0.5274150371551514, + "learning_rate": 0.0002, + "loss": 1.553, + "step": 1360 + }, + { + "epoch": 3.0376940133037693, + "grad_norm": 0.5724489092826843, + "learning_rate": 0.0002, + "loss": 1.4784, + "step": 1370 + }, + { + "epoch": 3.059866962305987, + "grad_norm": 0.6182316541671753, + "learning_rate": 0.0002, + "loss": 1.5365, + "step": 1380 + }, + { + "epoch": 3.082039911308204, + "grad_norm": 0.5709688067436218, + "learning_rate": 0.0002, + "loss": 1.4824, + "step": 1390 + }, + { + "epoch": 3.104212860310421, + "grad_norm": 0.6368464231491089, + "learning_rate": 0.0002, + "loss": 1.534, + "step": 1400 + }, + { + "epoch": 3.1263858093126387, + "grad_norm": 0.5680432319641113, + "learning_rate": 0.0002, + "loss": 1.5191, + "step": 1410 + }, + { + "epoch": 3.1485587583148558, + "grad_norm": 0.5805315375328064, + "learning_rate": 0.0002, + "loss": 1.5258, + "step": 1420 + }, + { + "epoch": 3.1707317073170733, + "grad_norm": 0.5782836675643921, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 1430 + }, + { + "epoch": 3.1929046563192904, + "grad_norm": 0.627159595489502, + "learning_rate": 0.0002, + "loss": 1.4852, + "step": 1440 + }, + { + "epoch": 3.2150776053215075, + "grad_norm": 0.6136751174926758, + "learning_rate": 0.0002, + "loss": 1.5398, + "step": 1450 + }, + { + "epoch": 3.237250554323725, + "grad_norm": 0.6319093108177185, + "learning_rate": 0.0002, + "loss": 1.5254, + "step": 1460 + }, + { + "epoch": 3.259423503325942, + "grad_norm": 0.7641780972480774, + "learning_rate": 0.0002, + "loss": 1.5789, + "step": 1470 + }, + { + "epoch": 3.2815964523281598, + "grad_norm": 0.6116001605987549, + "learning_rate": 0.0002, + "loss": 1.5514, + "step": 1480 + }, + { + "epoch": 3.303769401330377, + "grad_norm": 0.6024722456932068, + "learning_rate": 0.0002, + "loss": 1.4647, + "step": 1490 + }, + { + "epoch": 3.3259423503325944, + "grad_norm": 0.5941570997238159, + "learning_rate": 0.0002, + "loss": 1.5561, + "step": 1500 + }, + { + "epoch": 3.3481152993348116, + "grad_norm": 0.608369767665863, + "learning_rate": 0.0002, + "loss": 1.5104, + "step": 1510 + }, + { + "epoch": 3.3702882483370287, + "grad_norm": 0.5942065715789795, + "learning_rate": 0.0002, + "loss": 1.5494, + "step": 1520 + }, + { + "epoch": 3.3924611973392462, + "grad_norm": 0.6382330656051636, + "learning_rate": 0.0002, + "loss": 1.5426, + "step": 1530 + }, + { + "epoch": 3.4146341463414633, + "grad_norm": 0.5839648842811584, + "learning_rate": 0.0002, + "loss": 1.5479, + "step": 1540 + }, + { + "epoch": 3.436807095343681, + "grad_norm": 0.5627358555793762, + "learning_rate": 0.0002, + "loss": 1.5241, + "step": 1550 + }, + { + "epoch": 3.458980044345898, + "grad_norm": 0.6342151761054993, + "learning_rate": 0.0002, + "loss": 1.5679, + "step": 1560 + }, + { + "epoch": 3.481152993348115, + "grad_norm": 0.6370542645454407, + "learning_rate": 0.0002, + "loss": 1.5005, + "step": 1570 + }, + { + "epoch": 3.5033259423503327, + "grad_norm": 0.5974680185317993, + "learning_rate": 0.0002, + "loss": 1.541, + "step": 1580 + }, + { + "epoch": 3.52549889135255, + "grad_norm": 0.6197021007537842, + "learning_rate": 0.0002, + "loss": 1.553, + "step": 1590 + }, + { + "epoch": 3.5476718403547673, + "grad_norm": 0.6413024067878723, + "learning_rate": 0.0002, + "loss": 1.5287, + "step": 1600 + }, + { + "epoch": 3.5698447893569845, + "grad_norm": 0.5878410339355469, + "learning_rate": 0.0002, + "loss": 1.5301, + "step": 1610 + }, + { + "epoch": 3.5920177383592016, + "grad_norm": 0.6485083103179932, + "learning_rate": 0.0002, + "loss": 1.4625, + "step": 1620 + }, + { + "epoch": 3.614190687361419, + "grad_norm": 0.5826634764671326, + "learning_rate": 0.0002, + "loss": 1.5373, + "step": 1630 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 0.8906663656234741, + "learning_rate": 0.0002, + "loss": 1.4952, + "step": 1640 + }, + { + "epoch": 3.658536585365854, + "grad_norm": 0.6288479566574097, + "learning_rate": 0.0002, + "loss": 1.5208, + "step": 1650 + }, + { + "epoch": 3.680709534368071, + "grad_norm": 0.6191049218177795, + "learning_rate": 0.0002, + "loss": 1.6086, + "step": 1660 + }, + { + "epoch": 3.7028824833702885, + "grad_norm": 0.5997978448867798, + "learning_rate": 0.0002, + "loss": 1.5043, + "step": 1670 + }, + { + "epoch": 3.7250554323725056, + "grad_norm": 0.6003038287162781, + "learning_rate": 0.0002, + "loss": 1.5654, + "step": 1680 + }, + { + "epoch": 3.7472283813747227, + "grad_norm": 0.5417194962501526, + "learning_rate": 0.0002, + "loss": 1.4941, + "step": 1690 + }, + { + "epoch": 3.7694013303769403, + "grad_norm": 0.6367442607879639, + "learning_rate": 0.0002, + "loss": 1.5541, + "step": 1700 + }, + { + "epoch": 3.7915742793791574, + "grad_norm": 0.6613120436668396, + "learning_rate": 0.0002, + "loss": 1.5483, + "step": 1710 + }, + { + "epoch": 3.8137472283813745, + "grad_norm": 0.6506749391555786, + "learning_rate": 0.0002, + "loss": 1.5999, + "step": 1720 + }, + { + "epoch": 3.835920177383592, + "grad_norm": 0.5478500723838806, + "learning_rate": 0.0002, + "loss": 1.5207, + "step": 1730 + }, + { + "epoch": 3.858093126385809, + "grad_norm": 0.7313215732574463, + "learning_rate": 0.0002, + "loss": 1.5619, + "step": 1740 + }, + { + "epoch": 3.8802660753880267, + "grad_norm": 0.5453857183456421, + "learning_rate": 0.0002, + "loss": 1.4486, + "step": 1750 + }, + { + "epoch": 3.902439024390244, + "grad_norm": 0.5983547568321228, + "learning_rate": 0.0002, + "loss": 1.4857, + "step": 1760 + }, + { + "epoch": 3.9246119733924614, + "grad_norm": 0.6471580266952515, + "learning_rate": 0.0002, + "loss": 1.651, + "step": 1770 + }, + { + "epoch": 3.9467849223946785, + "grad_norm": 0.5833685398101807, + "learning_rate": 0.0002, + "loss": 1.461, + "step": 1780 + }, + { + "epoch": 3.9689578713968956, + "grad_norm": 0.5509327054023743, + "learning_rate": 0.0002, + "loss": 1.5014, + "step": 1790 + }, + { + "epoch": 3.991130820399113, + "grad_norm": 0.6021352410316467, + "learning_rate": 0.0002, + "loss": 1.6225, + "step": 1800 + }, + { + "epoch": 4.0, + "eval_loss": 1.901047945022583, + "eval_runtime": 82.2708, + "eval_samples_per_second": 6.26, + "eval_steps_per_second": 0.79, + "step": 1804 + } + ], + "logging_steps": 10, + "max_steps": 3608, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.348508068039885e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bcdc3b48752889d03e0bc8f748c28d3a3eeb1026 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-1804/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4002b3199d379e5c6c8494417ad0ab221d2f73797ae8bbbb3b91f811953dbe5c +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..49c4cb797f5f6d1d993c2ed80e57dc7186e24e9e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cf07f9225c47acce8255df5b078acf18ee674c88f1a25edd83debf9fdc56eb4 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2afbee1e232bc0aad62e81dca2d2b8026ddbde7b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c8e824159b5967c2b850787f9db79443ce44412742c1011bb6fb699effd03d0 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..37aeba33b1eeb940baabab74e1d69321ba5fcf13 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0578ffb4b5d7465094efeb0f0665c814295778cc060c421d6dae3a4c22953b7d +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..89e3b702b246094b52dff5715c8d43d37833a159 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab03cd128ebe41f7f991669c20f8a615875e39d4f7862c39727123f0c40fde9e +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..960834839472022c517e15de79e17cc0ba3dc0fd --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/trainer_state.json @@ -0,0 +1,1648 @@ +{ + "best_metric": 1.8243104219436646, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902", + "epoch": 5.0, + "eval_steps": 10, + "global_step": 2255, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.022172949002217297, + "grad_norm": 0.6454975008964539, + "learning_rate": 0.0002, + "loss": 2.684, + "step": 10 + }, + { + "epoch": 0.04434589800443459, + "grad_norm": 0.5452715158462524, + "learning_rate": 0.0002, + "loss": 2.2694, + "step": 20 + }, + { + "epoch": 0.06651884700665188, + "grad_norm": 0.5502195358276367, + "learning_rate": 0.0002, + "loss": 2.0796, + "step": 30 + }, + { + "epoch": 0.08869179600886919, + "grad_norm": 0.48551198840141296, + "learning_rate": 0.0002, + "loss": 1.9132, + "step": 40 + }, + { + "epoch": 0.11086474501108648, + "grad_norm": 0.47822514176368713, + "learning_rate": 0.0002, + "loss": 2.016, + "step": 50 + }, + { + "epoch": 0.13303769401330376, + "grad_norm": 0.5125395655632019, + "learning_rate": 0.0002, + "loss": 1.9455, + "step": 60 + }, + { + "epoch": 0.15521064301552107, + "grad_norm": 0.4600693881511688, + "learning_rate": 0.0002, + "loss": 1.9017, + "step": 70 + }, + { + "epoch": 0.17738359201773837, + "grad_norm": 0.7180814743041992, + "learning_rate": 0.0002, + "loss": 2.0057, + "step": 80 + }, + { + "epoch": 0.19955654101995565, + "grad_norm": 0.4712974429130554, + "learning_rate": 0.0002, + "loss": 1.8925, + "step": 90 + }, + { + "epoch": 0.22172949002217296, + "grad_norm": 0.4673261344432831, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 100 + }, + { + "epoch": 0.24390243902439024, + "grad_norm": 0.4129070043563843, + "learning_rate": 0.0002, + "loss": 1.9346, + "step": 110 + }, + { + "epoch": 0.2660753880266075, + "grad_norm": 0.3859104812145233, + "learning_rate": 0.0002, + "loss": 1.9018, + "step": 120 + }, + { + "epoch": 0.28824833702882485, + "grad_norm": 0.40966713428497314, + "learning_rate": 0.0002, + "loss": 1.8922, + "step": 130 + }, + { + "epoch": 0.31042128603104213, + "grad_norm": 0.3685867488384247, + "learning_rate": 0.0002, + "loss": 1.846, + "step": 140 + }, + { + "epoch": 0.3325942350332594, + "grad_norm": 0.39279988408088684, + "learning_rate": 0.0002, + "loss": 1.9017, + "step": 150 + }, + { + "epoch": 0.35476718403547675, + "grad_norm": 0.4195398986339569, + "learning_rate": 0.0002, + "loss": 1.8556, + "step": 160 + }, + { + "epoch": 0.376940133037694, + "grad_norm": 0.469802588224411, + "learning_rate": 0.0002, + "loss": 1.7883, + "step": 170 + }, + { + "epoch": 0.3991130820399113, + "grad_norm": 0.4069509208202362, + "learning_rate": 0.0002, + "loss": 1.8135, + "step": 180 + }, + { + "epoch": 0.4212860310421286, + "grad_norm": 0.47832027077674866, + "learning_rate": 0.0002, + "loss": 1.8429, + "step": 190 + }, + { + "epoch": 0.4434589800443459, + "grad_norm": 0.3376411199569702, + "learning_rate": 0.0002, + "loss": 1.781, + "step": 200 + }, + { + "epoch": 0.4656319290465632, + "grad_norm": 0.3787185847759247, + "learning_rate": 0.0002, + "loss": 1.8562, + "step": 210 + }, + { + "epoch": 0.4878048780487805, + "grad_norm": 0.40322697162628174, + "learning_rate": 0.0002, + "loss": 1.8352, + "step": 220 + }, + { + "epoch": 0.5099778270509978, + "grad_norm": 0.3710436522960663, + "learning_rate": 0.0002, + "loss": 1.8007, + "step": 230 + }, + { + "epoch": 0.532150776053215, + "grad_norm": 0.3723200261592865, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 240 + }, + { + "epoch": 0.5543237250554324, + "grad_norm": 0.3457179069519043, + "learning_rate": 0.0002, + "loss": 1.852, + "step": 250 + }, + { + "epoch": 0.5764966740576497, + "grad_norm": 0.35369473695755005, + "learning_rate": 0.0002, + "loss": 1.8175, + "step": 260 + }, + { + "epoch": 0.5986696230598669, + "grad_norm": 0.3667483329772949, + "learning_rate": 0.0002, + "loss": 1.7742, + "step": 270 + }, + { + "epoch": 0.6208425720620843, + "grad_norm": 0.4023273289203644, + "learning_rate": 0.0002, + "loss": 1.8152, + "step": 280 + }, + { + "epoch": 0.6430155210643016, + "grad_norm": 0.3601929843425751, + "learning_rate": 0.0002, + "loss": 1.8382, + "step": 290 + }, + { + "epoch": 0.6651884700665188, + "grad_norm": 0.32610392570495605, + "learning_rate": 0.0002, + "loss": 1.8152, + "step": 300 + }, + { + "epoch": 0.6873614190687362, + "grad_norm": 0.40528756380081177, + "learning_rate": 0.0002, + "loss": 1.8412, + "step": 310 + }, + { + "epoch": 0.7095343680709535, + "grad_norm": 0.34639739990234375, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 320 + }, + { + "epoch": 0.7317073170731707, + "grad_norm": 0.3794991374015808, + "learning_rate": 0.0002, + "loss": 1.807, + "step": 330 + }, + { + "epoch": 0.753880266075388, + "grad_norm": 0.34203875064849854, + "learning_rate": 0.0002, + "loss": 1.8678, + "step": 340 + }, + { + "epoch": 0.7760532150776053, + "grad_norm": 0.36692821979522705, + "learning_rate": 0.0002, + "loss": 1.7605, + "step": 350 + }, + { + "epoch": 0.7982261640798226, + "grad_norm": 0.3701125979423523, + "learning_rate": 0.0002, + "loss": 1.7535, + "step": 360 + }, + { + "epoch": 0.8203991130820399, + "grad_norm": 0.3971416652202606, + "learning_rate": 0.0002, + "loss": 1.7574, + "step": 370 + }, + { + "epoch": 0.8425720620842572, + "grad_norm": 0.3751989006996155, + "learning_rate": 0.0002, + "loss": 1.8476, + "step": 380 + }, + { + "epoch": 0.8647450110864745, + "grad_norm": 0.35116496682167053, + "learning_rate": 0.0002, + "loss": 1.8281, + "step": 390 + }, + { + "epoch": 0.8869179600886918, + "grad_norm": 0.3672674894332886, + "learning_rate": 0.0002, + "loss": 1.7459, + "step": 400 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.34648260474205017, + "learning_rate": 0.0002, + "loss": 1.8293, + "step": 410 + }, + { + "epoch": 0.9312638580931264, + "grad_norm": 0.4497389793395996, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 420 + }, + { + "epoch": 0.9534368070953437, + "grad_norm": 0.33595147728919983, + "learning_rate": 0.0002, + "loss": 1.8405, + "step": 430 + }, + { + "epoch": 0.975609756097561, + "grad_norm": 0.3130456805229187, + "learning_rate": 0.0002, + "loss": 1.8137, + "step": 440 + }, + { + "epoch": 0.9977827050997783, + "grad_norm": 0.36480239033699036, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 450 + }, + { + "epoch": 1.0, + "eval_loss": 1.8323718309402466, + "eval_runtime": 79.9603, + "eval_samples_per_second": 6.441, + "eval_steps_per_second": 0.813, + "step": 451 + }, + { + "epoch": 1.0199556541019956, + "grad_norm": 0.3840029835700989, + "learning_rate": 0.0002, + "loss": 1.7921, + "step": 460 + }, + { + "epoch": 1.042128603104213, + "grad_norm": 0.33457425236701965, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 470 + }, + { + "epoch": 1.06430155210643, + "grad_norm": 0.35766592621803284, + "learning_rate": 0.0002, + "loss": 1.7028, + "step": 480 + }, + { + "epoch": 1.0864745011086474, + "grad_norm": 0.38070937991142273, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 490 + }, + { + "epoch": 1.1086474501108647, + "grad_norm": 0.38546547293663025, + "learning_rate": 0.0002, + "loss": 1.72, + "step": 500 + }, + { + "epoch": 1.130820399113082, + "grad_norm": 0.384104460477829, + "learning_rate": 0.0002, + "loss": 1.7088, + "step": 510 + }, + { + "epoch": 1.1529933481152994, + "grad_norm": 0.3556116819381714, + "learning_rate": 0.0002, + "loss": 1.7779, + "step": 520 + }, + { + "epoch": 1.1751662971175167, + "grad_norm": 0.4110541343688965, + "learning_rate": 0.0002, + "loss": 1.7538, + "step": 530 + }, + { + "epoch": 1.1973392461197339, + "grad_norm": 0.46503177285194397, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 540 + }, + { + "epoch": 1.2195121951219512, + "grad_norm": 0.4366816580295563, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 550 + }, + { + "epoch": 1.2416851441241685, + "grad_norm": 0.379986047744751, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 560 + }, + { + "epoch": 1.2638580931263859, + "grad_norm": 0.3920869529247284, + "learning_rate": 0.0002, + "loss": 1.6852, + "step": 570 + }, + { + "epoch": 1.2860310421286032, + "grad_norm": 0.4013986587524414, + "learning_rate": 0.0002, + "loss": 1.8265, + "step": 580 + }, + { + "epoch": 1.3082039911308203, + "grad_norm": 0.39104390144348145, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 590 + }, + { + "epoch": 1.3303769401330376, + "grad_norm": 0.40515613555908203, + "learning_rate": 0.0002, + "loss": 1.7822, + "step": 600 + }, + { + "epoch": 1.352549889135255, + "grad_norm": 0.4212331473827362, + "learning_rate": 0.0002, + "loss": 1.7614, + "step": 610 + }, + { + "epoch": 1.3747228381374723, + "grad_norm": 0.36040815711021423, + "learning_rate": 0.0002, + "loss": 1.7883, + "step": 620 + }, + { + "epoch": 1.3968957871396896, + "grad_norm": 0.3950865864753723, + "learning_rate": 0.0002, + "loss": 1.7467, + "step": 630 + }, + { + "epoch": 1.4190687361419068, + "grad_norm": 0.3934709131717682, + "learning_rate": 0.0002, + "loss": 1.7242, + "step": 640 + }, + { + "epoch": 1.441241685144124, + "grad_norm": 0.3905350863933563, + "learning_rate": 0.0002, + "loss": 1.7783, + "step": 650 + }, + { + "epoch": 1.4634146341463414, + "grad_norm": 0.4322686493396759, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 660 + }, + { + "epoch": 1.4855875831485588, + "grad_norm": 0.35697034001350403, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 670 + }, + { + "epoch": 1.507760532150776, + "grad_norm": 0.38570451736450195, + "learning_rate": 0.0002, + "loss": 1.6764, + "step": 680 + }, + { + "epoch": 1.5299334811529932, + "grad_norm": 0.3804517090320587, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 690 + }, + { + "epoch": 1.5521064301552108, + "grad_norm": 0.4938165247440338, + "learning_rate": 0.0002, + "loss": 1.7725, + "step": 700 + }, + { + "epoch": 1.5742793791574279, + "grad_norm": 0.43075236678123474, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 710 + }, + { + "epoch": 1.5964523281596452, + "grad_norm": 0.40434643626213074, + "learning_rate": 0.0002, + "loss": 1.7741, + "step": 720 + }, + { + "epoch": 1.6186252771618626, + "grad_norm": 0.3874157667160034, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 730 + }, + { + "epoch": 1.6407982261640797, + "grad_norm": 0.3645969331264496, + "learning_rate": 0.0002, + "loss": 1.8141, + "step": 740 + }, + { + "epoch": 1.6629711751662972, + "grad_norm": 0.38588255643844604, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 750 + }, + { + "epoch": 1.6851441241685143, + "grad_norm": 0.39252519607543945, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 760 + }, + { + "epoch": 1.7073170731707317, + "grad_norm": 0.36165162920951843, + "learning_rate": 0.0002, + "loss": 1.7424, + "step": 770 + }, + { + "epoch": 1.729490022172949, + "grad_norm": 0.36677947640419006, + "learning_rate": 0.0002, + "loss": 1.7172, + "step": 780 + }, + { + "epoch": 1.7516629711751663, + "grad_norm": 0.374881774187088, + "learning_rate": 0.0002, + "loss": 1.7737, + "step": 790 + }, + { + "epoch": 1.7738359201773837, + "grad_norm": 0.4530802369117737, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 800 + }, + { + "epoch": 1.7960088691796008, + "grad_norm": 0.3879568576812744, + "learning_rate": 0.0002, + "loss": 1.7799, + "step": 810 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.3710079789161682, + "learning_rate": 0.0002, + "loss": 1.7562, + "step": 820 + }, + { + "epoch": 1.8403547671840355, + "grad_norm": 0.3831799030303955, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 830 + }, + { + "epoch": 1.8625277161862528, + "grad_norm": 0.3958432376384735, + "learning_rate": 0.0002, + "loss": 1.7605, + "step": 840 + }, + { + "epoch": 1.8847006651884701, + "grad_norm": 0.4129294157028198, + "learning_rate": 0.0002, + "loss": 1.8187, + "step": 850 + }, + { + "epoch": 1.9068736141906872, + "grad_norm": 0.3714745044708252, + "learning_rate": 0.0002, + "loss": 1.6506, + "step": 860 + }, + { + "epoch": 1.9290465631929048, + "grad_norm": 0.40176868438720703, + "learning_rate": 0.0002, + "loss": 1.707, + "step": 870 + }, + { + "epoch": 1.951219512195122, + "grad_norm": 0.36937767267227173, + "learning_rate": 0.0002, + "loss": 1.7557, + "step": 880 + }, + { + "epoch": 1.9733924611973392, + "grad_norm": 0.40242597460746765, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 890 + }, + { + "epoch": 1.9955654101995566, + "grad_norm": 0.3515510559082031, + "learning_rate": 0.0002, + "loss": 1.7402, + "step": 900 + }, + { + "epoch": 2.0, + "eval_loss": 1.8243104219436646, + "eval_runtime": 107.8856, + "eval_samples_per_second": 4.774, + "eval_steps_per_second": 0.602, + "step": 902 + }, + { + "epoch": 2.0177383592017737, + "grad_norm": 0.4145216643810272, + "learning_rate": 0.0002, + "loss": 1.6631, + "step": 910 + }, + { + "epoch": 2.0399113082039912, + "grad_norm": 0.42423519492149353, + "learning_rate": 0.0002, + "loss": 1.7227, + "step": 920 + }, + { + "epoch": 2.0620842572062084, + "grad_norm": 0.4773229956626892, + "learning_rate": 0.0002, + "loss": 1.6357, + "step": 930 + }, + { + "epoch": 2.084257206208426, + "grad_norm": 0.4144791066646576, + "learning_rate": 0.0002, + "loss": 1.671, + "step": 940 + }, + { + "epoch": 2.106430155210643, + "grad_norm": 0.42704132199287415, + "learning_rate": 0.0002, + "loss": 1.6433, + "step": 950 + }, + { + "epoch": 2.12860310421286, + "grad_norm": 0.4479042589664459, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 960 + }, + { + "epoch": 2.1507760532150777, + "grad_norm": 0.4810638129711151, + "learning_rate": 0.0002, + "loss": 1.6122, + "step": 970 + }, + { + "epoch": 2.172949002217295, + "grad_norm": 0.48669910430908203, + "learning_rate": 0.0002, + "loss": 1.6613, + "step": 980 + }, + { + "epoch": 2.1951219512195124, + "grad_norm": 0.4252761900424957, + "learning_rate": 0.0002, + "loss": 1.6274, + "step": 990 + }, + { + "epoch": 2.2172949002217295, + "grad_norm": 0.42342790961265564, + "learning_rate": 0.0002, + "loss": 1.6514, + "step": 1000 + }, + { + "epoch": 2.2394678492239466, + "grad_norm": 0.43432456254959106, + "learning_rate": 0.0002, + "loss": 1.637, + "step": 1010 + }, + { + "epoch": 2.261640798226164, + "grad_norm": 0.45556965470314026, + "learning_rate": 0.0002, + "loss": 1.6856, + "step": 1020 + }, + { + "epoch": 2.2838137472283813, + "grad_norm": 0.48035719990730286, + "learning_rate": 0.0002, + "loss": 1.6554, + "step": 1030 + }, + { + "epoch": 2.305986696230599, + "grad_norm": 0.4233241081237793, + "learning_rate": 0.0002, + "loss": 1.6947, + "step": 1040 + }, + { + "epoch": 2.328159645232816, + "grad_norm": 0.3918434679508209, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 1050 + }, + { + "epoch": 2.3503325942350335, + "grad_norm": 0.44049757719039917, + "learning_rate": 0.0002, + "loss": 1.6903, + "step": 1060 + }, + { + "epoch": 2.3725055432372506, + "grad_norm": 0.4730056822299957, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 1070 + }, + { + "epoch": 2.3946784922394677, + "grad_norm": 0.4354589581489563, + "learning_rate": 0.0002, + "loss": 1.6104, + "step": 1080 + }, + { + "epoch": 2.4168514412416853, + "grad_norm": 0.4837590456008911, + "learning_rate": 0.0002, + "loss": 1.6496, + "step": 1090 + }, + { + "epoch": 2.4390243902439024, + "grad_norm": 0.4842571020126343, + "learning_rate": 0.0002, + "loss": 1.6672, + "step": 1100 + }, + { + "epoch": 2.4611973392461195, + "grad_norm": 0.46398279070854187, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 1110 + }, + { + "epoch": 2.483370288248337, + "grad_norm": 0.4587327539920807, + "learning_rate": 0.0002, + "loss": 1.6393, + "step": 1120 + }, + { + "epoch": 2.505543237250554, + "grad_norm": 0.4336528480052948, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 1130 + }, + { + "epoch": 2.5277161862527717, + "grad_norm": 0.6162153482437134, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 1140 + }, + { + "epoch": 2.549889135254989, + "grad_norm": 0.48175573348999023, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 1150 + }, + { + "epoch": 2.5720620842572064, + "grad_norm": 0.448272705078125, + "learning_rate": 0.0002, + "loss": 1.6098, + "step": 1160 + }, + { + "epoch": 2.5942350332594235, + "grad_norm": 0.5189200639724731, + "learning_rate": 0.0002, + "loss": 1.6987, + "step": 1170 + }, + { + "epoch": 2.6164079822616406, + "grad_norm": 0.45032963156700134, + "learning_rate": 0.0002, + "loss": 1.6503, + "step": 1180 + }, + { + "epoch": 2.638580931263858, + "grad_norm": 0.4417729377746582, + "learning_rate": 0.0002, + "loss": 1.6508, + "step": 1190 + }, + { + "epoch": 2.6607538802660753, + "grad_norm": 0.5219636559486389, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 1200 + }, + { + "epoch": 2.682926829268293, + "grad_norm": 0.47702011466026306, + "learning_rate": 0.0002, + "loss": 1.6121, + "step": 1210 + }, + { + "epoch": 2.70509977827051, + "grad_norm": 0.4328458607196808, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 1220 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 0.46762076020240784, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 1230 + }, + { + "epoch": 2.7494456762749446, + "grad_norm": 0.4592697322368622, + "learning_rate": 0.0002, + "loss": 1.7287, + "step": 1240 + }, + { + "epoch": 2.7716186252771617, + "grad_norm": 0.5519265532493591, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 1250 + }, + { + "epoch": 2.7937915742793793, + "grad_norm": 0.47169506549835205, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 1260 + }, + { + "epoch": 2.8159645232815964, + "grad_norm": 0.47231653332710266, + "learning_rate": 0.0002, + "loss": 1.658, + "step": 1270 + }, + { + "epoch": 2.8381374722838135, + "grad_norm": 0.49081969261169434, + "learning_rate": 0.0002, + "loss": 1.6738, + "step": 1280 + }, + { + "epoch": 2.860310421286031, + "grad_norm": 0.4483231008052826, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 1290 + }, + { + "epoch": 2.882483370288248, + "grad_norm": 0.5310035943984985, + "learning_rate": 0.0002, + "loss": 1.6428, + "step": 1300 + }, + { + "epoch": 2.9046563192904657, + "grad_norm": 0.4419795572757721, + "learning_rate": 0.0002, + "loss": 1.6515, + "step": 1310 + }, + { + "epoch": 2.926829268292683, + "grad_norm": 0.44630762934684753, + "learning_rate": 0.0002, + "loss": 1.6956, + "step": 1320 + }, + { + "epoch": 2.9490022172949004, + "grad_norm": 0.39774850010871887, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 1330 + }, + { + "epoch": 2.9711751662971175, + "grad_norm": 0.441727876663208, + "learning_rate": 0.0002, + "loss": 1.7007, + "step": 1340 + }, + { + "epoch": 2.9933481152993346, + "grad_norm": 0.43773892521858215, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 1350 + }, + { + "epoch": 3.0, + "eval_loss": 1.8514760732650757, + "eval_runtime": 131.9812, + "eval_samples_per_second": 3.902, + "eval_steps_per_second": 0.492, + "step": 1353 + }, + { + "epoch": 3.015521064301552, + "grad_norm": 0.5274150371551514, + "learning_rate": 0.0002, + "loss": 1.553, + "step": 1360 + }, + { + "epoch": 3.0376940133037693, + "grad_norm": 0.5724489092826843, + "learning_rate": 0.0002, + "loss": 1.4784, + "step": 1370 + }, + { + "epoch": 3.059866962305987, + "grad_norm": 0.6182316541671753, + "learning_rate": 0.0002, + "loss": 1.5365, + "step": 1380 + }, + { + "epoch": 3.082039911308204, + "grad_norm": 0.5709688067436218, + "learning_rate": 0.0002, + "loss": 1.4824, + "step": 1390 + }, + { + "epoch": 3.104212860310421, + "grad_norm": 0.6368464231491089, + "learning_rate": 0.0002, + "loss": 1.534, + "step": 1400 + }, + { + "epoch": 3.1263858093126387, + "grad_norm": 0.5680432319641113, + "learning_rate": 0.0002, + "loss": 1.5191, + "step": 1410 + }, + { + "epoch": 3.1485587583148558, + "grad_norm": 0.5805315375328064, + "learning_rate": 0.0002, + "loss": 1.5258, + "step": 1420 + }, + { + "epoch": 3.1707317073170733, + "grad_norm": 0.5782836675643921, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 1430 + }, + { + "epoch": 3.1929046563192904, + "grad_norm": 0.627159595489502, + "learning_rate": 0.0002, + "loss": 1.4852, + "step": 1440 + }, + { + "epoch": 3.2150776053215075, + "grad_norm": 0.6136751174926758, + "learning_rate": 0.0002, + "loss": 1.5398, + "step": 1450 + }, + { + "epoch": 3.237250554323725, + "grad_norm": 0.6319093108177185, + "learning_rate": 0.0002, + "loss": 1.5254, + "step": 1460 + }, + { + "epoch": 3.259423503325942, + "grad_norm": 0.7641780972480774, + "learning_rate": 0.0002, + "loss": 1.5789, + "step": 1470 + }, + { + "epoch": 3.2815964523281598, + "grad_norm": 0.6116001605987549, + "learning_rate": 0.0002, + "loss": 1.5514, + "step": 1480 + }, + { + "epoch": 3.303769401330377, + "grad_norm": 0.6024722456932068, + "learning_rate": 0.0002, + "loss": 1.4647, + "step": 1490 + }, + { + "epoch": 3.3259423503325944, + "grad_norm": 0.5941570997238159, + "learning_rate": 0.0002, + "loss": 1.5561, + "step": 1500 + }, + { + "epoch": 3.3481152993348116, + "grad_norm": 0.608369767665863, + "learning_rate": 0.0002, + "loss": 1.5104, + "step": 1510 + }, + { + "epoch": 3.3702882483370287, + "grad_norm": 0.5942065715789795, + "learning_rate": 0.0002, + "loss": 1.5494, + "step": 1520 + }, + { + "epoch": 3.3924611973392462, + "grad_norm": 0.6382330656051636, + "learning_rate": 0.0002, + "loss": 1.5426, + "step": 1530 + }, + { + "epoch": 3.4146341463414633, + "grad_norm": 0.5839648842811584, + "learning_rate": 0.0002, + "loss": 1.5479, + "step": 1540 + }, + { + "epoch": 3.436807095343681, + "grad_norm": 0.5627358555793762, + "learning_rate": 0.0002, + "loss": 1.5241, + "step": 1550 + }, + { + "epoch": 3.458980044345898, + "grad_norm": 0.6342151761054993, + "learning_rate": 0.0002, + "loss": 1.5679, + "step": 1560 + }, + { + "epoch": 3.481152993348115, + "grad_norm": 0.6370542645454407, + "learning_rate": 0.0002, + "loss": 1.5005, + "step": 1570 + }, + { + "epoch": 3.5033259423503327, + "grad_norm": 0.5974680185317993, + "learning_rate": 0.0002, + "loss": 1.541, + "step": 1580 + }, + { + "epoch": 3.52549889135255, + "grad_norm": 0.6197021007537842, + "learning_rate": 0.0002, + "loss": 1.553, + "step": 1590 + }, + { + "epoch": 3.5476718403547673, + "grad_norm": 0.6413024067878723, + "learning_rate": 0.0002, + "loss": 1.5287, + "step": 1600 + }, + { + "epoch": 3.5698447893569845, + "grad_norm": 0.5878410339355469, + "learning_rate": 0.0002, + "loss": 1.5301, + "step": 1610 + }, + { + "epoch": 3.5920177383592016, + "grad_norm": 0.6485083103179932, + "learning_rate": 0.0002, + "loss": 1.4625, + "step": 1620 + }, + { + "epoch": 3.614190687361419, + "grad_norm": 0.5826634764671326, + "learning_rate": 0.0002, + "loss": 1.5373, + "step": 1630 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 0.8906663656234741, + "learning_rate": 0.0002, + "loss": 1.4952, + "step": 1640 + }, + { + "epoch": 3.658536585365854, + "grad_norm": 0.6288479566574097, + "learning_rate": 0.0002, + "loss": 1.5208, + "step": 1650 + }, + { + "epoch": 3.680709534368071, + "grad_norm": 0.6191049218177795, + "learning_rate": 0.0002, + "loss": 1.6086, + "step": 1660 + }, + { + "epoch": 3.7028824833702885, + "grad_norm": 0.5997978448867798, + "learning_rate": 0.0002, + "loss": 1.5043, + "step": 1670 + }, + { + "epoch": 3.7250554323725056, + "grad_norm": 0.6003038287162781, + "learning_rate": 0.0002, + "loss": 1.5654, + "step": 1680 + }, + { + "epoch": 3.7472283813747227, + "grad_norm": 0.5417194962501526, + "learning_rate": 0.0002, + "loss": 1.4941, + "step": 1690 + }, + { + "epoch": 3.7694013303769403, + "grad_norm": 0.6367442607879639, + "learning_rate": 0.0002, + "loss": 1.5541, + "step": 1700 + }, + { + "epoch": 3.7915742793791574, + "grad_norm": 0.6613120436668396, + "learning_rate": 0.0002, + "loss": 1.5483, + "step": 1710 + }, + { + "epoch": 3.8137472283813745, + "grad_norm": 0.6506749391555786, + "learning_rate": 0.0002, + "loss": 1.5999, + "step": 1720 + }, + { + "epoch": 3.835920177383592, + "grad_norm": 0.5478500723838806, + "learning_rate": 0.0002, + "loss": 1.5207, + "step": 1730 + }, + { + "epoch": 3.858093126385809, + "grad_norm": 0.7313215732574463, + "learning_rate": 0.0002, + "loss": 1.5619, + "step": 1740 + }, + { + "epoch": 3.8802660753880267, + "grad_norm": 0.5453857183456421, + "learning_rate": 0.0002, + "loss": 1.4486, + "step": 1750 + }, + { + "epoch": 3.902439024390244, + "grad_norm": 0.5983547568321228, + "learning_rate": 0.0002, + "loss": 1.4857, + "step": 1760 + }, + { + "epoch": 3.9246119733924614, + "grad_norm": 0.6471580266952515, + "learning_rate": 0.0002, + "loss": 1.651, + "step": 1770 + }, + { + "epoch": 3.9467849223946785, + "grad_norm": 0.5833685398101807, + "learning_rate": 0.0002, + "loss": 1.461, + "step": 1780 + }, + { + "epoch": 3.9689578713968956, + "grad_norm": 0.5509327054023743, + "learning_rate": 0.0002, + "loss": 1.5014, + "step": 1790 + }, + { + "epoch": 3.991130820399113, + "grad_norm": 0.6021352410316467, + "learning_rate": 0.0002, + "loss": 1.6225, + "step": 1800 + }, + { + "epoch": 4.0, + "eval_loss": 1.901047945022583, + "eval_runtime": 82.2708, + "eval_samples_per_second": 6.26, + "eval_steps_per_second": 0.79, + "step": 1804 + }, + { + "epoch": 4.013303769401331, + "grad_norm": 0.6232016682624817, + "learning_rate": 0.0002, + "loss": 1.422, + "step": 1810 + }, + { + "epoch": 4.035476718403547, + "grad_norm": 0.7521207928657532, + "learning_rate": 0.0002, + "loss": 1.3769, + "step": 1820 + }, + { + "epoch": 4.057649667405765, + "grad_norm": 0.7839062213897705, + "learning_rate": 0.0002, + "loss": 1.4481, + "step": 1830 + }, + { + "epoch": 4.0798226164079825, + "grad_norm": 0.8654165863990784, + "learning_rate": 0.0002, + "loss": 1.4147, + "step": 1840 + }, + { + "epoch": 4.101995565410199, + "grad_norm": 0.6872738599777222, + "learning_rate": 0.0002, + "loss": 1.2983, + "step": 1850 + }, + { + "epoch": 4.124168514412417, + "grad_norm": 0.7529677748680115, + "learning_rate": 0.0002, + "loss": 1.3115, + "step": 1860 + }, + { + "epoch": 4.146341463414634, + "grad_norm": 0.835027277469635, + "learning_rate": 0.0002, + "loss": 1.3869, + "step": 1870 + }, + { + "epoch": 4.168514412416852, + "grad_norm": 0.7457721829414368, + "learning_rate": 0.0002, + "loss": 1.3273, + "step": 1880 + }, + { + "epoch": 4.1906873614190685, + "grad_norm": 0.7366040349006653, + "learning_rate": 0.0002, + "loss": 1.2893, + "step": 1890 + }, + { + "epoch": 4.212860310421286, + "grad_norm": 0.7802833914756775, + "learning_rate": 0.0002, + "loss": 1.3615, + "step": 1900 + }, + { + "epoch": 4.235033259423504, + "grad_norm": 0.7526614665985107, + "learning_rate": 0.0002, + "loss": 1.3607, + "step": 1910 + }, + { + "epoch": 4.25720620842572, + "grad_norm": 0.7531310319900513, + "learning_rate": 0.0002, + "loss": 1.4384, + "step": 1920 + }, + { + "epoch": 4.279379157427938, + "grad_norm": 0.8899626135826111, + "learning_rate": 0.0002, + "loss": 1.4074, + "step": 1930 + }, + { + "epoch": 4.301552106430155, + "grad_norm": 0.7591356635093689, + "learning_rate": 0.0002, + "loss": 1.328, + "step": 1940 + }, + { + "epoch": 4.323725055432373, + "grad_norm": 0.7126884460449219, + "learning_rate": 0.0002, + "loss": 1.4114, + "step": 1950 + }, + { + "epoch": 4.34589800443459, + "grad_norm": 0.7907777428627014, + "learning_rate": 0.0002, + "loss": 1.4259, + "step": 1960 + }, + { + "epoch": 4.368070953436807, + "grad_norm": 0.7854869961738586, + "learning_rate": 0.0002, + "loss": 1.3982, + "step": 1970 + }, + { + "epoch": 4.390243902439025, + "grad_norm": 0.6982123851776123, + "learning_rate": 0.0002, + "loss": 1.4126, + "step": 1980 + }, + { + "epoch": 4.412416851441241, + "grad_norm": 0.7551925182342529, + "learning_rate": 0.0002, + "loss": 1.3683, + "step": 1990 + }, + { + "epoch": 4.434589800443459, + "grad_norm": 0.864078164100647, + "learning_rate": 0.0002, + "loss": 1.4551, + "step": 2000 + }, + { + "epoch": 4.4567627494456765, + "grad_norm": 0.8406776189804077, + "learning_rate": 0.0002, + "loss": 1.3982, + "step": 2010 + }, + { + "epoch": 4.478935698447893, + "grad_norm": 0.7706766724586487, + "learning_rate": 0.0002, + "loss": 1.3543, + "step": 2020 + }, + { + "epoch": 4.501108647450111, + "grad_norm": 0.7703949213027954, + "learning_rate": 0.0002, + "loss": 1.386, + "step": 2030 + }, + { + "epoch": 4.523281596452328, + "grad_norm": 0.8654166460037231, + "learning_rate": 0.0002, + "loss": 1.4059, + "step": 2040 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 0.7800114750862122, + "learning_rate": 0.0002, + "loss": 1.4067, + "step": 2050 + }, + { + "epoch": 4.5676274944567625, + "grad_norm": 0.7553898692131042, + "learning_rate": 0.0002, + "loss": 1.3578, + "step": 2060 + }, + { + "epoch": 4.58980044345898, + "grad_norm": 0.8689188957214355, + "learning_rate": 0.0002, + "loss": 1.3845, + "step": 2070 + }, + { + "epoch": 4.611973392461198, + "grad_norm": 0.7244092226028442, + "learning_rate": 0.0002, + "loss": 1.3851, + "step": 2080 + }, + { + "epoch": 4.634146341463414, + "grad_norm": 0.9829743504524231, + "learning_rate": 0.0002, + "loss": 1.3627, + "step": 2090 + }, + { + "epoch": 4.656319290465632, + "grad_norm": 0.8026102185249329, + "learning_rate": 0.0002, + "loss": 1.4059, + "step": 2100 + }, + { + "epoch": 4.678492239467849, + "grad_norm": 0.6725143194198608, + "learning_rate": 0.0002, + "loss": 1.3676, + "step": 2110 + }, + { + "epoch": 4.700665188470067, + "grad_norm": 0.8055245876312256, + "learning_rate": 0.0002, + "loss": 1.4669, + "step": 2120 + }, + { + "epoch": 4.722838137472284, + "grad_norm": 0.7507025003433228, + "learning_rate": 0.0002, + "loss": 1.4455, + "step": 2130 + }, + { + "epoch": 4.745011086474501, + "grad_norm": 0.7166216969490051, + "learning_rate": 0.0002, + "loss": 1.3974, + "step": 2140 + }, + { + "epoch": 4.767184035476719, + "grad_norm": 0.6826853156089783, + "learning_rate": 0.0002, + "loss": 1.33, + "step": 2150 + }, + { + "epoch": 4.789356984478935, + "grad_norm": 1.1347891092300415, + "learning_rate": 0.0002, + "loss": 1.3907, + "step": 2160 + }, + { + "epoch": 4.811529933481153, + "grad_norm": 0.8205971121788025, + "learning_rate": 0.0002, + "loss": 1.3737, + "step": 2170 + }, + { + "epoch": 4.8337028824833705, + "grad_norm": 0.7861950397491455, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 2180 + }, + { + "epoch": 4.855875831485587, + "grad_norm": 0.839460551738739, + "learning_rate": 0.0002, + "loss": 1.4293, + "step": 2190 + }, + { + "epoch": 4.878048780487805, + "grad_norm": 0.746583878993988, + "learning_rate": 0.0002, + "loss": 1.3881, + "step": 2200 + }, + { + "epoch": 4.900221729490022, + "grad_norm": 0.7805684804916382, + "learning_rate": 0.0002, + "loss": 1.4519, + "step": 2210 + }, + { + "epoch": 4.922394678492239, + "grad_norm": 0.8079700469970703, + "learning_rate": 0.0002, + "loss": 1.4053, + "step": 2220 + }, + { + "epoch": 4.9445676274944566, + "grad_norm": 0.7609502673149109, + "learning_rate": 0.0002, + "loss": 1.353, + "step": 2230 + }, + { + "epoch": 4.966740576496674, + "grad_norm": 0.7862996459007263, + "learning_rate": 0.0002, + "loss": 1.3816, + "step": 2240 + }, + { + "epoch": 4.988913525498892, + "grad_norm": 0.778677225112915, + "learning_rate": 0.0002, + "loss": 1.4249, + "step": 2250 + }, + { + "epoch": 5.0, + "eval_loss": 1.9658271074295044, + "eval_runtime": 108.3717, + "eval_samples_per_second": 4.752, + "eval_steps_per_second": 0.6, + "step": 2255 + } + ], + "logging_steps": 10, + "max_steps": 3608, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0435635085049856e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bcdc3b48752889d03e0bc8f748c28d3a3eeb1026 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2255/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4002b3199d379e5c6c8494417ad0ab221d2f73797ae8bbbb3b91f811953dbe5c +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aaeecb9a346241fc87abc28e133715ac6443ac61 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8078eb1d5c74b3b9a3aa95805fcb5c96a944449e7fc56556379629c8b28f990 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c35abd29d1cc21dfb447571a21132c3e1a858d25 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42a8f841236f1ae4bb347cab3665945322fd4af0b984fea84a8cd18fab754d4d +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..8a224b9f9875a329adff8dcc3c1d916a49196732 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93be04fddde882abfa7da71e1d08a4abd012272c59a981cdfbd9704e67291b66 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..94057976f72aafb1ee24cf9e3588afd69c8dd866 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dac72b7636e20e2ee2387e6d6200ecf06879769471bf3f9c71f7ef8f7d7397a +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1822feeaad43b4b4206846461e9b8259599bf40a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/trainer_state.json @@ -0,0 +1,1971 @@ +{ + "best_metric": 1.8243104219436646, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 2706, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.022172949002217297, + "grad_norm": 0.6454975008964539, + "learning_rate": 0.0002, + "loss": 2.684, + "step": 10 + }, + { + "epoch": 0.04434589800443459, + "grad_norm": 0.5452715158462524, + "learning_rate": 0.0002, + "loss": 2.2694, + "step": 20 + }, + { + "epoch": 0.06651884700665188, + "grad_norm": 0.5502195358276367, + "learning_rate": 0.0002, + "loss": 2.0796, + "step": 30 + }, + { + "epoch": 0.08869179600886919, + "grad_norm": 0.48551198840141296, + "learning_rate": 0.0002, + "loss": 1.9132, + "step": 40 + }, + { + "epoch": 0.11086474501108648, + "grad_norm": 0.47822514176368713, + "learning_rate": 0.0002, + "loss": 2.016, + "step": 50 + }, + { + "epoch": 0.13303769401330376, + "grad_norm": 0.5125395655632019, + "learning_rate": 0.0002, + "loss": 1.9455, + "step": 60 + }, + { + "epoch": 0.15521064301552107, + "grad_norm": 0.4600693881511688, + "learning_rate": 0.0002, + "loss": 1.9017, + "step": 70 + }, + { + "epoch": 0.17738359201773837, + "grad_norm": 0.7180814743041992, + "learning_rate": 0.0002, + "loss": 2.0057, + "step": 80 + }, + { + "epoch": 0.19955654101995565, + "grad_norm": 0.4712974429130554, + "learning_rate": 0.0002, + "loss": 1.8925, + "step": 90 + }, + { + "epoch": 0.22172949002217296, + "grad_norm": 0.4673261344432831, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 100 + }, + { + "epoch": 0.24390243902439024, + "grad_norm": 0.4129070043563843, + "learning_rate": 0.0002, + "loss": 1.9346, + "step": 110 + }, + { + "epoch": 0.2660753880266075, + "grad_norm": 0.3859104812145233, + "learning_rate": 0.0002, + "loss": 1.9018, + "step": 120 + }, + { + "epoch": 0.28824833702882485, + "grad_norm": 0.40966713428497314, + "learning_rate": 0.0002, + "loss": 1.8922, + "step": 130 + }, + { + "epoch": 0.31042128603104213, + "grad_norm": 0.3685867488384247, + "learning_rate": 0.0002, + "loss": 1.846, + "step": 140 + }, + { + "epoch": 0.3325942350332594, + "grad_norm": 0.39279988408088684, + "learning_rate": 0.0002, + "loss": 1.9017, + "step": 150 + }, + { + "epoch": 0.35476718403547675, + "grad_norm": 0.4195398986339569, + "learning_rate": 0.0002, + "loss": 1.8556, + "step": 160 + }, + { + "epoch": 0.376940133037694, + "grad_norm": 0.469802588224411, + "learning_rate": 0.0002, + "loss": 1.7883, + "step": 170 + }, + { + "epoch": 0.3991130820399113, + "grad_norm": 0.4069509208202362, + "learning_rate": 0.0002, + "loss": 1.8135, + "step": 180 + }, + { + "epoch": 0.4212860310421286, + "grad_norm": 0.47832027077674866, + "learning_rate": 0.0002, + "loss": 1.8429, + "step": 190 + }, + { + "epoch": 0.4434589800443459, + "grad_norm": 0.3376411199569702, + "learning_rate": 0.0002, + "loss": 1.781, + "step": 200 + }, + { + "epoch": 0.4656319290465632, + "grad_norm": 0.3787185847759247, + "learning_rate": 0.0002, + "loss": 1.8562, + "step": 210 + }, + { + "epoch": 0.4878048780487805, + "grad_norm": 0.40322697162628174, + "learning_rate": 0.0002, + "loss": 1.8352, + "step": 220 + }, + { + "epoch": 0.5099778270509978, + "grad_norm": 0.3710436522960663, + "learning_rate": 0.0002, + "loss": 1.8007, + "step": 230 + }, + { + "epoch": 0.532150776053215, + "grad_norm": 0.3723200261592865, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 240 + }, + { + "epoch": 0.5543237250554324, + "grad_norm": 0.3457179069519043, + "learning_rate": 0.0002, + "loss": 1.852, + "step": 250 + }, + { + "epoch": 0.5764966740576497, + "grad_norm": 0.35369473695755005, + "learning_rate": 0.0002, + "loss": 1.8175, + "step": 260 + }, + { + "epoch": 0.5986696230598669, + "grad_norm": 0.3667483329772949, + "learning_rate": 0.0002, + "loss": 1.7742, + "step": 270 + }, + { + "epoch": 0.6208425720620843, + "grad_norm": 0.4023273289203644, + "learning_rate": 0.0002, + "loss": 1.8152, + "step": 280 + }, + { + "epoch": 0.6430155210643016, + "grad_norm": 0.3601929843425751, + "learning_rate": 0.0002, + "loss": 1.8382, + "step": 290 + }, + { + "epoch": 0.6651884700665188, + "grad_norm": 0.32610392570495605, + "learning_rate": 0.0002, + "loss": 1.8152, + "step": 300 + }, + { + "epoch": 0.6873614190687362, + "grad_norm": 0.40528756380081177, + "learning_rate": 0.0002, + "loss": 1.8412, + "step": 310 + }, + { + "epoch": 0.7095343680709535, + "grad_norm": 0.34639739990234375, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 320 + }, + { + "epoch": 0.7317073170731707, + "grad_norm": 0.3794991374015808, + "learning_rate": 0.0002, + "loss": 1.807, + "step": 330 + }, + { + "epoch": 0.753880266075388, + "grad_norm": 0.34203875064849854, + "learning_rate": 0.0002, + "loss": 1.8678, + "step": 340 + }, + { + "epoch": 0.7760532150776053, + "grad_norm": 0.36692821979522705, + "learning_rate": 0.0002, + "loss": 1.7605, + "step": 350 + }, + { + "epoch": 0.7982261640798226, + "grad_norm": 0.3701125979423523, + "learning_rate": 0.0002, + "loss": 1.7535, + "step": 360 + }, + { + "epoch": 0.8203991130820399, + "grad_norm": 0.3971416652202606, + "learning_rate": 0.0002, + "loss": 1.7574, + "step": 370 + }, + { + "epoch": 0.8425720620842572, + "grad_norm": 0.3751989006996155, + "learning_rate": 0.0002, + "loss": 1.8476, + "step": 380 + }, + { + "epoch": 0.8647450110864745, + "grad_norm": 0.35116496682167053, + "learning_rate": 0.0002, + "loss": 1.8281, + "step": 390 + }, + { + "epoch": 0.8869179600886918, + "grad_norm": 0.3672674894332886, + "learning_rate": 0.0002, + "loss": 1.7459, + "step": 400 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.34648260474205017, + "learning_rate": 0.0002, + "loss": 1.8293, + "step": 410 + }, + { + "epoch": 0.9312638580931264, + "grad_norm": 0.4497389793395996, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 420 + }, + { + "epoch": 0.9534368070953437, + "grad_norm": 0.33595147728919983, + "learning_rate": 0.0002, + "loss": 1.8405, + "step": 430 + }, + { + "epoch": 0.975609756097561, + "grad_norm": 0.3130456805229187, + "learning_rate": 0.0002, + "loss": 1.8137, + "step": 440 + }, + { + "epoch": 0.9977827050997783, + "grad_norm": 0.36480239033699036, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 450 + }, + { + "epoch": 1.0, + "eval_loss": 1.8323718309402466, + "eval_runtime": 79.9603, + "eval_samples_per_second": 6.441, + "eval_steps_per_second": 0.813, + "step": 451 + }, + { + "epoch": 1.0199556541019956, + "grad_norm": 0.3840029835700989, + "learning_rate": 0.0002, + "loss": 1.7921, + "step": 460 + }, + { + "epoch": 1.042128603104213, + "grad_norm": 0.33457425236701965, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 470 + }, + { + "epoch": 1.06430155210643, + "grad_norm": 0.35766592621803284, + "learning_rate": 0.0002, + "loss": 1.7028, + "step": 480 + }, + { + "epoch": 1.0864745011086474, + "grad_norm": 0.38070937991142273, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 490 + }, + { + "epoch": 1.1086474501108647, + "grad_norm": 0.38546547293663025, + "learning_rate": 0.0002, + "loss": 1.72, + "step": 500 + }, + { + "epoch": 1.130820399113082, + "grad_norm": 0.384104460477829, + "learning_rate": 0.0002, + "loss": 1.7088, + "step": 510 + }, + { + "epoch": 1.1529933481152994, + "grad_norm": 0.3556116819381714, + "learning_rate": 0.0002, + "loss": 1.7779, + "step": 520 + }, + { + "epoch": 1.1751662971175167, + "grad_norm": 0.4110541343688965, + "learning_rate": 0.0002, + "loss": 1.7538, + "step": 530 + }, + { + "epoch": 1.1973392461197339, + "grad_norm": 0.46503177285194397, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 540 + }, + { + "epoch": 1.2195121951219512, + "grad_norm": 0.4366816580295563, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 550 + }, + { + "epoch": 1.2416851441241685, + "grad_norm": 0.379986047744751, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 560 + }, + { + "epoch": 1.2638580931263859, + "grad_norm": 0.3920869529247284, + "learning_rate": 0.0002, + "loss": 1.6852, + "step": 570 + }, + { + "epoch": 1.2860310421286032, + "grad_norm": 0.4013986587524414, + "learning_rate": 0.0002, + "loss": 1.8265, + "step": 580 + }, + { + "epoch": 1.3082039911308203, + "grad_norm": 0.39104390144348145, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 590 + }, + { + "epoch": 1.3303769401330376, + "grad_norm": 0.40515613555908203, + "learning_rate": 0.0002, + "loss": 1.7822, + "step": 600 + }, + { + "epoch": 1.352549889135255, + "grad_norm": 0.4212331473827362, + "learning_rate": 0.0002, + "loss": 1.7614, + "step": 610 + }, + { + "epoch": 1.3747228381374723, + "grad_norm": 0.36040815711021423, + "learning_rate": 0.0002, + "loss": 1.7883, + "step": 620 + }, + { + "epoch": 1.3968957871396896, + "grad_norm": 0.3950865864753723, + "learning_rate": 0.0002, + "loss": 1.7467, + "step": 630 + }, + { + "epoch": 1.4190687361419068, + "grad_norm": 0.3934709131717682, + "learning_rate": 0.0002, + "loss": 1.7242, + "step": 640 + }, + { + "epoch": 1.441241685144124, + "grad_norm": 0.3905350863933563, + "learning_rate": 0.0002, + "loss": 1.7783, + "step": 650 + }, + { + "epoch": 1.4634146341463414, + "grad_norm": 0.4322686493396759, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 660 + }, + { + "epoch": 1.4855875831485588, + "grad_norm": 0.35697034001350403, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 670 + }, + { + "epoch": 1.507760532150776, + "grad_norm": 0.38570451736450195, + "learning_rate": 0.0002, + "loss": 1.6764, + "step": 680 + }, + { + "epoch": 1.5299334811529932, + "grad_norm": 0.3804517090320587, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 690 + }, + { + "epoch": 1.5521064301552108, + "grad_norm": 0.4938165247440338, + "learning_rate": 0.0002, + "loss": 1.7725, + "step": 700 + }, + { + "epoch": 1.5742793791574279, + "grad_norm": 0.43075236678123474, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 710 + }, + { + "epoch": 1.5964523281596452, + "grad_norm": 0.40434643626213074, + "learning_rate": 0.0002, + "loss": 1.7741, + "step": 720 + }, + { + "epoch": 1.6186252771618626, + "grad_norm": 0.3874157667160034, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 730 + }, + { + "epoch": 1.6407982261640797, + "grad_norm": 0.3645969331264496, + "learning_rate": 0.0002, + "loss": 1.8141, + "step": 740 + }, + { + "epoch": 1.6629711751662972, + "grad_norm": 0.38588255643844604, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 750 + }, + { + "epoch": 1.6851441241685143, + "grad_norm": 0.39252519607543945, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 760 + }, + { + "epoch": 1.7073170731707317, + "grad_norm": 0.36165162920951843, + "learning_rate": 0.0002, + "loss": 1.7424, + "step": 770 + }, + { + "epoch": 1.729490022172949, + "grad_norm": 0.36677947640419006, + "learning_rate": 0.0002, + "loss": 1.7172, + "step": 780 + }, + { + "epoch": 1.7516629711751663, + "grad_norm": 0.374881774187088, + "learning_rate": 0.0002, + "loss": 1.7737, + "step": 790 + }, + { + "epoch": 1.7738359201773837, + "grad_norm": 0.4530802369117737, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 800 + }, + { + "epoch": 1.7960088691796008, + "grad_norm": 0.3879568576812744, + "learning_rate": 0.0002, + "loss": 1.7799, + "step": 810 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.3710079789161682, + "learning_rate": 0.0002, + "loss": 1.7562, + "step": 820 + }, + { + "epoch": 1.8403547671840355, + "grad_norm": 0.3831799030303955, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 830 + }, + { + "epoch": 1.8625277161862528, + "grad_norm": 0.3958432376384735, + "learning_rate": 0.0002, + "loss": 1.7605, + "step": 840 + }, + { + "epoch": 1.8847006651884701, + "grad_norm": 0.4129294157028198, + "learning_rate": 0.0002, + "loss": 1.8187, + "step": 850 + }, + { + "epoch": 1.9068736141906872, + "grad_norm": 0.3714745044708252, + "learning_rate": 0.0002, + "loss": 1.6506, + "step": 860 + }, + { + "epoch": 1.9290465631929048, + "grad_norm": 0.40176868438720703, + "learning_rate": 0.0002, + "loss": 1.707, + "step": 870 + }, + { + "epoch": 1.951219512195122, + "grad_norm": 0.36937767267227173, + "learning_rate": 0.0002, + "loss": 1.7557, + "step": 880 + }, + { + "epoch": 1.9733924611973392, + "grad_norm": 0.40242597460746765, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 890 + }, + { + "epoch": 1.9955654101995566, + "grad_norm": 0.3515510559082031, + "learning_rate": 0.0002, + "loss": 1.7402, + "step": 900 + }, + { + "epoch": 2.0, + "eval_loss": 1.8243104219436646, + "eval_runtime": 107.8856, + "eval_samples_per_second": 4.774, + "eval_steps_per_second": 0.602, + "step": 902 + }, + { + "epoch": 2.0177383592017737, + "grad_norm": 0.4145216643810272, + "learning_rate": 0.0002, + "loss": 1.6631, + "step": 910 + }, + { + "epoch": 2.0399113082039912, + "grad_norm": 0.42423519492149353, + "learning_rate": 0.0002, + "loss": 1.7227, + "step": 920 + }, + { + "epoch": 2.0620842572062084, + "grad_norm": 0.4773229956626892, + "learning_rate": 0.0002, + "loss": 1.6357, + "step": 930 + }, + { + "epoch": 2.084257206208426, + "grad_norm": 0.4144791066646576, + "learning_rate": 0.0002, + "loss": 1.671, + "step": 940 + }, + { + "epoch": 2.106430155210643, + "grad_norm": 0.42704132199287415, + "learning_rate": 0.0002, + "loss": 1.6433, + "step": 950 + }, + { + "epoch": 2.12860310421286, + "grad_norm": 0.4479042589664459, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 960 + }, + { + "epoch": 2.1507760532150777, + "grad_norm": 0.4810638129711151, + "learning_rate": 0.0002, + "loss": 1.6122, + "step": 970 + }, + { + "epoch": 2.172949002217295, + "grad_norm": 0.48669910430908203, + "learning_rate": 0.0002, + "loss": 1.6613, + "step": 980 + }, + { + "epoch": 2.1951219512195124, + "grad_norm": 0.4252761900424957, + "learning_rate": 0.0002, + "loss": 1.6274, + "step": 990 + }, + { + "epoch": 2.2172949002217295, + "grad_norm": 0.42342790961265564, + "learning_rate": 0.0002, + "loss": 1.6514, + "step": 1000 + }, + { + "epoch": 2.2394678492239466, + "grad_norm": 0.43432456254959106, + "learning_rate": 0.0002, + "loss": 1.637, + "step": 1010 + }, + { + "epoch": 2.261640798226164, + "grad_norm": 0.45556965470314026, + "learning_rate": 0.0002, + "loss": 1.6856, + "step": 1020 + }, + { + "epoch": 2.2838137472283813, + "grad_norm": 0.48035719990730286, + "learning_rate": 0.0002, + "loss": 1.6554, + "step": 1030 + }, + { + "epoch": 2.305986696230599, + "grad_norm": 0.4233241081237793, + "learning_rate": 0.0002, + "loss": 1.6947, + "step": 1040 + }, + { + "epoch": 2.328159645232816, + "grad_norm": 0.3918434679508209, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 1050 + }, + { + "epoch": 2.3503325942350335, + "grad_norm": 0.44049757719039917, + "learning_rate": 0.0002, + "loss": 1.6903, + "step": 1060 + }, + { + "epoch": 2.3725055432372506, + "grad_norm": 0.4730056822299957, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 1070 + }, + { + "epoch": 2.3946784922394677, + "grad_norm": 0.4354589581489563, + "learning_rate": 0.0002, + "loss": 1.6104, + "step": 1080 + }, + { + "epoch": 2.4168514412416853, + "grad_norm": 0.4837590456008911, + "learning_rate": 0.0002, + "loss": 1.6496, + "step": 1090 + }, + { + "epoch": 2.4390243902439024, + "grad_norm": 0.4842571020126343, + "learning_rate": 0.0002, + "loss": 1.6672, + "step": 1100 + }, + { + "epoch": 2.4611973392461195, + "grad_norm": 0.46398279070854187, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 1110 + }, + { + "epoch": 2.483370288248337, + "grad_norm": 0.4587327539920807, + "learning_rate": 0.0002, + "loss": 1.6393, + "step": 1120 + }, + { + "epoch": 2.505543237250554, + "grad_norm": 0.4336528480052948, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 1130 + }, + { + "epoch": 2.5277161862527717, + "grad_norm": 0.6162153482437134, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 1140 + }, + { + "epoch": 2.549889135254989, + "grad_norm": 0.48175573348999023, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 1150 + }, + { + "epoch": 2.5720620842572064, + "grad_norm": 0.448272705078125, + "learning_rate": 0.0002, + "loss": 1.6098, + "step": 1160 + }, + { + "epoch": 2.5942350332594235, + "grad_norm": 0.5189200639724731, + "learning_rate": 0.0002, + "loss": 1.6987, + "step": 1170 + }, + { + "epoch": 2.6164079822616406, + "grad_norm": 0.45032963156700134, + "learning_rate": 0.0002, + "loss": 1.6503, + "step": 1180 + }, + { + "epoch": 2.638580931263858, + "grad_norm": 0.4417729377746582, + "learning_rate": 0.0002, + "loss": 1.6508, + "step": 1190 + }, + { + "epoch": 2.6607538802660753, + "grad_norm": 0.5219636559486389, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 1200 + }, + { + "epoch": 2.682926829268293, + "grad_norm": 0.47702011466026306, + "learning_rate": 0.0002, + "loss": 1.6121, + "step": 1210 + }, + { + "epoch": 2.70509977827051, + "grad_norm": 0.4328458607196808, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 1220 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 0.46762076020240784, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 1230 + }, + { + "epoch": 2.7494456762749446, + "grad_norm": 0.4592697322368622, + "learning_rate": 0.0002, + "loss": 1.7287, + "step": 1240 + }, + { + "epoch": 2.7716186252771617, + "grad_norm": 0.5519265532493591, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 1250 + }, + { + "epoch": 2.7937915742793793, + "grad_norm": 0.47169506549835205, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 1260 + }, + { + "epoch": 2.8159645232815964, + "grad_norm": 0.47231653332710266, + "learning_rate": 0.0002, + "loss": 1.658, + "step": 1270 + }, + { + "epoch": 2.8381374722838135, + "grad_norm": 0.49081969261169434, + "learning_rate": 0.0002, + "loss": 1.6738, + "step": 1280 + }, + { + "epoch": 2.860310421286031, + "grad_norm": 0.4483231008052826, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 1290 + }, + { + "epoch": 2.882483370288248, + "grad_norm": 0.5310035943984985, + "learning_rate": 0.0002, + "loss": 1.6428, + "step": 1300 + }, + { + "epoch": 2.9046563192904657, + "grad_norm": 0.4419795572757721, + "learning_rate": 0.0002, + "loss": 1.6515, + "step": 1310 + }, + { + "epoch": 2.926829268292683, + "grad_norm": 0.44630762934684753, + "learning_rate": 0.0002, + "loss": 1.6956, + "step": 1320 + }, + { + "epoch": 2.9490022172949004, + "grad_norm": 0.39774850010871887, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 1330 + }, + { + "epoch": 2.9711751662971175, + "grad_norm": 0.441727876663208, + "learning_rate": 0.0002, + "loss": 1.7007, + "step": 1340 + }, + { + "epoch": 2.9933481152993346, + "grad_norm": 0.43773892521858215, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 1350 + }, + { + "epoch": 3.0, + "eval_loss": 1.8514760732650757, + "eval_runtime": 131.9812, + "eval_samples_per_second": 3.902, + "eval_steps_per_second": 0.492, + "step": 1353 + }, + { + "epoch": 3.015521064301552, + "grad_norm": 0.5274150371551514, + "learning_rate": 0.0002, + "loss": 1.553, + "step": 1360 + }, + { + "epoch": 3.0376940133037693, + "grad_norm": 0.5724489092826843, + "learning_rate": 0.0002, + "loss": 1.4784, + "step": 1370 + }, + { + "epoch": 3.059866962305987, + "grad_norm": 0.6182316541671753, + "learning_rate": 0.0002, + "loss": 1.5365, + "step": 1380 + }, + { + "epoch": 3.082039911308204, + "grad_norm": 0.5709688067436218, + "learning_rate": 0.0002, + "loss": 1.4824, + "step": 1390 + }, + { + "epoch": 3.104212860310421, + "grad_norm": 0.6368464231491089, + "learning_rate": 0.0002, + "loss": 1.534, + "step": 1400 + }, + { + "epoch": 3.1263858093126387, + "grad_norm": 0.5680432319641113, + "learning_rate": 0.0002, + "loss": 1.5191, + "step": 1410 + }, + { + "epoch": 3.1485587583148558, + "grad_norm": 0.5805315375328064, + "learning_rate": 0.0002, + "loss": 1.5258, + "step": 1420 + }, + { + "epoch": 3.1707317073170733, + "grad_norm": 0.5782836675643921, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 1430 + }, + { + "epoch": 3.1929046563192904, + "grad_norm": 0.627159595489502, + "learning_rate": 0.0002, + "loss": 1.4852, + "step": 1440 + }, + { + "epoch": 3.2150776053215075, + "grad_norm": 0.6136751174926758, + "learning_rate": 0.0002, + "loss": 1.5398, + "step": 1450 + }, + { + "epoch": 3.237250554323725, + "grad_norm": 0.6319093108177185, + "learning_rate": 0.0002, + "loss": 1.5254, + "step": 1460 + }, + { + "epoch": 3.259423503325942, + "grad_norm": 0.7641780972480774, + "learning_rate": 0.0002, + "loss": 1.5789, + "step": 1470 + }, + { + "epoch": 3.2815964523281598, + "grad_norm": 0.6116001605987549, + "learning_rate": 0.0002, + "loss": 1.5514, + "step": 1480 + }, + { + "epoch": 3.303769401330377, + "grad_norm": 0.6024722456932068, + "learning_rate": 0.0002, + "loss": 1.4647, + "step": 1490 + }, + { + "epoch": 3.3259423503325944, + "grad_norm": 0.5941570997238159, + "learning_rate": 0.0002, + "loss": 1.5561, + "step": 1500 + }, + { + "epoch": 3.3481152993348116, + "grad_norm": 0.608369767665863, + "learning_rate": 0.0002, + "loss": 1.5104, + "step": 1510 + }, + { + "epoch": 3.3702882483370287, + "grad_norm": 0.5942065715789795, + "learning_rate": 0.0002, + "loss": 1.5494, + "step": 1520 + }, + { + "epoch": 3.3924611973392462, + "grad_norm": 0.6382330656051636, + "learning_rate": 0.0002, + "loss": 1.5426, + "step": 1530 + }, + { + "epoch": 3.4146341463414633, + "grad_norm": 0.5839648842811584, + "learning_rate": 0.0002, + "loss": 1.5479, + "step": 1540 + }, + { + "epoch": 3.436807095343681, + "grad_norm": 0.5627358555793762, + "learning_rate": 0.0002, + "loss": 1.5241, + "step": 1550 + }, + { + "epoch": 3.458980044345898, + "grad_norm": 0.6342151761054993, + "learning_rate": 0.0002, + "loss": 1.5679, + "step": 1560 + }, + { + "epoch": 3.481152993348115, + "grad_norm": 0.6370542645454407, + "learning_rate": 0.0002, + "loss": 1.5005, + "step": 1570 + }, + { + "epoch": 3.5033259423503327, + "grad_norm": 0.5974680185317993, + "learning_rate": 0.0002, + "loss": 1.541, + "step": 1580 + }, + { + "epoch": 3.52549889135255, + "grad_norm": 0.6197021007537842, + "learning_rate": 0.0002, + "loss": 1.553, + "step": 1590 + }, + { + "epoch": 3.5476718403547673, + "grad_norm": 0.6413024067878723, + "learning_rate": 0.0002, + "loss": 1.5287, + "step": 1600 + }, + { + "epoch": 3.5698447893569845, + "grad_norm": 0.5878410339355469, + "learning_rate": 0.0002, + "loss": 1.5301, + "step": 1610 + }, + { + "epoch": 3.5920177383592016, + "grad_norm": 0.6485083103179932, + "learning_rate": 0.0002, + "loss": 1.4625, + "step": 1620 + }, + { + "epoch": 3.614190687361419, + "grad_norm": 0.5826634764671326, + "learning_rate": 0.0002, + "loss": 1.5373, + "step": 1630 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 0.8906663656234741, + "learning_rate": 0.0002, + "loss": 1.4952, + "step": 1640 + }, + { + "epoch": 3.658536585365854, + "grad_norm": 0.6288479566574097, + "learning_rate": 0.0002, + "loss": 1.5208, + "step": 1650 + }, + { + "epoch": 3.680709534368071, + "grad_norm": 0.6191049218177795, + "learning_rate": 0.0002, + "loss": 1.6086, + "step": 1660 + }, + { + "epoch": 3.7028824833702885, + "grad_norm": 0.5997978448867798, + "learning_rate": 0.0002, + "loss": 1.5043, + "step": 1670 + }, + { + "epoch": 3.7250554323725056, + "grad_norm": 0.6003038287162781, + "learning_rate": 0.0002, + "loss": 1.5654, + "step": 1680 + }, + { + "epoch": 3.7472283813747227, + "grad_norm": 0.5417194962501526, + "learning_rate": 0.0002, + "loss": 1.4941, + "step": 1690 + }, + { + "epoch": 3.7694013303769403, + "grad_norm": 0.6367442607879639, + "learning_rate": 0.0002, + "loss": 1.5541, + "step": 1700 + }, + { + "epoch": 3.7915742793791574, + "grad_norm": 0.6613120436668396, + "learning_rate": 0.0002, + "loss": 1.5483, + "step": 1710 + }, + { + "epoch": 3.8137472283813745, + "grad_norm": 0.6506749391555786, + "learning_rate": 0.0002, + "loss": 1.5999, + "step": 1720 + }, + { + "epoch": 3.835920177383592, + "grad_norm": 0.5478500723838806, + "learning_rate": 0.0002, + "loss": 1.5207, + "step": 1730 + }, + { + "epoch": 3.858093126385809, + "grad_norm": 0.7313215732574463, + "learning_rate": 0.0002, + "loss": 1.5619, + "step": 1740 + }, + { + "epoch": 3.8802660753880267, + "grad_norm": 0.5453857183456421, + "learning_rate": 0.0002, + "loss": 1.4486, + "step": 1750 + }, + { + "epoch": 3.902439024390244, + "grad_norm": 0.5983547568321228, + "learning_rate": 0.0002, + "loss": 1.4857, + "step": 1760 + }, + { + "epoch": 3.9246119733924614, + "grad_norm": 0.6471580266952515, + "learning_rate": 0.0002, + "loss": 1.651, + "step": 1770 + }, + { + "epoch": 3.9467849223946785, + "grad_norm": 0.5833685398101807, + "learning_rate": 0.0002, + "loss": 1.461, + "step": 1780 + }, + { + "epoch": 3.9689578713968956, + "grad_norm": 0.5509327054023743, + "learning_rate": 0.0002, + "loss": 1.5014, + "step": 1790 + }, + { + "epoch": 3.991130820399113, + "grad_norm": 0.6021352410316467, + "learning_rate": 0.0002, + "loss": 1.6225, + "step": 1800 + }, + { + "epoch": 4.0, + "eval_loss": 1.901047945022583, + "eval_runtime": 82.2708, + "eval_samples_per_second": 6.26, + "eval_steps_per_second": 0.79, + "step": 1804 + }, + { + "epoch": 4.013303769401331, + "grad_norm": 0.6232016682624817, + "learning_rate": 0.0002, + "loss": 1.422, + "step": 1810 + }, + { + "epoch": 4.035476718403547, + "grad_norm": 0.7521207928657532, + "learning_rate": 0.0002, + "loss": 1.3769, + "step": 1820 + }, + { + "epoch": 4.057649667405765, + "grad_norm": 0.7839062213897705, + "learning_rate": 0.0002, + "loss": 1.4481, + "step": 1830 + }, + { + "epoch": 4.0798226164079825, + "grad_norm": 0.8654165863990784, + "learning_rate": 0.0002, + "loss": 1.4147, + "step": 1840 + }, + { + "epoch": 4.101995565410199, + "grad_norm": 0.6872738599777222, + "learning_rate": 0.0002, + "loss": 1.2983, + "step": 1850 + }, + { + "epoch": 4.124168514412417, + "grad_norm": 0.7529677748680115, + "learning_rate": 0.0002, + "loss": 1.3115, + "step": 1860 + }, + { + "epoch": 4.146341463414634, + "grad_norm": 0.835027277469635, + "learning_rate": 0.0002, + "loss": 1.3869, + "step": 1870 + }, + { + "epoch": 4.168514412416852, + "grad_norm": 0.7457721829414368, + "learning_rate": 0.0002, + "loss": 1.3273, + "step": 1880 + }, + { + "epoch": 4.1906873614190685, + "grad_norm": 0.7366040349006653, + "learning_rate": 0.0002, + "loss": 1.2893, + "step": 1890 + }, + { + "epoch": 4.212860310421286, + "grad_norm": 0.7802833914756775, + "learning_rate": 0.0002, + "loss": 1.3615, + "step": 1900 + }, + { + "epoch": 4.235033259423504, + "grad_norm": 0.7526614665985107, + "learning_rate": 0.0002, + "loss": 1.3607, + "step": 1910 + }, + { + "epoch": 4.25720620842572, + "grad_norm": 0.7531310319900513, + "learning_rate": 0.0002, + "loss": 1.4384, + "step": 1920 + }, + { + "epoch": 4.279379157427938, + "grad_norm": 0.8899626135826111, + "learning_rate": 0.0002, + "loss": 1.4074, + "step": 1930 + }, + { + "epoch": 4.301552106430155, + "grad_norm": 0.7591356635093689, + "learning_rate": 0.0002, + "loss": 1.328, + "step": 1940 + }, + { + "epoch": 4.323725055432373, + "grad_norm": 0.7126884460449219, + "learning_rate": 0.0002, + "loss": 1.4114, + "step": 1950 + }, + { + "epoch": 4.34589800443459, + "grad_norm": 0.7907777428627014, + "learning_rate": 0.0002, + "loss": 1.4259, + "step": 1960 + }, + { + "epoch": 4.368070953436807, + "grad_norm": 0.7854869961738586, + "learning_rate": 0.0002, + "loss": 1.3982, + "step": 1970 + }, + { + "epoch": 4.390243902439025, + "grad_norm": 0.6982123851776123, + "learning_rate": 0.0002, + "loss": 1.4126, + "step": 1980 + }, + { + "epoch": 4.412416851441241, + "grad_norm": 0.7551925182342529, + "learning_rate": 0.0002, + "loss": 1.3683, + "step": 1990 + }, + { + "epoch": 4.434589800443459, + "grad_norm": 0.864078164100647, + "learning_rate": 0.0002, + "loss": 1.4551, + "step": 2000 + }, + { + "epoch": 4.4567627494456765, + "grad_norm": 0.8406776189804077, + "learning_rate": 0.0002, + "loss": 1.3982, + "step": 2010 + }, + { + "epoch": 4.478935698447893, + "grad_norm": 0.7706766724586487, + "learning_rate": 0.0002, + "loss": 1.3543, + "step": 2020 + }, + { + "epoch": 4.501108647450111, + "grad_norm": 0.7703949213027954, + "learning_rate": 0.0002, + "loss": 1.386, + "step": 2030 + }, + { + "epoch": 4.523281596452328, + "grad_norm": 0.8654166460037231, + "learning_rate": 0.0002, + "loss": 1.4059, + "step": 2040 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 0.7800114750862122, + "learning_rate": 0.0002, + "loss": 1.4067, + "step": 2050 + }, + { + "epoch": 4.5676274944567625, + "grad_norm": 0.7553898692131042, + "learning_rate": 0.0002, + "loss": 1.3578, + "step": 2060 + }, + { + "epoch": 4.58980044345898, + "grad_norm": 0.8689188957214355, + "learning_rate": 0.0002, + "loss": 1.3845, + "step": 2070 + }, + { + "epoch": 4.611973392461198, + "grad_norm": 0.7244092226028442, + "learning_rate": 0.0002, + "loss": 1.3851, + "step": 2080 + }, + { + "epoch": 4.634146341463414, + "grad_norm": 0.9829743504524231, + "learning_rate": 0.0002, + "loss": 1.3627, + "step": 2090 + }, + { + "epoch": 4.656319290465632, + "grad_norm": 0.8026102185249329, + "learning_rate": 0.0002, + "loss": 1.4059, + "step": 2100 + }, + { + "epoch": 4.678492239467849, + "grad_norm": 0.6725143194198608, + "learning_rate": 0.0002, + "loss": 1.3676, + "step": 2110 + }, + { + "epoch": 4.700665188470067, + "grad_norm": 0.8055245876312256, + "learning_rate": 0.0002, + "loss": 1.4669, + "step": 2120 + }, + { + "epoch": 4.722838137472284, + "grad_norm": 0.7507025003433228, + "learning_rate": 0.0002, + "loss": 1.4455, + "step": 2130 + }, + { + "epoch": 4.745011086474501, + "grad_norm": 0.7166216969490051, + "learning_rate": 0.0002, + "loss": 1.3974, + "step": 2140 + }, + { + "epoch": 4.767184035476719, + "grad_norm": 0.6826853156089783, + "learning_rate": 0.0002, + "loss": 1.33, + "step": 2150 + }, + { + "epoch": 4.789356984478935, + "grad_norm": 1.1347891092300415, + "learning_rate": 0.0002, + "loss": 1.3907, + "step": 2160 + }, + { + "epoch": 4.811529933481153, + "grad_norm": 0.8205971121788025, + "learning_rate": 0.0002, + "loss": 1.3737, + "step": 2170 + }, + { + "epoch": 4.8337028824833705, + "grad_norm": 0.7861950397491455, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 2180 + }, + { + "epoch": 4.855875831485587, + "grad_norm": 0.839460551738739, + "learning_rate": 0.0002, + "loss": 1.4293, + "step": 2190 + }, + { + "epoch": 4.878048780487805, + "grad_norm": 0.746583878993988, + "learning_rate": 0.0002, + "loss": 1.3881, + "step": 2200 + }, + { + "epoch": 4.900221729490022, + "grad_norm": 0.7805684804916382, + "learning_rate": 0.0002, + "loss": 1.4519, + "step": 2210 + }, + { + "epoch": 4.922394678492239, + "grad_norm": 0.8079700469970703, + "learning_rate": 0.0002, + "loss": 1.4053, + "step": 2220 + }, + { + "epoch": 4.9445676274944566, + "grad_norm": 0.7609502673149109, + "learning_rate": 0.0002, + "loss": 1.353, + "step": 2230 + }, + { + "epoch": 4.966740576496674, + "grad_norm": 0.7862996459007263, + "learning_rate": 0.0002, + "loss": 1.3816, + "step": 2240 + }, + { + "epoch": 4.988913525498892, + "grad_norm": 0.778677225112915, + "learning_rate": 0.0002, + "loss": 1.4249, + "step": 2250 + }, + { + "epoch": 5.0, + "eval_loss": 1.9658271074295044, + "eval_runtime": 108.3717, + "eval_samples_per_second": 4.752, + "eval_steps_per_second": 0.6, + "step": 2255 + }, + { + "epoch": 5.011086474501108, + "grad_norm": 0.7520418167114258, + "learning_rate": 0.0002, + "loss": 1.3395, + "step": 2260 + }, + { + "epoch": 5.033259423503326, + "grad_norm": 1.1831114292144775, + "learning_rate": 0.0002, + "loss": 1.1909, + "step": 2270 + }, + { + "epoch": 5.0554323725055434, + "grad_norm": 0.8718661069869995, + "learning_rate": 0.0002, + "loss": 1.1784, + "step": 2280 + }, + { + "epoch": 5.07760532150776, + "grad_norm": 1.0186705589294434, + "learning_rate": 0.0002, + "loss": 1.2208, + "step": 2290 + }, + { + "epoch": 5.099778270509978, + "grad_norm": 1.0370045900344849, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 2300 + }, + { + "epoch": 5.121951219512195, + "grad_norm": 0.9448253512382507, + "learning_rate": 0.0002, + "loss": 1.1485, + "step": 2310 + }, + { + "epoch": 5.144124168514413, + "grad_norm": 0.988973081111908, + "learning_rate": 0.0002, + "loss": 1.1764, + "step": 2320 + }, + { + "epoch": 5.1662971175166295, + "grad_norm": 0.9368142485618591, + "learning_rate": 0.0002, + "loss": 1.1544, + "step": 2330 + }, + { + "epoch": 5.188470066518847, + "grad_norm": 1.0289298295974731, + "learning_rate": 0.0002, + "loss": 1.2416, + "step": 2340 + }, + { + "epoch": 5.210643015521065, + "grad_norm": 0.9611420035362244, + "learning_rate": 0.0002, + "loss": 1.1982, + "step": 2350 + }, + { + "epoch": 5.232815964523281, + "grad_norm": 0.8490312099456787, + "learning_rate": 0.0002, + "loss": 1.2046, + "step": 2360 + }, + { + "epoch": 5.254988913525499, + "grad_norm": 1.0165891647338867, + "learning_rate": 0.0002, + "loss": 1.2504, + "step": 2370 + }, + { + "epoch": 5.277161862527716, + "grad_norm": 0.9902606010437012, + "learning_rate": 0.0002, + "loss": 1.2749, + "step": 2380 + }, + { + "epoch": 5.299334811529933, + "grad_norm": 0.987205445766449, + "learning_rate": 0.0002, + "loss": 1.2186, + "step": 2390 + }, + { + "epoch": 5.321507760532151, + "grad_norm": 0.7931132316589355, + "learning_rate": 0.0002, + "loss": 1.1962, + "step": 2400 + }, + { + "epoch": 5.343680709534368, + "grad_norm": 1.143110990524292, + "learning_rate": 0.0002, + "loss": 1.1661, + "step": 2410 + }, + { + "epoch": 5.365853658536586, + "grad_norm": 0.9869807362556458, + "learning_rate": 0.0002, + "loss": 1.191, + "step": 2420 + }, + { + "epoch": 5.388026607538802, + "grad_norm": 0.9835564494132996, + "learning_rate": 0.0002, + "loss": 1.208, + "step": 2430 + }, + { + "epoch": 5.41019955654102, + "grad_norm": 0.8321971893310547, + "learning_rate": 0.0002, + "loss": 1.2734, + "step": 2440 + }, + { + "epoch": 5.4323725055432375, + "grad_norm": 0.8379601240158081, + "learning_rate": 0.0002, + "loss": 1.2308, + "step": 2450 + }, + { + "epoch": 5.454545454545454, + "grad_norm": 0.9872745871543884, + "learning_rate": 0.0002, + "loss": 1.2464, + "step": 2460 + }, + { + "epoch": 5.476718403547672, + "grad_norm": 0.9455783367156982, + "learning_rate": 0.0002, + "loss": 1.283, + "step": 2470 + }, + { + "epoch": 5.498891352549889, + "grad_norm": 0.9594705700874329, + "learning_rate": 0.0002, + "loss": 1.2585, + "step": 2480 + }, + { + "epoch": 5.521064301552107, + "grad_norm": 1.036603331565857, + "learning_rate": 0.0002, + "loss": 1.2776, + "step": 2490 + }, + { + "epoch": 5.5432372505543235, + "grad_norm": 1.0329008102416992, + "learning_rate": 0.0002, + "loss": 1.2346, + "step": 2500 + }, + { + "epoch": 5.565410199556541, + "grad_norm": 0.90513014793396, + "learning_rate": 0.0002, + "loss": 1.2202, + "step": 2510 + }, + { + "epoch": 5.587583148558759, + "grad_norm": 1.107680320739746, + "learning_rate": 0.0002, + "loss": 1.2977, + "step": 2520 + }, + { + "epoch": 5.609756097560975, + "grad_norm": 0.8842377662658691, + "learning_rate": 0.0002, + "loss": 1.2117, + "step": 2530 + }, + { + "epoch": 5.631929046563193, + "grad_norm": 0.9856716990470886, + "learning_rate": 0.0002, + "loss": 1.2448, + "step": 2540 + }, + { + "epoch": 5.65410199556541, + "grad_norm": 1.0363198518753052, + "learning_rate": 0.0002, + "loss": 1.2579, + "step": 2550 + }, + { + "epoch": 5.676274944567627, + "grad_norm": 0.9366242289543152, + "learning_rate": 0.0002, + "loss": 1.236, + "step": 2560 + }, + { + "epoch": 5.698447893569845, + "grad_norm": 0.9180609583854675, + "learning_rate": 0.0002, + "loss": 1.2652, + "step": 2570 + }, + { + "epoch": 5.720620842572062, + "grad_norm": 0.96494460105896, + "learning_rate": 0.0002, + "loss": 1.2153, + "step": 2580 + }, + { + "epoch": 5.74279379157428, + "grad_norm": 1.066856861114502, + "learning_rate": 0.0002, + "loss": 1.2596, + "step": 2590 + }, + { + "epoch": 5.764966740576496, + "grad_norm": 1.0576446056365967, + "learning_rate": 0.0002, + "loss": 1.2437, + "step": 2600 + }, + { + "epoch": 5.787139689578714, + "grad_norm": 1.0688375234603882, + "learning_rate": 0.0002, + "loss": 1.2449, + "step": 2610 + }, + { + "epoch": 5.8093126385809315, + "grad_norm": 0.9294432401657104, + "learning_rate": 0.0002, + "loss": 1.2094, + "step": 2620 + }, + { + "epoch": 5.831485587583149, + "grad_norm": 0.9467836618423462, + "learning_rate": 0.0002, + "loss": 1.3705, + "step": 2630 + }, + { + "epoch": 5.853658536585366, + "grad_norm": 1.1947448253631592, + "learning_rate": 0.0002, + "loss": 1.334, + "step": 2640 + }, + { + "epoch": 5.875831485587583, + "grad_norm": 0.9225861430168152, + "learning_rate": 0.0002, + "loss": 1.1952, + "step": 2650 + }, + { + "epoch": 5.898004434589801, + "grad_norm": 0.9499539136886597, + "learning_rate": 0.0002, + "loss": 1.3356, + "step": 2660 + }, + { + "epoch": 5.9201773835920175, + "grad_norm": 0.9666298031806946, + "learning_rate": 0.0002, + "loss": 1.2898, + "step": 2670 + }, + { + "epoch": 5.942350332594235, + "grad_norm": 1.0549718141555786, + "learning_rate": 0.0002, + "loss": 1.1846, + "step": 2680 + }, + { + "epoch": 5.964523281596453, + "grad_norm": 1.1662505865097046, + "learning_rate": 0.0002, + "loss": 1.2132, + "step": 2690 + }, + { + "epoch": 5.986696230598669, + "grad_norm": 0.9200838208198547, + "learning_rate": 0.0002, + "loss": 1.2717, + "step": 2700 + }, + { + "epoch": 6.0, + "eval_loss": 2.089076280593872, + "eval_runtime": 95.2405, + "eval_samples_per_second": 5.407, + "eval_steps_per_second": 0.682, + "step": 2706 + } + ], + "logging_steps": 10, + "max_steps": 3608, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2522762102059827e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bcdc3b48752889d03e0bc8f748c28d3a3eeb1026 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-2706/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4002b3199d379e5c6c8494417ad0ab221d2f73797ae8bbbb3b91f811953dbe5c +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6f0ac9d1d51559d5cb7d7c2a9ac9b2c57c96ccf6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e458aedc3fa0beff6a910f5030ac0c5b6d49a9b90bba7adf8552d7ed0de510c +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d0e644401333137a29e4b59f8c2b3207ef44527 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e643395954fb3f5790b33c4134c1b00deddf61675d54b518b6756106fc8ea30f +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cdd3dea6c10904318eeb26925677834f906eb4ab --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27512a55bb510cd7ec345bc1a222b2b8a48be62465c71e1be80cb3173ac6c187 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..833cbdb787f997ce6a339b910ef0417b8fd1e28d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:992150560499ecd1b0078ee0d6d46b2a40bc474fa9be570db3c7340c29221fbd +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..692ba9b43206abaa771c41d4c068d85a8b0cbba8 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/trainer_state.json @@ -0,0 +1,2294 @@ +{ + "best_metric": 1.8243104219436646, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902", + "epoch": 7.0, + "eval_steps": 10, + "global_step": 3157, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.022172949002217297, + "grad_norm": 0.6454975008964539, + "learning_rate": 0.0002, + "loss": 2.684, + "step": 10 + }, + { + "epoch": 0.04434589800443459, + "grad_norm": 0.5452715158462524, + "learning_rate": 0.0002, + "loss": 2.2694, + "step": 20 + }, + { + "epoch": 0.06651884700665188, + "grad_norm": 0.5502195358276367, + "learning_rate": 0.0002, + "loss": 2.0796, + "step": 30 + }, + { + "epoch": 0.08869179600886919, + "grad_norm": 0.48551198840141296, + "learning_rate": 0.0002, + "loss": 1.9132, + "step": 40 + }, + { + "epoch": 0.11086474501108648, + "grad_norm": 0.47822514176368713, + "learning_rate": 0.0002, + "loss": 2.016, + "step": 50 + }, + { + "epoch": 0.13303769401330376, + "grad_norm": 0.5125395655632019, + "learning_rate": 0.0002, + "loss": 1.9455, + "step": 60 + }, + { + "epoch": 0.15521064301552107, + "grad_norm": 0.4600693881511688, + "learning_rate": 0.0002, + "loss": 1.9017, + "step": 70 + }, + { + "epoch": 0.17738359201773837, + "grad_norm": 0.7180814743041992, + "learning_rate": 0.0002, + "loss": 2.0057, + "step": 80 + }, + { + "epoch": 0.19955654101995565, + "grad_norm": 0.4712974429130554, + "learning_rate": 0.0002, + "loss": 1.8925, + "step": 90 + }, + { + "epoch": 0.22172949002217296, + "grad_norm": 0.4673261344432831, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 100 + }, + { + "epoch": 0.24390243902439024, + "grad_norm": 0.4129070043563843, + "learning_rate": 0.0002, + "loss": 1.9346, + "step": 110 + }, + { + "epoch": 0.2660753880266075, + "grad_norm": 0.3859104812145233, + "learning_rate": 0.0002, + "loss": 1.9018, + "step": 120 + }, + { + "epoch": 0.28824833702882485, + "grad_norm": 0.40966713428497314, + "learning_rate": 0.0002, + "loss": 1.8922, + "step": 130 + }, + { + "epoch": 0.31042128603104213, + "grad_norm": 0.3685867488384247, + "learning_rate": 0.0002, + "loss": 1.846, + "step": 140 + }, + { + "epoch": 0.3325942350332594, + "grad_norm": 0.39279988408088684, + "learning_rate": 0.0002, + "loss": 1.9017, + "step": 150 + }, + { + "epoch": 0.35476718403547675, + "grad_norm": 0.4195398986339569, + "learning_rate": 0.0002, + "loss": 1.8556, + "step": 160 + }, + { + "epoch": 0.376940133037694, + "grad_norm": 0.469802588224411, + "learning_rate": 0.0002, + "loss": 1.7883, + "step": 170 + }, + { + "epoch": 0.3991130820399113, + "grad_norm": 0.4069509208202362, + "learning_rate": 0.0002, + "loss": 1.8135, + "step": 180 + }, + { + "epoch": 0.4212860310421286, + "grad_norm": 0.47832027077674866, + "learning_rate": 0.0002, + "loss": 1.8429, + "step": 190 + }, + { + "epoch": 0.4434589800443459, + "grad_norm": 0.3376411199569702, + "learning_rate": 0.0002, + "loss": 1.781, + "step": 200 + }, + { + "epoch": 0.4656319290465632, + "grad_norm": 0.3787185847759247, + "learning_rate": 0.0002, + "loss": 1.8562, + "step": 210 + }, + { + "epoch": 0.4878048780487805, + "grad_norm": 0.40322697162628174, + "learning_rate": 0.0002, + "loss": 1.8352, + "step": 220 + }, + { + "epoch": 0.5099778270509978, + "grad_norm": 0.3710436522960663, + "learning_rate": 0.0002, + "loss": 1.8007, + "step": 230 + }, + { + "epoch": 0.532150776053215, + "grad_norm": 0.3723200261592865, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 240 + }, + { + "epoch": 0.5543237250554324, + "grad_norm": 0.3457179069519043, + "learning_rate": 0.0002, + "loss": 1.852, + "step": 250 + }, + { + "epoch": 0.5764966740576497, + "grad_norm": 0.35369473695755005, + "learning_rate": 0.0002, + "loss": 1.8175, + "step": 260 + }, + { + "epoch": 0.5986696230598669, + "grad_norm": 0.3667483329772949, + "learning_rate": 0.0002, + "loss": 1.7742, + "step": 270 + }, + { + "epoch": 0.6208425720620843, + "grad_norm": 0.4023273289203644, + "learning_rate": 0.0002, + "loss": 1.8152, + "step": 280 + }, + { + "epoch": 0.6430155210643016, + "grad_norm": 0.3601929843425751, + "learning_rate": 0.0002, + "loss": 1.8382, + "step": 290 + }, + { + "epoch": 0.6651884700665188, + "grad_norm": 0.32610392570495605, + "learning_rate": 0.0002, + "loss": 1.8152, + "step": 300 + }, + { + "epoch": 0.6873614190687362, + "grad_norm": 0.40528756380081177, + "learning_rate": 0.0002, + "loss": 1.8412, + "step": 310 + }, + { + "epoch": 0.7095343680709535, + "grad_norm": 0.34639739990234375, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 320 + }, + { + "epoch": 0.7317073170731707, + "grad_norm": 0.3794991374015808, + "learning_rate": 0.0002, + "loss": 1.807, + "step": 330 + }, + { + "epoch": 0.753880266075388, + "grad_norm": 0.34203875064849854, + "learning_rate": 0.0002, + "loss": 1.8678, + "step": 340 + }, + { + "epoch": 0.7760532150776053, + "grad_norm": 0.36692821979522705, + "learning_rate": 0.0002, + "loss": 1.7605, + "step": 350 + }, + { + "epoch": 0.7982261640798226, + "grad_norm": 0.3701125979423523, + "learning_rate": 0.0002, + "loss": 1.7535, + "step": 360 + }, + { + "epoch": 0.8203991130820399, + "grad_norm": 0.3971416652202606, + "learning_rate": 0.0002, + "loss": 1.7574, + "step": 370 + }, + { + "epoch": 0.8425720620842572, + "grad_norm": 0.3751989006996155, + "learning_rate": 0.0002, + "loss": 1.8476, + "step": 380 + }, + { + "epoch": 0.8647450110864745, + "grad_norm": 0.35116496682167053, + "learning_rate": 0.0002, + "loss": 1.8281, + "step": 390 + }, + { + "epoch": 0.8869179600886918, + "grad_norm": 0.3672674894332886, + "learning_rate": 0.0002, + "loss": 1.7459, + "step": 400 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.34648260474205017, + "learning_rate": 0.0002, + "loss": 1.8293, + "step": 410 + }, + { + "epoch": 0.9312638580931264, + "grad_norm": 0.4497389793395996, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 420 + }, + { + "epoch": 0.9534368070953437, + "grad_norm": 0.33595147728919983, + "learning_rate": 0.0002, + "loss": 1.8405, + "step": 430 + }, + { + "epoch": 0.975609756097561, + "grad_norm": 0.3130456805229187, + "learning_rate": 0.0002, + "loss": 1.8137, + "step": 440 + }, + { + "epoch": 0.9977827050997783, + "grad_norm": 0.36480239033699036, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 450 + }, + { + "epoch": 1.0, + "eval_loss": 1.8323718309402466, + "eval_runtime": 79.9603, + "eval_samples_per_second": 6.441, + "eval_steps_per_second": 0.813, + "step": 451 + }, + { + "epoch": 1.0199556541019956, + "grad_norm": 0.3840029835700989, + "learning_rate": 0.0002, + "loss": 1.7921, + "step": 460 + }, + { + "epoch": 1.042128603104213, + "grad_norm": 0.33457425236701965, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 470 + }, + { + "epoch": 1.06430155210643, + "grad_norm": 0.35766592621803284, + "learning_rate": 0.0002, + "loss": 1.7028, + "step": 480 + }, + { + "epoch": 1.0864745011086474, + "grad_norm": 0.38070937991142273, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 490 + }, + { + "epoch": 1.1086474501108647, + "grad_norm": 0.38546547293663025, + "learning_rate": 0.0002, + "loss": 1.72, + "step": 500 + }, + { + "epoch": 1.130820399113082, + "grad_norm": 0.384104460477829, + "learning_rate": 0.0002, + "loss": 1.7088, + "step": 510 + }, + { + "epoch": 1.1529933481152994, + "grad_norm": 0.3556116819381714, + "learning_rate": 0.0002, + "loss": 1.7779, + "step": 520 + }, + { + "epoch": 1.1751662971175167, + "grad_norm": 0.4110541343688965, + "learning_rate": 0.0002, + "loss": 1.7538, + "step": 530 + }, + { + "epoch": 1.1973392461197339, + "grad_norm": 0.46503177285194397, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 540 + }, + { + "epoch": 1.2195121951219512, + "grad_norm": 0.4366816580295563, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 550 + }, + { + "epoch": 1.2416851441241685, + "grad_norm": 0.379986047744751, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 560 + }, + { + "epoch": 1.2638580931263859, + "grad_norm": 0.3920869529247284, + "learning_rate": 0.0002, + "loss": 1.6852, + "step": 570 + }, + { + "epoch": 1.2860310421286032, + "grad_norm": 0.4013986587524414, + "learning_rate": 0.0002, + "loss": 1.8265, + "step": 580 + }, + { + "epoch": 1.3082039911308203, + "grad_norm": 0.39104390144348145, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 590 + }, + { + "epoch": 1.3303769401330376, + "grad_norm": 0.40515613555908203, + "learning_rate": 0.0002, + "loss": 1.7822, + "step": 600 + }, + { + "epoch": 1.352549889135255, + "grad_norm": 0.4212331473827362, + "learning_rate": 0.0002, + "loss": 1.7614, + "step": 610 + }, + { + "epoch": 1.3747228381374723, + "grad_norm": 0.36040815711021423, + "learning_rate": 0.0002, + "loss": 1.7883, + "step": 620 + }, + { + "epoch": 1.3968957871396896, + "grad_norm": 0.3950865864753723, + "learning_rate": 0.0002, + "loss": 1.7467, + "step": 630 + }, + { + "epoch": 1.4190687361419068, + "grad_norm": 0.3934709131717682, + "learning_rate": 0.0002, + "loss": 1.7242, + "step": 640 + }, + { + "epoch": 1.441241685144124, + "grad_norm": 0.3905350863933563, + "learning_rate": 0.0002, + "loss": 1.7783, + "step": 650 + }, + { + "epoch": 1.4634146341463414, + "grad_norm": 0.4322686493396759, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 660 + }, + { + "epoch": 1.4855875831485588, + "grad_norm": 0.35697034001350403, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 670 + }, + { + "epoch": 1.507760532150776, + "grad_norm": 0.38570451736450195, + "learning_rate": 0.0002, + "loss": 1.6764, + "step": 680 + }, + { + "epoch": 1.5299334811529932, + "grad_norm": 0.3804517090320587, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 690 + }, + { + "epoch": 1.5521064301552108, + "grad_norm": 0.4938165247440338, + "learning_rate": 0.0002, + "loss": 1.7725, + "step": 700 + }, + { + "epoch": 1.5742793791574279, + "grad_norm": 0.43075236678123474, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 710 + }, + { + "epoch": 1.5964523281596452, + "grad_norm": 0.40434643626213074, + "learning_rate": 0.0002, + "loss": 1.7741, + "step": 720 + }, + { + "epoch": 1.6186252771618626, + "grad_norm": 0.3874157667160034, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 730 + }, + { + "epoch": 1.6407982261640797, + "grad_norm": 0.3645969331264496, + "learning_rate": 0.0002, + "loss": 1.8141, + "step": 740 + }, + { + "epoch": 1.6629711751662972, + "grad_norm": 0.38588255643844604, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 750 + }, + { + "epoch": 1.6851441241685143, + "grad_norm": 0.39252519607543945, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 760 + }, + { + "epoch": 1.7073170731707317, + "grad_norm": 0.36165162920951843, + "learning_rate": 0.0002, + "loss": 1.7424, + "step": 770 + }, + { + "epoch": 1.729490022172949, + "grad_norm": 0.36677947640419006, + "learning_rate": 0.0002, + "loss": 1.7172, + "step": 780 + }, + { + "epoch": 1.7516629711751663, + "grad_norm": 0.374881774187088, + "learning_rate": 0.0002, + "loss": 1.7737, + "step": 790 + }, + { + "epoch": 1.7738359201773837, + "grad_norm": 0.4530802369117737, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 800 + }, + { + "epoch": 1.7960088691796008, + "grad_norm": 0.3879568576812744, + "learning_rate": 0.0002, + "loss": 1.7799, + "step": 810 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.3710079789161682, + "learning_rate": 0.0002, + "loss": 1.7562, + "step": 820 + }, + { + "epoch": 1.8403547671840355, + "grad_norm": 0.3831799030303955, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 830 + }, + { + "epoch": 1.8625277161862528, + "grad_norm": 0.3958432376384735, + "learning_rate": 0.0002, + "loss": 1.7605, + "step": 840 + }, + { + "epoch": 1.8847006651884701, + "grad_norm": 0.4129294157028198, + "learning_rate": 0.0002, + "loss": 1.8187, + "step": 850 + }, + { + "epoch": 1.9068736141906872, + "grad_norm": 0.3714745044708252, + "learning_rate": 0.0002, + "loss": 1.6506, + "step": 860 + }, + { + "epoch": 1.9290465631929048, + "grad_norm": 0.40176868438720703, + "learning_rate": 0.0002, + "loss": 1.707, + "step": 870 + }, + { + "epoch": 1.951219512195122, + "grad_norm": 0.36937767267227173, + "learning_rate": 0.0002, + "loss": 1.7557, + "step": 880 + }, + { + "epoch": 1.9733924611973392, + "grad_norm": 0.40242597460746765, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 890 + }, + { + "epoch": 1.9955654101995566, + "grad_norm": 0.3515510559082031, + "learning_rate": 0.0002, + "loss": 1.7402, + "step": 900 + }, + { + "epoch": 2.0, + "eval_loss": 1.8243104219436646, + "eval_runtime": 107.8856, + "eval_samples_per_second": 4.774, + "eval_steps_per_second": 0.602, + "step": 902 + }, + { + "epoch": 2.0177383592017737, + "grad_norm": 0.4145216643810272, + "learning_rate": 0.0002, + "loss": 1.6631, + "step": 910 + }, + { + "epoch": 2.0399113082039912, + "grad_norm": 0.42423519492149353, + "learning_rate": 0.0002, + "loss": 1.7227, + "step": 920 + }, + { + "epoch": 2.0620842572062084, + "grad_norm": 0.4773229956626892, + "learning_rate": 0.0002, + "loss": 1.6357, + "step": 930 + }, + { + "epoch": 2.084257206208426, + "grad_norm": 0.4144791066646576, + "learning_rate": 0.0002, + "loss": 1.671, + "step": 940 + }, + { + "epoch": 2.106430155210643, + "grad_norm": 0.42704132199287415, + "learning_rate": 0.0002, + "loss": 1.6433, + "step": 950 + }, + { + "epoch": 2.12860310421286, + "grad_norm": 0.4479042589664459, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 960 + }, + { + "epoch": 2.1507760532150777, + "grad_norm": 0.4810638129711151, + "learning_rate": 0.0002, + "loss": 1.6122, + "step": 970 + }, + { + "epoch": 2.172949002217295, + "grad_norm": 0.48669910430908203, + "learning_rate": 0.0002, + "loss": 1.6613, + "step": 980 + }, + { + "epoch": 2.1951219512195124, + "grad_norm": 0.4252761900424957, + "learning_rate": 0.0002, + "loss": 1.6274, + "step": 990 + }, + { + "epoch": 2.2172949002217295, + "grad_norm": 0.42342790961265564, + "learning_rate": 0.0002, + "loss": 1.6514, + "step": 1000 + }, + { + "epoch": 2.2394678492239466, + "grad_norm": 0.43432456254959106, + "learning_rate": 0.0002, + "loss": 1.637, + "step": 1010 + }, + { + "epoch": 2.261640798226164, + "grad_norm": 0.45556965470314026, + "learning_rate": 0.0002, + "loss": 1.6856, + "step": 1020 + }, + { + "epoch": 2.2838137472283813, + "grad_norm": 0.48035719990730286, + "learning_rate": 0.0002, + "loss": 1.6554, + "step": 1030 + }, + { + "epoch": 2.305986696230599, + "grad_norm": 0.4233241081237793, + "learning_rate": 0.0002, + "loss": 1.6947, + "step": 1040 + }, + { + "epoch": 2.328159645232816, + "grad_norm": 0.3918434679508209, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 1050 + }, + { + "epoch": 2.3503325942350335, + "grad_norm": 0.44049757719039917, + "learning_rate": 0.0002, + "loss": 1.6903, + "step": 1060 + }, + { + "epoch": 2.3725055432372506, + "grad_norm": 0.4730056822299957, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 1070 + }, + { + "epoch": 2.3946784922394677, + "grad_norm": 0.4354589581489563, + "learning_rate": 0.0002, + "loss": 1.6104, + "step": 1080 + }, + { + "epoch": 2.4168514412416853, + "grad_norm": 0.4837590456008911, + "learning_rate": 0.0002, + "loss": 1.6496, + "step": 1090 + }, + { + "epoch": 2.4390243902439024, + "grad_norm": 0.4842571020126343, + "learning_rate": 0.0002, + "loss": 1.6672, + "step": 1100 + }, + { + "epoch": 2.4611973392461195, + "grad_norm": 0.46398279070854187, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 1110 + }, + { + "epoch": 2.483370288248337, + "grad_norm": 0.4587327539920807, + "learning_rate": 0.0002, + "loss": 1.6393, + "step": 1120 + }, + { + "epoch": 2.505543237250554, + "grad_norm": 0.4336528480052948, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 1130 + }, + { + "epoch": 2.5277161862527717, + "grad_norm": 0.6162153482437134, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 1140 + }, + { + "epoch": 2.549889135254989, + "grad_norm": 0.48175573348999023, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 1150 + }, + { + "epoch": 2.5720620842572064, + "grad_norm": 0.448272705078125, + "learning_rate": 0.0002, + "loss": 1.6098, + "step": 1160 + }, + { + "epoch": 2.5942350332594235, + "grad_norm": 0.5189200639724731, + "learning_rate": 0.0002, + "loss": 1.6987, + "step": 1170 + }, + { + "epoch": 2.6164079822616406, + "grad_norm": 0.45032963156700134, + "learning_rate": 0.0002, + "loss": 1.6503, + "step": 1180 + }, + { + "epoch": 2.638580931263858, + "grad_norm": 0.4417729377746582, + "learning_rate": 0.0002, + "loss": 1.6508, + "step": 1190 + }, + { + "epoch": 2.6607538802660753, + "grad_norm": 0.5219636559486389, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 1200 + }, + { + "epoch": 2.682926829268293, + "grad_norm": 0.47702011466026306, + "learning_rate": 0.0002, + "loss": 1.6121, + "step": 1210 + }, + { + "epoch": 2.70509977827051, + "grad_norm": 0.4328458607196808, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 1220 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 0.46762076020240784, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 1230 + }, + { + "epoch": 2.7494456762749446, + "grad_norm": 0.4592697322368622, + "learning_rate": 0.0002, + "loss": 1.7287, + "step": 1240 + }, + { + "epoch": 2.7716186252771617, + "grad_norm": 0.5519265532493591, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 1250 + }, + { + "epoch": 2.7937915742793793, + "grad_norm": 0.47169506549835205, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 1260 + }, + { + "epoch": 2.8159645232815964, + "grad_norm": 0.47231653332710266, + "learning_rate": 0.0002, + "loss": 1.658, + "step": 1270 + }, + { + "epoch": 2.8381374722838135, + "grad_norm": 0.49081969261169434, + "learning_rate": 0.0002, + "loss": 1.6738, + "step": 1280 + }, + { + "epoch": 2.860310421286031, + "grad_norm": 0.4483231008052826, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 1290 + }, + { + "epoch": 2.882483370288248, + "grad_norm": 0.5310035943984985, + "learning_rate": 0.0002, + "loss": 1.6428, + "step": 1300 + }, + { + "epoch": 2.9046563192904657, + "grad_norm": 0.4419795572757721, + "learning_rate": 0.0002, + "loss": 1.6515, + "step": 1310 + }, + { + "epoch": 2.926829268292683, + "grad_norm": 0.44630762934684753, + "learning_rate": 0.0002, + "loss": 1.6956, + "step": 1320 + }, + { + "epoch": 2.9490022172949004, + "grad_norm": 0.39774850010871887, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 1330 + }, + { + "epoch": 2.9711751662971175, + "grad_norm": 0.441727876663208, + "learning_rate": 0.0002, + "loss": 1.7007, + "step": 1340 + }, + { + "epoch": 2.9933481152993346, + "grad_norm": 0.43773892521858215, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 1350 + }, + { + "epoch": 3.0, + "eval_loss": 1.8514760732650757, + "eval_runtime": 131.9812, + "eval_samples_per_second": 3.902, + "eval_steps_per_second": 0.492, + "step": 1353 + }, + { + "epoch": 3.015521064301552, + "grad_norm": 0.5274150371551514, + "learning_rate": 0.0002, + "loss": 1.553, + "step": 1360 + }, + { + "epoch": 3.0376940133037693, + "grad_norm": 0.5724489092826843, + "learning_rate": 0.0002, + "loss": 1.4784, + "step": 1370 + }, + { + "epoch": 3.059866962305987, + "grad_norm": 0.6182316541671753, + "learning_rate": 0.0002, + "loss": 1.5365, + "step": 1380 + }, + { + "epoch": 3.082039911308204, + "grad_norm": 0.5709688067436218, + "learning_rate": 0.0002, + "loss": 1.4824, + "step": 1390 + }, + { + "epoch": 3.104212860310421, + "grad_norm": 0.6368464231491089, + "learning_rate": 0.0002, + "loss": 1.534, + "step": 1400 + }, + { + "epoch": 3.1263858093126387, + "grad_norm": 0.5680432319641113, + "learning_rate": 0.0002, + "loss": 1.5191, + "step": 1410 + }, + { + "epoch": 3.1485587583148558, + "grad_norm": 0.5805315375328064, + "learning_rate": 0.0002, + "loss": 1.5258, + "step": 1420 + }, + { + "epoch": 3.1707317073170733, + "grad_norm": 0.5782836675643921, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 1430 + }, + { + "epoch": 3.1929046563192904, + "grad_norm": 0.627159595489502, + "learning_rate": 0.0002, + "loss": 1.4852, + "step": 1440 + }, + { + "epoch": 3.2150776053215075, + "grad_norm": 0.6136751174926758, + "learning_rate": 0.0002, + "loss": 1.5398, + "step": 1450 + }, + { + "epoch": 3.237250554323725, + "grad_norm": 0.6319093108177185, + "learning_rate": 0.0002, + "loss": 1.5254, + "step": 1460 + }, + { + "epoch": 3.259423503325942, + "grad_norm": 0.7641780972480774, + "learning_rate": 0.0002, + "loss": 1.5789, + "step": 1470 + }, + { + "epoch": 3.2815964523281598, + "grad_norm": 0.6116001605987549, + "learning_rate": 0.0002, + "loss": 1.5514, + "step": 1480 + }, + { + "epoch": 3.303769401330377, + "grad_norm": 0.6024722456932068, + "learning_rate": 0.0002, + "loss": 1.4647, + "step": 1490 + }, + { + "epoch": 3.3259423503325944, + "grad_norm": 0.5941570997238159, + "learning_rate": 0.0002, + "loss": 1.5561, + "step": 1500 + }, + { + "epoch": 3.3481152993348116, + "grad_norm": 0.608369767665863, + "learning_rate": 0.0002, + "loss": 1.5104, + "step": 1510 + }, + { + "epoch": 3.3702882483370287, + "grad_norm": 0.5942065715789795, + "learning_rate": 0.0002, + "loss": 1.5494, + "step": 1520 + }, + { + "epoch": 3.3924611973392462, + "grad_norm": 0.6382330656051636, + "learning_rate": 0.0002, + "loss": 1.5426, + "step": 1530 + }, + { + "epoch": 3.4146341463414633, + "grad_norm": 0.5839648842811584, + "learning_rate": 0.0002, + "loss": 1.5479, + "step": 1540 + }, + { + "epoch": 3.436807095343681, + "grad_norm": 0.5627358555793762, + "learning_rate": 0.0002, + "loss": 1.5241, + "step": 1550 + }, + { + "epoch": 3.458980044345898, + "grad_norm": 0.6342151761054993, + "learning_rate": 0.0002, + "loss": 1.5679, + "step": 1560 + }, + { + "epoch": 3.481152993348115, + "grad_norm": 0.6370542645454407, + "learning_rate": 0.0002, + "loss": 1.5005, + "step": 1570 + }, + { + "epoch": 3.5033259423503327, + "grad_norm": 0.5974680185317993, + "learning_rate": 0.0002, + "loss": 1.541, + "step": 1580 + }, + { + "epoch": 3.52549889135255, + "grad_norm": 0.6197021007537842, + "learning_rate": 0.0002, + "loss": 1.553, + "step": 1590 + }, + { + "epoch": 3.5476718403547673, + "grad_norm": 0.6413024067878723, + "learning_rate": 0.0002, + "loss": 1.5287, + "step": 1600 + }, + { + "epoch": 3.5698447893569845, + "grad_norm": 0.5878410339355469, + "learning_rate": 0.0002, + "loss": 1.5301, + "step": 1610 + }, + { + "epoch": 3.5920177383592016, + "grad_norm": 0.6485083103179932, + "learning_rate": 0.0002, + "loss": 1.4625, + "step": 1620 + }, + { + "epoch": 3.614190687361419, + "grad_norm": 0.5826634764671326, + "learning_rate": 0.0002, + "loss": 1.5373, + "step": 1630 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 0.8906663656234741, + "learning_rate": 0.0002, + "loss": 1.4952, + "step": 1640 + }, + { + "epoch": 3.658536585365854, + "grad_norm": 0.6288479566574097, + "learning_rate": 0.0002, + "loss": 1.5208, + "step": 1650 + }, + { + "epoch": 3.680709534368071, + "grad_norm": 0.6191049218177795, + "learning_rate": 0.0002, + "loss": 1.6086, + "step": 1660 + }, + { + "epoch": 3.7028824833702885, + "grad_norm": 0.5997978448867798, + "learning_rate": 0.0002, + "loss": 1.5043, + "step": 1670 + }, + { + "epoch": 3.7250554323725056, + "grad_norm": 0.6003038287162781, + "learning_rate": 0.0002, + "loss": 1.5654, + "step": 1680 + }, + { + "epoch": 3.7472283813747227, + "grad_norm": 0.5417194962501526, + "learning_rate": 0.0002, + "loss": 1.4941, + "step": 1690 + }, + { + "epoch": 3.7694013303769403, + "grad_norm": 0.6367442607879639, + "learning_rate": 0.0002, + "loss": 1.5541, + "step": 1700 + }, + { + "epoch": 3.7915742793791574, + "grad_norm": 0.6613120436668396, + "learning_rate": 0.0002, + "loss": 1.5483, + "step": 1710 + }, + { + "epoch": 3.8137472283813745, + "grad_norm": 0.6506749391555786, + "learning_rate": 0.0002, + "loss": 1.5999, + "step": 1720 + }, + { + "epoch": 3.835920177383592, + "grad_norm": 0.5478500723838806, + "learning_rate": 0.0002, + "loss": 1.5207, + "step": 1730 + }, + { + "epoch": 3.858093126385809, + "grad_norm": 0.7313215732574463, + "learning_rate": 0.0002, + "loss": 1.5619, + "step": 1740 + }, + { + "epoch": 3.8802660753880267, + "grad_norm": 0.5453857183456421, + "learning_rate": 0.0002, + "loss": 1.4486, + "step": 1750 + }, + { + "epoch": 3.902439024390244, + "grad_norm": 0.5983547568321228, + "learning_rate": 0.0002, + "loss": 1.4857, + "step": 1760 + }, + { + "epoch": 3.9246119733924614, + "grad_norm": 0.6471580266952515, + "learning_rate": 0.0002, + "loss": 1.651, + "step": 1770 + }, + { + "epoch": 3.9467849223946785, + "grad_norm": 0.5833685398101807, + "learning_rate": 0.0002, + "loss": 1.461, + "step": 1780 + }, + { + "epoch": 3.9689578713968956, + "grad_norm": 0.5509327054023743, + "learning_rate": 0.0002, + "loss": 1.5014, + "step": 1790 + }, + { + "epoch": 3.991130820399113, + "grad_norm": 0.6021352410316467, + "learning_rate": 0.0002, + "loss": 1.6225, + "step": 1800 + }, + { + "epoch": 4.0, + "eval_loss": 1.901047945022583, + "eval_runtime": 82.2708, + "eval_samples_per_second": 6.26, + "eval_steps_per_second": 0.79, + "step": 1804 + }, + { + "epoch": 4.013303769401331, + "grad_norm": 0.6232016682624817, + "learning_rate": 0.0002, + "loss": 1.422, + "step": 1810 + }, + { + "epoch": 4.035476718403547, + "grad_norm": 0.7521207928657532, + "learning_rate": 0.0002, + "loss": 1.3769, + "step": 1820 + }, + { + "epoch": 4.057649667405765, + "grad_norm": 0.7839062213897705, + "learning_rate": 0.0002, + "loss": 1.4481, + "step": 1830 + }, + { + "epoch": 4.0798226164079825, + "grad_norm": 0.8654165863990784, + "learning_rate": 0.0002, + "loss": 1.4147, + "step": 1840 + }, + { + "epoch": 4.101995565410199, + "grad_norm": 0.6872738599777222, + "learning_rate": 0.0002, + "loss": 1.2983, + "step": 1850 + }, + { + "epoch": 4.124168514412417, + "grad_norm": 0.7529677748680115, + "learning_rate": 0.0002, + "loss": 1.3115, + "step": 1860 + }, + { + "epoch": 4.146341463414634, + "grad_norm": 0.835027277469635, + "learning_rate": 0.0002, + "loss": 1.3869, + "step": 1870 + }, + { + "epoch": 4.168514412416852, + "grad_norm": 0.7457721829414368, + "learning_rate": 0.0002, + "loss": 1.3273, + "step": 1880 + }, + { + "epoch": 4.1906873614190685, + "grad_norm": 0.7366040349006653, + "learning_rate": 0.0002, + "loss": 1.2893, + "step": 1890 + }, + { + "epoch": 4.212860310421286, + "grad_norm": 0.7802833914756775, + "learning_rate": 0.0002, + "loss": 1.3615, + "step": 1900 + }, + { + "epoch": 4.235033259423504, + "grad_norm": 0.7526614665985107, + "learning_rate": 0.0002, + "loss": 1.3607, + "step": 1910 + }, + { + "epoch": 4.25720620842572, + "grad_norm": 0.7531310319900513, + "learning_rate": 0.0002, + "loss": 1.4384, + "step": 1920 + }, + { + "epoch": 4.279379157427938, + "grad_norm": 0.8899626135826111, + "learning_rate": 0.0002, + "loss": 1.4074, + "step": 1930 + }, + { + "epoch": 4.301552106430155, + "grad_norm": 0.7591356635093689, + "learning_rate": 0.0002, + "loss": 1.328, + "step": 1940 + }, + { + "epoch": 4.323725055432373, + "grad_norm": 0.7126884460449219, + "learning_rate": 0.0002, + "loss": 1.4114, + "step": 1950 + }, + { + "epoch": 4.34589800443459, + "grad_norm": 0.7907777428627014, + "learning_rate": 0.0002, + "loss": 1.4259, + "step": 1960 + }, + { + "epoch": 4.368070953436807, + "grad_norm": 0.7854869961738586, + "learning_rate": 0.0002, + "loss": 1.3982, + "step": 1970 + }, + { + "epoch": 4.390243902439025, + "grad_norm": 0.6982123851776123, + "learning_rate": 0.0002, + "loss": 1.4126, + "step": 1980 + }, + { + "epoch": 4.412416851441241, + "grad_norm": 0.7551925182342529, + "learning_rate": 0.0002, + "loss": 1.3683, + "step": 1990 + }, + { + "epoch": 4.434589800443459, + "grad_norm": 0.864078164100647, + "learning_rate": 0.0002, + "loss": 1.4551, + "step": 2000 + }, + { + "epoch": 4.4567627494456765, + "grad_norm": 0.8406776189804077, + "learning_rate": 0.0002, + "loss": 1.3982, + "step": 2010 + }, + { + "epoch": 4.478935698447893, + "grad_norm": 0.7706766724586487, + "learning_rate": 0.0002, + "loss": 1.3543, + "step": 2020 + }, + { + "epoch": 4.501108647450111, + "grad_norm": 0.7703949213027954, + "learning_rate": 0.0002, + "loss": 1.386, + "step": 2030 + }, + { + "epoch": 4.523281596452328, + "grad_norm": 0.8654166460037231, + "learning_rate": 0.0002, + "loss": 1.4059, + "step": 2040 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 0.7800114750862122, + "learning_rate": 0.0002, + "loss": 1.4067, + "step": 2050 + }, + { + "epoch": 4.5676274944567625, + "grad_norm": 0.7553898692131042, + "learning_rate": 0.0002, + "loss": 1.3578, + "step": 2060 + }, + { + "epoch": 4.58980044345898, + "grad_norm": 0.8689188957214355, + "learning_rate": 0.0002, + "loss": 1.3845, + "step": 2070 + }, + { + "epoch": 4.611973392461198, + "grad_norm": 0.7244092226028442, + "learning_rate": 0.0002, + "loss": 1.3851, + "step": 2080 + }, + { + "epoch": 4.634146341463414, + "grad_norm": 0.9829743504524231, + "learning_rate": 0.0002, + "loss": 1.3627, + "step": 2090 + }, + { + "epoch": 4.656319290465632, + "grad_norm": 0.8026102185249329, + "learning_rate": 0.0002, + "loss": 1.4059, + "step": 2100 + }, + { + "epoch": 4.678492239467849, + "grad_norm": 0.6725143194198608, + "learning_rate": 0.0002, + "loss": 1.3676, + "step": 2110 + }, + { + "epoch": 4.700665188470067, + "grad_norm": 0.8055245876312256, + "learning_rate": 0.0002, + "loss": 1.4669, + "step": 2120 + }, + { + "epoch": 4.722838137472284, + "grad_norm": 0.7507025003433228, + "learning_rate": 0.0002, + "loss": 1.4455, + "step": 2130 + }, + { + "epoch": 4.745011086474501, + "grad_norm": 0.7166216969490051, + "learning_rate": 0.0002, + "loss": 1.3974, + "step": 2140 + }, + { + "epoch": 4.767184035476719, + "grad_norm": 0.6826853156089783, + "learning_rate": 0.0002, + "loss": 1.33, + "step": 2150 + }, + { + "epoch": 4.789356984478935, + "grad_norm": 1.1347891092300415, + "learning_rate": 0.0002, + "loss": 1.3907, + "step": 2160 + }, + { + "epoch": 4.811529933481153, + "grad_norm": 0.8205971121788025, + "learning_rate": 0.0002, + "loss": 1.3737, + "step": 2170 + }, + { + "epoch": 4.8337028824833705, + "grad_norm": 0.7861950397491455, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 2180 + }, + { + "epoch": 4.855875831485587, + "grad_norm": 0.839460551738739, + "learning_rate": 0.0002, + "loss": 1.4293, + "step": 2190 + }, + { + "epoch": 4.878048780487805, + "grad_norm": 0.746583878993988, + "learning_rate": 0.0002, + "loss": 1.3881, + "step": 2200 + }, + { + "epoch": 4.900221729490022, + "grad_norm": 0.7805684804916382, + "learning_rate": 0.0002, + "loss": 1.4519, + "step": 2210 + }, + { + "epoch": 4.922394678492239, + "grad_norm": 0.8079700469970703, + "learning_rate": 0.0002, + "loss": 1.4053, + "step": 2220 + }, + { + "epoch": 4.9445676274944566, + "grad_norm": 0.7609502673149109, + "learning_rate": 0.0002, + "loss": 1.353, + "step": 2230 + }, + { + "epoch": 4.966740576496674, + "grad_norm": 0.7862996459007263, + "learning_rate": 0.0002, + "loss": 1.3816, + "step": 2240 + }, + { + "epoch": 4.988913525498892, + "grad_norm": 0.778677225112915, + "learning_rate": 0.0002, + "loss": 1.4249, + "step": 2250 + }, + { + "epoch": 5.0, + "eval_loss": 1.9658271074295044, + "eval_runtime": 108.3717, + "eval_samples_per_second": 4.752, + "eval_steps_per_second": 0.6, + "step": 2255 + }, + { + "epoch": 5.011086474501108, + "grad_norm": 0.7520418167114258, + "learning_rate": 0.0002, + "loss": 1.3395, + "step": 2260 + }, + { + "epoch": 5.033259423503326, + "grad_norm": 1.1831114292144775, + "learning_rate": 0.0002, + "loss": 1.1909, + "step": 2270 + }, + { + "epoch": 5.0554323725055434, + "grad_norm": 0.8718661069869995, + "learning_rate": 0.0002, + "loss": 1.1784, + "step": 2280 + }, + { + "epoch": 5.07760532150776, + "grad_norm": 1.0186705589294434, + "learning_rate": 0.0002, + "loss": 1.2208, + "step": 2290 + }, + { + "epoch": 5.099778270509978, + "grad_norm": 1.0370045900344849, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 2300 + }, + { + "epoch": 5.121951219512195, + "grad_norm": 0.9448253512382507, + "learning_rate": 0.0002, + "loss": 1.1485, + "step": 2310 + }, + { + "epoch": 5.144124168514413, + "grad_norm": 0.988973081111908, + "learning_rate": 0.0002, + "loss": 1.1764, + "step": 2320 + }, + { + "epoch": 5.1662971175166295, + "grad_norm": 0.9368142485618591, + "learning_rate": 0.0002, + "loss": 1.1544, + "step": 2330 + }, + { + "epoch": 5.188470066518847, + "grad_norm": 1.0289298295974731, + "learning_rate": 0.0002, + "loss": 1.2416, + "step": 2340 + }, + { + "epoch": 5.210643015521065, + "grad_norm": 0.9611420035362244, + "learning_rate": 0.0002, + "loss": 1.1982, + "step": 2350 + }, + { + "epoch": 5.232815964523281, + "grad_norm": 0.8490312099456787, + "learning_rate": 0.0002, + "loss": 1.2046, + "step": 2360 + }, + { + "epoch": 5.254988913525499, + "grad_norm": 1.0165891647338867, + "learning_rate": 0.0002, + "loss": 1.2504, + "step": 2370 + }, + { + "epoch": 5.277161862527716, + "grad_norm": 0.9902606010437012, + "learning_rate": 0.0002, + "loss": 1.2749, + "step": 2380 + }, + { + "epoch": 5.299334811529933, + "grad_norm": 0.987205445766449, + "learning_rate": 0.0002, + "loss": 1.2186, + "step": 2390 + }, + { + "epoch": 5.321507760532151, + "grad_norm": 0.7931132316589355, + "learning_rate": 0.0002, + "loss": 1.1962, + "step": 2400 + }, + { + "epoch": 5.343680709534368, + "grad_norm": 1.143110990524292, + "learning_rate": 0.0002, + "loss": 1.1661, + "step": 2410 + }, + { + "epoch": 5.365853658536586, + "grad_norm": 0.9869807362556458, + "learning_rate": 0.0002, + "loss": 1.191, + "step": 2420 + }, + { + "epoch": 5.388026607538802, + "grad_norm": 0.9835564494132996, + "learning_rate": 0.0002, + "loss": 1.208, + "step": 2430 + }, + { + "epoch": 5.41019955654102, + "grad_norm": 0.8321971893310547, + "learning_rate": 0.0002, + "loss": 1.2734, + "step": 2440 + }, + { + "epoch": 5.4323725055432375, + "grad_norm": 0.8379601240158081, + "learning_rate": 0.0002, + "loss": 1.2308, + "step": 2450 + }, + { + "epoch": 5.454545454545454, + "grad_norm": 0.9872745871543884, + "learning_rate": 0.0002, + "loss": 1.2464, + "step": 2460 + }, + { + "epoch": 5.476718403547672, + "grad_norm": 0.9455783367156982, + "learning_rate": 0.0002, + "loss": 1.283, + "step": 2470 + }, + { + "epoch": 5.498891352549889, + "grad_norm": 0.9594705700874329, + "learning_rate": 0.0002, + "loss": 1.2585, + "step": 2480 + }, + { + "epoch": 5.521064301552107, + "grad_norm": 1.036603331565857, + "learning_rate": 0.0002, + "loss": 1.2776, + "step": 2490 + }, + { + "epoch": 5.5432372505543235, + "grad_norm": 1.0329008102416992, + "learning_rate": 0.0002, + "loss": 1.2346, + "step": 2500 + }, + { + "epoch": 5.565410199556541, + "grad_norm": 0.90513014793396, + "learning_rate": 0.0002, + "loss": 1.2202, + "step": 2510 + }, + { + "epoch": 5.587583148558759, + "grad_norm": 1.107680320739746, + "learning_rate": 0.0002, + "loss": 1.2977, + "step": 2520 + }, + { + "epoch": 5.609756097560975, + "grad_norm": 0.8842377662658691, + "learning_rate": 0.0002, + "loss": 1.2117, + "step": 2530 + }, + { + "epoch": 5.631929046563193, + "grad_norm": 0.9856716990470886, + "learning_rate": 0.0002, + "loss": 1.2448, + "step": 2540 + }, + { + "epoch": 5.65410199556541, + "grad_norm": 1.0363198518753052, + "learning_rate": 0.0002, + "loss": 1.2579, + "step": 2550 + }, + { + "epoch": 5.676274944567627, + "grad_norm": 0.9366242289543152, + "learning_rate": 0.0002, + "loss": 1.236, + "step": 2560 + }, + { + "epoch": 5.698447893569845, + "grad_norm": 0.9180609583854675, + "learning_rate": 0.0002, + "loss": 1.2652, + "step": 2570 + }, + { + "epoch": 5.720620842572062, + "grad_norm": 0.96494460105896, + "learning_rate": 0.0002, + "loss": 1.2153, + "step": 2580 + }, + { + "epoch": 5.74279379157428, + "grad_norm": 1.066856861114502, + "learning_rate": 0.0002, + "loss": 1.2596, + "step": 2590 + }, + { + "epoch": 5.764966740576496, + "grad_norm": 1.0576446056365967, + "learning_rate": 0.0002, + "loss": 1.2437, + "step": 2600 + }, + { + "epoch": 5.787139689578714, + "grad_norm": 1.0688375234603882, + "learning_rate": 0.0002, + "loss": 1.2449, + "step": 2610 + }, + { + "epoch": 5.8093126385809315, + "grad_norm": 0.9294432401657104, + "learning_rate": 0.0002, + "loss": 1.2094, + "step": 2620 + }, + { + "epoch": 5.831485587583149, + "grad_norm": 0.9467836618423462, + "learning_rate": 0.0002, + "loss": 1.3705, + "step": 2630 + }, + { + "epoch": 5.853658536585366, + "grad_norm": 1.1947448253631592, + "learning_rate": 0.0002, + "loss": 1.334, + "step": 2640 + }, + { + "epoch": 5.875831485587583, + "grad_norm": 0.9225861430168152, + "learning_rate": 0.0002, + "loss": 1.1952, + "step": 2650 + }, + { + "epoch": 5.898004434589801, + "grad_norm": 0.9499539136886597, + "learning_rate": 0.0002, + "loss": 1.3356, + "step": 2660 + }, + { + "epoch": 5.9201773835920175, + "grad_norm": 0.9666298031806946, + "learning_rate": 0.0002, + "loss": 1.2898, + "step": 2670 + }, + { + "epoch": 5.942350332594235, + "grad_norm": 1.0549718141555786, + "learning_rate": 0.0002, + "loss": 1.1846, + "step": 2680 + }, + { + "epoch": 5.964523281596453, + "grad_norm": 1.1662505865097046, + "learning_rate": 0.0002, + "loss": 1.2132, + "step": 2690 + }, + { + "epoch": 5.986696230598669, + "grad_norm": 0.9200838208198547, + "learning_rate": 0.0002, + "loss": 1.2717, + "step": 2700 + }, + { + "epoch": 6.0, + "eval_loss": 2.089076280593872, + "eval_runtime": 95.2405, + "eval_samples_per_second": 5.407, + "eval_steps_per_second": 0.682, + "step": 2706 + }, + { + "epoch": 6.008869179600887, + "grad_norm": 1.0047595500946045, + "learning_rate": 0.0002, + "loss": 1.2085, + "step": 2710 + }, + { + "epoch": 6.031042128603104, + "grad_norm": 1.5315641164779663, + "learning_rate": 0.0002, + "loss": 1.075, + "step": 2720 + }, + { + "epoch": 6.053215077605321, + "grad_norm": 1.2092695236206055, + "learning_rate": 0.0002, + "loss": 1.0955, + "step": 2730 + }, + { + "epoch": 6.075388026607539, + "grad_norm": 1.1834157705307007, + "learning_rate": 0.0002, + "loss": 1.108, + "step": 2740 + }, + { + "epoch": 6.097560975609756, + "grad_norm": 1.2534542083740234, + "learning_rate": 0.0002, + "loss": 1.0148, + "step": 2750 + }, + { + "epoch": 6.119733924611974, + "grad_norm": 1.2898602485656738, + "learning_rate": 0.0002, + "loss": 1.0422, + "step": 2760 + }, + { + "epoch": 6.14190687361419, + "grad_norm": 1.3397172689437866, + "learning_rate": 0.0002, + "loss": 1.0363, + "step": 2770 + }, + { + "epoch": 6.164079822616408, + "grad_norm": 1.18838632106781, + "learning_rate": 0.0002, + "loss": 1.0651, + "step": 2780 + }, + { + "epoch": 6.1862527716186255, + "grad_norm": 1.2524046897888184, + "learning_rate": 0.0002, + "loss": 1.048, + "step": 2790 + }, + { + "epoch": 6.208425720620842, + "grad_norm": 1.3325964212417603, + "learning_rate": 0.0002, + "loss": 1.0799, + "step": 2800 + }, + { + "epoch": 6.23059866962306, + "grad_norm": 1.3972342014312744, + "learning_rate": 0.0002, + "loss": 1.0768, + "step": 2810 + }, + { + "epoch": 6.252771618625277, + "grad_norm": 1.192122220993042, + "learning_rate": 0.0002, + "loss": 1.0822, + "step": 2820 + }, + { + "epoch": 6.274944567627495, + "grad_norm": 1.2018429040908813, + "learning_rate": 0.0002, + "loss": 1.0274, + "step": 2830 + }, + { + "epoch": 6.2971175166297115, + "grad_norm": 1.2017251253128052, + "learning_rate": 0.0002, + "loss": 1.045, + "step": 2840 + }, + { + "epoch": 6.319290465631929, + "grad_norm": 1.070663332939148, + "learning_rate": 0.0002, + "loss": 1.0522, + "step": 2850 + }, + { + "epoch": 6.341463414634147, + "grad_norm": 1.2376646995544434, + "learning_rate": 0.0002, + "loss": 1.1084, + "step": 2860 + }, + { + "epoch": 6.363636363636363, + "grad_norm": 1.4164553880691528, + "learning_rate": 0.0002, + "loss": 1.0885, + "step": 2870 + }, + { + "epoch": 6.385809312638581, + "grad_norm": 0.9863289594650269, + "learning_rate": 0.0002, + "loss": 1.0519, + "step": 2880 + }, + { + "epoch": 6.407982261640798, + "grad_norm": 1.1530284881591797, + "learning_rate": 0.0002, + "loss": 0.9746, + "step": 2890 + }, + { + "epoch": 6.430155210643015, + "grad_norm": 1.3614071607589722, + "learning_rate": 0.0002, + "loss": 1.0414, + "step": 2900 + }, + { + "epoch": 6.452328159645233, + "grad_norm": 1.4213203191757202, + "learning_rate": 0.0002, + "loss": 1.1097, + "step": 2910 + }, + { + "epoch": 6.47450110864745, + "grad_norm": 1.3584799766540527, + "learning_rate": 0.0002, + "loss": 1.0551, + "step": 2920 + }, + { + "epoch": 6.496674057649668, + "grad_norm": 1.1774920225143433, + "learning_rate": 0.0002, + "loss": 1.0888, + "step": 2930 + }, + { + "epoch": 6.518847006651884, + "grad_norm": 1.5063673257827759, + "learning_rate": 0.0002, + "loss": 1.0806, + "step": 2940 + }, + { + "epoch": 6.541019955654102, + "grad_norm": 1.3073967695236206, + "learning_rate": 0.0002, + "loss": 1.1157, + "step": 2950 + }, + { + "epoch": 6.5631929046563195, + "grad_norm": 1.2877048254013062, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 2960 + }, + { + "epoch": 6.585365853658536, + "grad_norm": 1.4681131839752197, + "learning_rate": 0.0002, + "loss": 1.0518, + "step": 2970 + }, + { + "epoch": 6.607538802660754, + "grad_norm": 1.364174246788025, + "learning_rate": 0.0002, + "loss": 1.1336, + "step": 2980 + }, + { + "epoch": 6.629711751662971, + "grad_norm": 1.3069559335708618, + "learning_rate": 0.0002, + "loss": 1.045, + "step": 2990 + }, + { + "epoch": 6.651884700665189, + "grad_norm": 1.152112364768982, + "learning_rate": 0.0002, + "loss": 1.059, + "step": 3000 + }, + { + "epoch": 6.674057649667406, + "grad_norm": 1.3854167461395264, + "learning_rate": 0.0002, + "loss": 1.1065, + "step": 3010 + }, + { + "epoch": 6.696230598669623, + "grad_norm": 1.3519569635391235, + "learning_rate": 0.0002, + "loss": 1.0792, + "step": 3020 + }, + { + "epoch": 6.718403547671841, + "grad_norm": 1.253912091255188, + "learning_rate": 0.0002, + "loss": 1.0858, + "step": 3030 + }, + { + "epoch": 6.740576496674057, + "grad_norm": 1.3960589170455933, + "learning_rate": 0.0002, + "loss": 1.0902, + "step": 3040 + }, + { + "epoch": 6.762749445676275, + "grad_norm": 1.3538455963134766, + "learning_rate": 0.0002, + "loss": 1.1028, + "step": 3050 + }, + { + "epoch": 6.7849223946784925, + "grad_norm": 1.1728484630584717, + "learning_rate": 0.0002, + "loss": 1.1072, + "step": 3060 + }, + { + "epoch": 6.807095343680709, + "grad_norm": 1.2287765741348267, + "learning_rate": 0.0002, + "loss": 1.138, + "step": 3070 + }, + { + "epoch": 6.829268292682927, + "grad_norm": 1.2122321128845215, + "learning_rate": 0.0002, + "loss": 1.0952, + "step": 3080 + }, + { + "epoch": 6.851441241685144, + "grad_norm": 1.3517614603042603, + "learning_rate": 0.0002, + "loss": 1.1051, + "step": 3090 + }, + { + "epoch": 6.873614190687362, + "grad_norm": 1.186508059501648, + "learning_rate": 0.0002, + "loss": 1.1167, + "step": 3100 + }, + { + "epoch": 6.8957871396895785, + "grad_norm": 1.2658056020736694, + "learning_rate": 0.0002, + "loss": 1.1307, + "step": 3110 + }, + { + "epoch": 6.917960088691796, + "grad_norm": 1.0459643602371216, + "learning_rate": 0.0002, + "loss": 1.0814, + "step": 3120 + }, + { + "epoch": 6.940133037694014, + "grad_norm": 1.1218708753585815, + "learning_rate": 0.0002, + "loss": 1.0667, + "step": 3130 + }, + { + "epoch": 6.96230598669623, + "grad_norm": 1.1161539554595947, + "learning_rate": 0.0002, + "loss": 1.0851, + "step": 3140 + }, + { + "epoch": 6.984478935698448, + "grad_norm": 1.312601923942566, + "learning_rate": 0.0002, + "loss": 1.1627, + "step": 3150 + }, + { + "epoch": 7.0, + "eval_loss": 2.216700315475464, + "eval_runtime": 132.441, + "eval_samples_per_second": 3.889, + "eval_steps_per_second": 0.491, + "step": 3157 + } + ], + "logging_steps": 10, + "max_steps": 3608, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.46098891190698e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bcdc3b48752889d03e0bc8f748c28d3a3eeb1026 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3157/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4002b3199d379e5c6c8494417ad0ab221d2f73797ae8bbbb3b91f811953dbe5c +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4978e8b82b979bd7001720085d8c2aba4792ef30 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5300288ed81cb8705565fa5fcf1c2b9923f94570353881b0018a775d6d13f040 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..fde7aa1d275bb677847d1b7d114492f0b11cc2c8 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90ba185f1e74b61e63d32f412e23cfe02865c8146c8cb5fcea567f2620ed83e2 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..0f1d7382ea9152ab01fda898dd2c5a0808f68013 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f94fe8c6ef9e37058a88748c880a8e256a3f0b805961fb38235ab692884f0d3f +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce4045970697fba1f32f37a072e5de304aff384c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3b4a51ddb1167dfd7ec78cb62fd78a381edf2a4e3e0cea221771a3f0d1c9d03 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..00a77eead46be1d690efc0a77ff999144828c682 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/trainer_state.json @@ -0,0 +1,2617 @@ +{ + "best_metric": 1.8243104219436646, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902", + "epoch": 8.0, + "eval_steps": 10, + "global_step": 3608, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.022172949002217297, + "grad_norm": 0.6454975008964539, + "learning_rate": 0.0002, + "loss": 2.684, + "step": 10 + }, + { + "epoch": 0.04434589800443459, + "grad_norm": 0.5452715158462524, + "learning_rate": 0.0002, + "loss": 2.2694, + "step": 20 + }, + { + "epoch": 0.06651884700665188, + "grad_norm": 0.5502195358276367, + "learning_rate": 0.0002, + "loss": 2.0796, + "step": 30 + }, + { + "epoch": 0.08869179600886919, + "grad_norm": 0.48551198840141296, + "learning_rate": 0.0002, + "loss": 1.9132, + "step": 40 + }, + { + "epoch": 0.11086474501108648, + "grad_norm": 0.47822514176368713, + "learning_rate": 0.0002, + "loss": 2.016, + "step": 50 + }, + { + "epoch": 0.13303769401330376, + "grad_norm": 0.5125395655632019, + "learning_rate": 0.0002, + "loss": 1.9455, + "step": 60 + }, + { + "epoch": 0.15521064301552107, + "grad_norm": 0.4600693881511688, + "learning_rate": 0.0002, + "loss": 1.9017, + "step": 70 + }, + { + "epoch": 0.17738359201773837, + "grad_norm": 0.7180814743041992, + "learning_rate": 0.0002, + "loss": 2.0057, + "step": 80 + }, + { + "epoch": 0.19955654101995565, + "grad_norm": 0.4712974429130554, + "learning_rate": 0.0002, + "loss": 1.8925, + "step": 90 + }, + { + "epoch": 0.22172949002217296, + "grad_norm": 0.4673261344432831, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 100 + }, + { + "epoch": 0.24390243902439024, + "grad_norm": 0.4129070043563843, + "learning_rate": 0.0002, + "loss": 1.9346, + "step": 110 + }, + { + "epoch": 0.2660753880266075, + "grad_norm": 0.3859104812145233, + "learning_rate": 0.0002, + "loss": 1.9018, + "step": 120 + }, + { + "epoch": 0.28824833702882485, + "grad_norm": 0.40966713428497314, + "learning_rate": 0.0002, + "loss": 1.8922, + "step": 130 + }, + { + "epoch": 0.31042128603104213, + "grad_norm": 0.3685867488384247, + "learning_rate": 0.0002, + "loss": 1.846, + "step": 140 + }, + { + "epoch": 0.3325942350332594, + "grad_norm": 0.39279988408088684, + "learning_rate": 0.0002, + "loss": 1.9017, + "step": 150 + }, + { + "epoch": 0.35476718403547675, + "grad_norm": 0.4195398986339569, + "learning_rate": 0.0002, + "loss": 1.8556, + "step": 160 + }, + { + "epoch": 0.376940133037694, + "grad_norm": 0.469802588224411, + "learning_rate": 0.0002, + "loss": 1.7883, + "step": 170 + }, + { + "epoch": 0.3991130820399113, + "grad_norm": 0.4069509208202362, + "learning_rate": 0.0002, + "loss": 1.8135, + "step": 180 + }, + { + "epoch": 0.4212860310421286, + "grad_norm": 0.47832027077674866, + "learning_rate": 0.0002, + "loss": 1.8429, + "step": 190 + }, + { + "epoch": 0.4434589800443459, + "grad_norm": 0.3376411199569702, + "learning_rate": 0.0002, + "loss": 1.781, + "step": 200 + }, + { + "epoch": 0.4656319290465632, + "grad_norm": 0.3787185847759247, + "learning_rate": 0.0002, + "loss": 1.8562, + "step": 210 + }, + { + "epoch": 0.4878048780487805, + "grad_norm": 0.40322697162628174, + "learning_rate": 0.0002, + "loss": 1.8352, + "step": 220 + }, + { + "epoch": 0.5099778270509978, + "grad_norm": 0.3710436522960663, + "learning_rate": 0.0002, + "loss": 1.8007, + "step": 230 + }, + { + "epoch": 0.532150776053215, + "grad_norm": 0.3723200261592865, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 240 + }, + { + "epoch": 0.5543237250554324, + "grad_norm": 0.3457179069519043, + "learning_rate": 0.0002, + "loss": 1.852, + "step": 250 + }, + { + "epoch": 0.5764966740576497, + "grad_norm": 0.35369473695755005, + "learning_rate": 0.0002, + "loss": 1.8175, + "step": 260 + }, + { + "epoch": 0.5986696230598669, + "grad_norm": 0.3667483329772949, + "learning_rate": 0.0002, + "loss": 1.7742, + "step": 270 + }, + { + "epoch": 0.6208425720620843, + "grad_norm": 0.4023273289203644, + "learning_rate": 0.0002, + "loss": 1.8152, + "step": 280 + }, + { + "epoch": 0.6430155210643016, + "grad_norm": 0.3601929843425751, + "learning_rate": 0.0002, + "loss": 1.8382, + "step": 290 + }, + { + "epoch": 0.6651884700665188, + "grad_norm": 0.32610392570495605, + "learning_rate": 0.0002, + "loss": 1.8152, + "step": 300 + }, + { + "epoch": 0.6873614190687362, + "grad_norm": 0.40528756380081177, + "learning_rate": 0.0002, + "loss": 1.8412, + "step": 310 + }, + { + "epoch": 0.7095343680709535, + "grad_norm": 0.34639739990234375, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 320 + }, + { + "epoch": 0.7317073170731707, + "grad_norm": 0.3794991374015808, + "learning_rate": 0.0002, + "loss": 1.807, + "step": 330 + }, + { + "epoch": 0.753880266075388, + "grad_norm": 0.34203875064849854, + "learning_rate": 0.0002, + "loss": 1.8678, + "step": 340 + }, + { + "epoch": 0.7760532150776053, + "grad_norm": 0.36692821979522705, + "learning_rate": 0.0002, + "loss": 1.7605, + "step": 350 + }, + { + "epoch": 0.7982261640798226, + "grad_norm": 0.3701125979423523, + "learning_rate": 0.0002, + "loss": 1.7535, + "step": 360 + }, + { + "epoch": 0.8203991130820399, + "grad_norm": 0.3971416652202606, + "learning_rate": 0.0002, + "loss": 1.7574, + "step": 370 + }, + { + "epoch": 0.8425720620842572, + "grad_norm": 0.3751989006996155, + "learning_rate": 0.0002, + "loss": 1.8476, + "step": 380 + }, + { + "epoch": 0.8647450110864745, + "grad_norm": 0.35116496682167053, + "learning_rate": 0.0002, + "loss": 1.8281, + "step": 390 + }, + { + "epoch": 0.8869179600886918, + "grad_norm": 0.3672674894332886, + "learning_rate": 0.0002, + "loss": 1.7459, + "step": 400 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.34648260474205017, + "learning_rate": 0.0002, + "loss": 1.8293, + "step": 410 + }, + { + "epoch": 0.9312638580931264, + "grad_norm": 0.4497389793395996, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 420 + }, + { + "epoch": 0.9534368070953437, + "grad_norm": 0.33595147728919983, + "learning_rate": 0.0002, + "loss": 1.8405, + "step": 430 + }, + { + "epoch": 0.975609756097561, + "grad_norm": 0.3130456805229187, + "learning_rate": 0.0002, + "loss": 1.8137, + "step": 440 + }, + { + "epoch": 0.9977827050997783, + "grad_norm": 0.36480239033699036, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 450 + }, + { + "epoch": 1.0, + "eval_loss": 1.8323718309402466, + "eval_runtime": 79.9603, + "eval_samples_per_second": 6.441, + "eval_steps_per_second": 0.813, + "step": 451 + }, + { + "epoch": 1.0199556541019956, + "grad_norm": 0.3840029835700989, + "learning_rate": 0.0002, + "loss": 1.7921, + "step": 460 + }, + { + "epoch": 1.042128603104213, + "grad_norm": 0.33457425236701965, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 470 + }, + { + "epoch": 1.06430155210643, + "grad_norm": 0.35766592621803284, + "learning_rate": 0.0002, + "loss": 1.7028, + "step": 480 + }, + { + "epoch": 1.0864745011086474, + "grad_norm": 0.38070937991142273, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 490 + }, + { + "epoch": 1.1086474501108647, + "grad_norm": 0.38546547293663025, + "learning_rate": 0.0002, + "loss": 1.72, + "step": 500 + }, + { + "epoch": 1.130820399113082, + "grad_norm": 0.384104460477829, + "learning_rate": 0.0002, + "loss": 1.7088, + "step": 510 + }, + { + "epoch": 1.1529933481152994, + "grad_norm": 0.3556116819381714, + "learning_rate": 0.0002, + "loss": 1.7779, + "step": 520 + }, + { + "epoch": 1.1751662971175167, + "grad_norm": 0.4110541343688965, + "learning_rate": 0.0002, + "loss": 1.7538, + "step": 530 + }, + { + "epoch": 1.1973392461197339, + "grad_norm": 0.46503177285194397, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 540 + }, + { + "epoch": 1.2195121951219512, + "grad_norm": 0.4366816580295563, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 550 + }, + { + "epoch": 1.2416851441241685, + "grad_norm": 0.379986047744751, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 560 + }, + { + "epoch": 1.2638580931263859, + "grad_norm": 0.3920869529247284, + "learning_rate": 0.0002, + "loss": 1.6852, + "step": 570 + }, + { + "epoch": 1.2860310421286032, + "grad_norm": 0.4013986587524414, + "learning_rate": 0.0002, + "loss": 1.8265, + "step": 580 + }, + { + "epoch": 1.3082039911308203, + "grad_norm": 0.39104390144348145, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 590 + }, + { + "epoch": 1.3303769401330376, + "grad_norm": 0.40515613555908203, + "learning_rate": 0.0002, + "loss": 1.7822, + "step": 600 + }, + { + "epoch": 1.352549889135255, + "grad_norm": 0.4212331473827362, + "learning_rate": 0.0002, + "loss": 1.7614, + "step": 610 + }, + { + "epoch": 1.3747228381374723, + "grad_norm": 0.36040815711021423, + "learning_rate": 0.0002, + "loss": 1.7883, + "step": 620 + }, + { + "epoch": 1.3968957871396896, + "grad_norm": 0.3950865864753723, + "learning_rate": 0.0002, + "loss": 1.7467, + "step": 630 + }, + { + "epoch": 1.4190687361419068, + "grad_norm": 0.3934709131717682, + "learning_rate": 0.0002, + "loss": 1.7242, + "step": 640 + }, + { + "epoch": 1.441241685144124, + "grad_norm": 0.3905350863933563, + "learning_rate": 0.0002, + "loss": 1.7783, + "step": 650 + }, + { + "epoch": 1.4634146341463414, + "grad_norm": 0.4322686493396759, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 660 + }, + { + "epoch": 1.4855875831485588, + "grad_norm": 0.35697034001350403, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 670 + }, + { + "epoch": 1.507760532150776, + "grad_norm": 0.38570451736450195, + "learning_rate": 0.0002, + "loss": 1.6764, + "step": 680 + }, + { + "epoch": 1.5299334811529932, + "grad_norm": 0.3804517090320587, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 690 + }, + { + "epoch": 1.5521064301552108, + "grad_norm": 0.4938165247440338, + "learning_rate": 0.0002, + "loss": 1.7725, + "step": 700 + }, + { + "epoch": 1.5742793791574279, + "grad_norm": 0.43075236678123474, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 710 + }, + { + "epoch": 1.5964523281596452, + "grad_norm": 0.40434643626213074, + "learning_rate": 0.0002, + "loss": 1.7741, + "step": 720 + }, + { + "epoch": 1.6186252771618626, + "grad_norm": 0.3874157667160034, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 730 + }, + { + "epoch": 1.6407982261640797, + "grad_norm": 0.3645969331264496, + "learning_rate": 0.0002, + "loss": 1.8141, + "step": 740 + }, + { + "epoch": 1.6629711751662972, + "grad_norm": 0.38588255643844604, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 750 + }, + { + "epoch": 1.6851441241685143, + "grad_norm": 0.39252519607543945, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 760 + }, + { + "epoch": 1.7073170731707317, + "grad_norm": 0.36165162920951843, + "learning_rate": 0.0002, + "loss": 1.7424, + "step": 770 + }, + { + "epoch": 1.729490022172949, + "grad_norm": 0.36677947640419006, + "learning_rate": 0.0002, + "loss": 1.7172, + "step": 780 + }, + { + "epoch": 1.7516629711751663, + "grad_norm": 0.374881774187088, + "learning_rate": 0.0002, + "loss": 1.7737, + "step": 790 + }, + { + "epoch": 1.7738359201773837, + "grad_norm": 0.4530802369117737, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 800 + }, + { + "epoch": 1.7960088691796008, + "grad_norm": 0.3879568576812744, + "learning_rate": 0.0002, + "loss": 1.7799, + "step": 810 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.3710079789161682, + "learning_rate": 0.0002, + "loss": 1.7562, + "step": 820 + }, + { + "epoch": 1.8403547671840355, + "grad_norm": 0.3831799030303955, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 830 + }, + { + "epoch": 1.8625277161862528, + "grad_norm": 0.3958432376384735, + "learning_rate": 0.0002, + "loss": 1.7605, + "step": 840 + }, + { + "epoch": 1.8847006651884701, + "grad_norm": 0.4129294157028198, + "learning_rate": 0.0002, + "loss": 1.8187, + "step": 850 + }, + { + "epoch": 1.9068736141906872, + "grad_norm": 0.3714745044708252, + "learning_rate": 0.0002, + "loss": 1.6506, + "step": 860 + }, + { + "epoch": 1.9290465631929048, + "grad_norm": 0.40176868438720703, + "learning_rate": 0.0002, + "loss": 1.707, + "step": 870 + }, + { + "epoch": 1.951219512195122, + "grad_norm": 0.36937767267227173, + "learning_rate": 0.0002, + "loss": 1.7557, + "step": 880 + }, + { + "epoch": 1.9733924611973392, + "grad_norm": 0.40242597460746765, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 890 + }, + { + "epoch": 1.9955654101995566, + "grad_norm": 0.3515510559082031, + "learning_rate": 0.0002, + "loss": 1.7402, + "step": 900 + }, + { + "epoch": 2.0, + "eval_loss": 1.8243104219436646, + "eval_runtime": 107.8856, + "eval_samples_per_second": 4.774, + "eval_steps_per_second": 0.602, + "step": 902 + }, + { + "epoch": 2.0177383592017737, + "grad_norm": 0.4145216643810272, + "learning_rate": 0.0002, + "loss": 1.6631, + "step": 910 + }, + { + "epoch": 2.0399113082039912, + "grad_norm": 0.42423519492149353, + "learning_rate": 0.0002, + "loss": 1.7227, + "step": 920 + }, + { + "epoch": 2.0620842572062084, + "grad_norm": 0.4773229956626892, + "learning_rate": 0.0002, + "loss": 1.6357, + "step": 930 + }, + { + "epoch": 2.084257206208426, + "grad_norm": 0.4144791066646576, + "learning_rate": 0.0002, + "loss": 1.671, + "step": 940 + }, + { + "epoch": 2.106430155210643, + "grad_norm": 0.42704132199287415, + "learning_rate": 0.0002, + "loss": 1.6433, + "step": 950 + }, + { + "epoch": 2.12860310421286, + "grad_norm": 0.4479042589664459, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 960 + }, + { + "epoch": 2.1507760532150777, + "grad_norm": 0.4810638129711151, + "learning_rate": 0.0002, + "loss": 1.6122, + "step": 970 + }, + { + "epoch": 2.172949002217295, + "grad_norm": 0.48669910430908203, + "learning_rate": 0.0002, + "loss": 1.6613, + "step": 980 + }, + { + "epoch": 2.1951219512195124, + "grad_norm": 0.4252761900424957, + "learning_rate": 0.0002, + "loss": 1.6274, + "step": 990 + }, + { + "epoch": 2.2172949002217295, + "grad_norm": 0.42342790961265564, + "learning_rate": 0.0002, + "loss": 1.6514, + "step": 1000 + }, + { + "epoch": 2.2394678492239466, + "grad_norm": 0.43432456254959106, + "learning_rate": 0.0002, + "loss": 1.637, + "step": 1010 + }, + { + "epoch": 2.261640798226164, + "grad_norm": 0.45556965470314026, + "learning_rate": 0.0002, + "loss": 1.6856, + "step": 1020 + }, + { + "epoch": 2.2838137472283813, + "grad_norm": 0.48035719990730286, + "learning_rate": 0.0002, + "loss": 1.6554, + "step": 1030 + }, + { + "epoch": 2.305986696230599, + "grad_norm": 0.4233241081237793, + "learning_rate": 0.0002, + "loss": 1.6947, + "step": 1040 + }, + { + "epoch": 2.328159645232816, + "grad_norm": 0.3918434679508209, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 1050 + }, + { + "epoch": 2.3503325942350335, + "grad_norm": 0.44049757719039917, + "learning_rate": 0.0002, + "loss": 1.6903, + "step": 1060 + }, + { + "epoch": 2.3725055432372506, + "grad_norm": 0.4730056822299957, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 1070 + }, + { + "epoch": 2.3946784922394677, + "grad_norm": 0.4354589581489563, + "learning_rate": 0.0002, + "loss": 1.6104, + "step": 1080 + }, + { + "epoch": 2.4168514412416853, + "grad_norm": 0.4837590456008911, + "learning_rate": 0.0002, + "loss": 1.6496, + "step": 1090 + }, + { + "epoch": 2.4390243902439024, + "grad_norm": 0.4842571020126343, + "learning_rate": 0.0002, + "loss": 1.6672, + "step": 1100 + }, + { + "epoch": 2.4611973392461195, + "grad_norm": 0.46398279070854187, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 1110 + }, + { + "epoch": 2.483370288248337, + "grad_norm": 0.4587327539920807, + "learning_rate": 0.0002, + "loss": 1.6393, + "step": 1120 + }, + { + "epoch": 2.505543237250554, + "grad_norm": 0.4336528480052948, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 1130 + }, + { + "epoch": 2.5277161862527717, + "grad_norm": 0.6162153482437134, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 1140 + }, + { + "epoch": 2.549889135254989, + "grad_norm": 0.48175573348999023, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 1150 + }, + { + "epoch": 2.5720620842572064, + "grad_norm": 0.448272705078125, + "learning_rate": 0.0002, + "loss": 1.6098, + "step": 1160 + }, + { + "epoch": 2.5942350332594235, + "grad_norm": 0.5189200639724731, + "learning_rate": 0.0002, + "loss": 1.6987, + "step": 1170 + }, + { + "epoch": 2.6164079822616406, + "grad_norm": 0.45032963156700134, + "learning_rate": 0.0002, + "loss": 1.6503, + "step": 1180 + }, + { + "epoch": 2.638580931263858, + "grad_norm": 0.4417729377746582, + "learning_rate": 0.0002, + "loss": 1.6508, + "step": 1190 + }, + { + "epoch": 2.6607538802660753, + "grad_norm": 0.5219636559486389, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 1200 + }, + { + "epoch": 2.682926829268293, + "grad_norm": 0.47702011466026306, + "learning_rate": 0.0002, + "loss": 1.6121, + "step": 1210 + }, + { + "epoch": 2.70509977827051, + "grad_norm": 0.4328458607196808, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 1220 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 0.46762076020240784, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 1230 + }, + { + "epoch": 2.7494456762749446, + "grad_norm": 0.4592697322368622, + "learning_rate": 0.0002, + "loss": 1.7287, + "step": 1240 + }, + { + "epoch": 2.7716186252771617, + "grad_norm": 0.5519265532493591, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 1250 + }, + { + "epoch": 2.7937915742793793, + "grad_norm": 0.47169506549835205, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 1260 + }, + { + "epoch": 2.8159645232815964, + "grad_norm": 0.47231653332710266, + "learning_rate": 0.0002, + "loss": 1.658, + "step": 1270 + }, + { + "epoch": 2.8381374722838135, + "grad_norm": 0.49081969261169434, + "learning_rate": 0.0002, + "loss": 1.6738, + "step": 1280 + }, + { + "epoch": 2.860310421286031, + "grad_norm": 0.4483231008052826, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 1290 + }, + { + "epoch": 2.882483370288248, + "grad_norm": 0.5310035943984985, + "learning_rate": 0.0002, + "loss": 1.6428, + "step": 1300 + }, + { + "epoch": 2.9046563192904657, + "grad_norm": 0.4419795572757721, + "learning_rate": 0.0002, + "loss": 1.6515, + "step": 1310 + }, + { + "epoch": 2.926829268292683, + "grad_norm": 0.44630762934684753, + "learning_rate": 0.0002, + "loss": 1.6956, + "step": 1320 + }, + { + "epoch": 2.9490022172949004, + "grad_norm": 0.39774850010871887, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 1330 + }, + { + "epoch": 2.9711751662971175, + "grad_norm": 0.441727876663208, + "learning_rate": 0.0002, + "loss": 1.7007, + "step": 1340 + }, + { + "epoch": 2.9933481152993346, + "grad_norm": 0.43773892521858215, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 1350 + }, + { + "epoch": 3.0, + "eval_loss": 1.8514760732650757, + "eval_runtime": 131.9812, + "eval_samples_per_second": 3.902, + "eval_steps_per_second": 0.492, + "step": 1353 + }, + { + "epoch": 3.015521064301552, + "grad_norm": 0.5274150371551514, + "learning_rate": 0.0002, + "loss": 1.553, + "step": 1360 + }, + { + "epoch": 3.0376940133037693, + "grad_norm": 0.5724489092826843, + "learning_rate": 0.0002, + "loss": 1.4784, + "step": 1370 + }, + { + "epoch": 3.059866962305987, + "grad_norm": 0.6182316541671753, + "learning_rate": 0.0002, + "loss": 1.5365, + "step": 1380 + }, + { + "epoch": 3.082039911308204, + "grad_norm": 0.5709688067436218, + "learning_rate": 0.0002, + "loss": 1.4824, + "step": 1390 + }, + { + "epoch": 3.104212860310421, + "grad_norm": 0.6368464231491089, + "learning_rate": 0.0002, + "loss": 1.534, + "step": 1400 + }, + { + "epoch": 3.1263858093126387, + "grad_norm": 0.5680432319641113, + "learning_rate": 0.0002, + "loss": 1.5191, + "step": 1410 + }, + { + "epoch": 3.1485587583148558, + "grad_norm": 0.5805315375328064, + "learning_rate": 0.0002, + "loss": 1.5258, + "step": 1420 + }, + { + "epoch": 3.1707317073170733, + "grad_norm": 0.5782836675643921, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 1430 + }, + { + "epoch": 3.1929046563192904, + "grad_norm": 0.627159595489502, + "learning_rate": 0.0002, + "loss": 1.4852, + "step": 1440 + }, + { + "epoch": 3.2150776053215075, + "grad_norm": 0.6136751174926758, + "learning_rate": 0.0002, + "loss": 1.5398, + "step": 1450 + }, + { + "epoch": 3.237250554323725, + "grad_norm": 0.6319093108177185, + "learning_rate": 0.0002, + "loss": 1.5254, + "step": 1460 + }, + { + "epoch": 3.259423503325942, + "grad_norm": 0.7641780972480774, + "learning_rate": 0.0002, + "loss": 1.5789, + "step": 1470 + }, + { + "epoch": 3.2815964523281598, + "grad_norm": 0.6116001605987549, + "learning_rate": 0.0002, + "loss": 1.5514, + "step": 1480 + }, + { + "epoch": 3.303769401330377, + "grad_norm": 0.6024722456932068, + "learning_rate": 0.0002, + "loss": 1.4647, + "step": 1490 + }, + { + "epoch": 3.3259423503325944, + "grad_norm": 0.5941570997238159, + "learning_rate": 0.0002, + "loss": 1.5561, + "step": 1500 + }, + { + "epoch": 3.3481152993348116, + "grad_norm": 0.608369767665863, + "learning_rate": 0.0002, + "loss": 1.5104, + "step": 1510 + }, + { + "epoch": 3.3702882483370287, + "grad_norm": 0.5942065715789795, + "learning_rate": 0.0002, + "loss": 1.5494, + "step": 1520 + }, + { + "epoch": 3.3924611973392462, + "grad_norm": 0.6382330656051636, + "learning_rate": 0.0002, + "loss": 1.5426, + "step": 1530 + }, + { + "epoch": 3.4146341463414633, + "grad_norm": 0.5839648842811584, + "learning_rate": 0.0002, + "loss": 1.5479, + "step": 1540 + }, + { + "epoch": 3.436807095343681, + "grad_norm": 0.5627358555793762, + "learning_rate": 0.0002, + "loss": 1.5241, + "step": 1550 + }, + { + "epoch": 3.458980044345898, + "grad_norm": 0.6342151761054993, + "learning_rate": 0.0002, + "loss": 1.5679, + "step": 1560 + }, + { + "epoch": 3.481152993348115, + "grad_norm": 0.6370542645454407, + "learning_rate": 0.0002, + "loss": 1.5005, + "step": 1570 + }, + { + "epoch": 3.5033259423503327, + "grad_norm": 0.5974680185317993, + "learning_rate": 0.0002, + "loss": 1.541, + "step": 1580 + }, + { + "epoch": 3.52549889135255, + "grad_norm": 0.6197021007537842, + "learning_rate": 0.0002, + "loss": 1.553, + "step": 1590 + }, + { + "epoch": 3.5476718403547673, + "grad_norm": 0.6413024067878723, + "learning_rate": 0.0002, + "loss": 1.5287, + "step": 1600 + }, + { + "epoch": 3.5698447893569845, + "grad_norm": 0.5878410339355469, + "learning_rate": 0.0002, + "loss": 1.5301, + "step": 1610 + }, + { + "epoch": 3.5920177383592016, + "grad_norm": 0.6485083103179932, + "learning_rate": 0.0002, + "loss": 1.4625, + "step": 1620 + }, + { + "epoch": 3.614190687361419, + "grad_norm": 0.5826634764671326, + "learning_rate": 0.0002, + "loss": 1.5373, + "step": 1630 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 0.8906663656234741, + "learning_rate": 0.0002, + "loss": 1.4952, + "step": 1640 + }, + { + "epoch": 3.658536585365854, + "grad_norm": 0.6288479566574097, + "learning_rate": 0.0002, + "loss": 1.5208, + "step": 1650 + }, + { + "epoch": 3.680709534368071, + "grad_norm": 0.6191049218177795, + "learning_rate": 0.0002, + "loss": 1.6086, + "step": 1660 + }, + { + "epoch": 3.7028824833702885, + "grad_norm": 0.5997978448867798, + "learning_rate": 0.0002, + "loss": 1.5043, + "step": 1670 + }, + { + "epoch": 3.7250554323725056, + "grad_norm": 0.6003038287162781, + "learning_rate": 0.0002, + "loss": 1.5654, + "step": 1680 + }, + { + "epoch": 3.7472283813747227, + "grad_norm": 0.5417194962501526, + "learning_rate": 0.0002, + "loss": 1.4941, + "step": 1690 + }, + { + "epoch": 3.7694013303769403, + "grad_norm": 0.6367442607879639, + "learning_rate": 0.0002, + "loss": 1.5541, + "step": 1700 + }, + { + "epoch": 3.7915742793791574, + "grad_norm": 0.6613120436668396, + "learning_rate": 0.0002, + "loss": 1.5483, + "step": 1710 + }, + { + "epoch": 3.8137472283813745, + "grad_norm": 0.6506749391555786, + "learning_rate": 0.0002, + "loss": 1.5999, + "step": 1720 + }, + { + "epoch": 3.835920177383592, + "grad_norm": 0.5478500723838806, + "learning_rate": 0.0002, + "loss": 1.5207, + "step": 1730 + }, + { + "epoch": 3.858093126385809, + "grad_norm": 0.7313215732574463, + "learning_rate": 0.0002, + "loss": 1.5619, + "step": 1740 + }, + { + "epoch": 3.8802660753880267, + "grad_norm": 0.5453857183456421, + "learning_rate": 0.0002, + "loss": 1.4486, + "step": 1750 + }, + { + "epoch": 3.902439024390244, + "grad_norm": 0.5983547568321228, + "learning_rate": 0.0002, + "loss": 1.4857, + "step": 1760 + }, + { + "epoch": 3.9246119733924614, + "grad_norm": 0.6471580266952515, + "learning_rate": 0.0002, + "loss": 1.651, + "step": 1770 + }, + { + "epoch": 3.9467849223946785, + "grad_norm": 0.5833685398101807, + "learning_rate": 0.0002, + "loss": 1.461, + "step": 1780 + }, + { + "epoch": 3.9689578713968956, + "grad_norm": 0.5509327054023743, + "learning_rate": 0.0002, + "loss": 1.5014, + "step": 1790 + }, + { + "epoch": 3.991130820399113, + "grad_norm": 0.6021352410316467, + "learning_rate": 0.0002, + "loss": 1.6225, + "step": 1800 + }, + { + "epoch": 4.0, + "eval_loss": 1.901047945022583, + "eval_runtime": 82.2708, + "eval_samples_per_second": 6.26, + "eval_steps_per_second": 0.79, + "step": 1804 + }, + { + "epoch": 4.013303769401331, + "grad_norm": 0.6232016682624817, + "learning_rate": 0.0002, + "loss": 1.422, + "step": 1810 + }, + { + "epoch": 4.035476718403547, + "grad_norm": 0.7521207928657532, + "learning_rate": 0.0002, + "loss": 1.3769, + "step": 1820 + }, + { + "epoch": 4.057649667405765, + "grad_norm": 0.7839062213897705, + "learning_rate": 0.0002, + "loss": 1.4481, + "step": 1830 + }, + { + "epoch": 4.0798226164079825, + "grad_norm": 0.8654165863990784, + "learning_rate": 0.0002, + "loss": 1.4147, + "step": 1840 + }, + { + "epoch": 4.101995565410199, + "grad_norm": 0.6872738599777222, + "learning_rate": 0.0002, + "loss": 1.2983, + "step": 1850 + }, + { + "epoch": 4.124168514412417, + "grad_norm": 0.7529677748680115, + "learning_rate": 0.0002, + "loss": 1.3115, + "step": 1860 + }, + { + "epoch": 4.146341463414634, + "grad_norm": 0.835027277469635, + "learning_rate": 0.0002, + "loss": 1.3869, + "step": 1870 + }, + { + "epoch": 4.168514412416852, + "grad_norm": 0.7457721829414368, + "learning_rate": 0.0002, + "loss": 1.3273, + "step": 1880 + }, + { + "epoch": 4.1906873614190685, + "grad_norm": 0.7366040349006653, + "learning_rate": 0.0002, + "loss": 1.2893, + "step": 1890 + }, + { + "epoch": 4.212860310421286, + "grad_norm": 0.7802833914756775, + "learning_rate": 0.0002, + "loss": 1.3615, + "step": 1900 + }, + { + "epoch": 4.235033259423504, + "grad_norm": 0.7526614665985107, + "learning_rate": 0.0002, + "loss": 1.3607, + "step": 1910 + }, + { + "epoch": 4.25720620842572, + "grad_norm": 0.7531310319900513, + "learning_rate": 0.0002, + "loss": 1.4384, + "step": 1920 + }, + { + "epoch": 4.279379157427938, + "grad_norm": 0.8899626135826111, + "learning_rate": 0.0002, + "loss": 1.4074, + "step": 1930 + }, + { + "epoch": 4.301552106430155, + "grad_norm": 0.7591356635093689, + "learning_rate": 0.0002, + "loss": 1.328, + "step": 1940 + }, + { + "epoch": 4.323725055432373, + "grad_norm": 0.7126884460449219, + "learning_rate": 0.0002, + "loss": 1.4114, + "step": 1950 + }, + { + "epoch": 4.34589800443459, + "grad_norm": 0.7907777428627014, + "learning_rate": 0.0002, + "loss": 1.4259, + "step": 1960 + }, + { + "epoch": 4.368070953436807, + "grad_norm": 0.7854869961738586, + "learning_rate": 0.0002, + "loss": 1.3982, + "step": 1970 + }, + { + "epoch": 4.390243902439025, + "grad_norm": 0.6982123851776123, + "learning_rate": 0.0002, + "loss": 1.4126, + "step": 1980 + }, + { + "epoch": 4.412416851441241, + "grad_norm": 0.7551925182342529, + "learning_rate": 0.0002, + "loss": 1.3683, + "step": 1990 + }, + { + "epoch": 4.434589800443459, + "grad_norm": 0.864078164100647, + "learning_rate": 0.0002, + "loss": 1.4551, + "step": 2000 + }, + { + "epoch": 4.4567627494456765, + "grad_norm": 0.8406776189804077, + "learning_rate": 0.0002, + "loss": 1.3982, + "step": 2010 + }, + { + "epoch": 4.478935698447893, + "grad_norm": 0.7706766724586487, + "learning_rate": 0.0002, + "loss": 1.3543, + "step": 2020 + }, + { + "epoch": 4.501108647450111, + "grad_norm": 0.7703949213027954, + "learning_rate": 0.0002, + "loss": 1.386, + "step": 2030 + }, + { + "epoch": 4.523281596452328, + "grad_norm": 0.8654166460037231, + "learning_rate": 0.0002, + "loss": 1.4059, + "step": 2040 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 0.7800114750862122, + "learning_rate": 0.0002, + "loss": 1.4067, + "step": 2050 + }, + { + "epoch": 4.5676274944567625, + "grad_norm": 0.7553898692131042, + "learning_rate": 0.0002, + "loss": 1.3578, + "step": 2060 + }, + { + "epoch": 4.58980044345898, + "grad_norm": 0.8689188957214355, + "learning_rate": 0.0002, + "loss": 1.3845, + "step": 2070 + }, + { + "epoch": 4.611973392461198, + "grad_norm": 0.7244092226028442, + "learning_rate": 0.0002, + "loss": 1.3851, + "step": 2080 + }, + { + "epoch": 4.634146341463414, + "grad_norm": 0.9829743504524231, + "learning_rate": 0.0002, + "loss": 1.3627, + "step": 2090 + }, + { + "epoch": 4.656319290465632, + "grad_norm": 0.8026102185249329, + "learning_rate": 0.0002, + "loss": 1.4059, + "step": 2100 + }, + { + "epoch": 4.678492239467849, + "grad_norm": 0.6725143194198608, + "learning_rate": 0.0002, + "loss": 1.3676, + "step": 2110 + }, + { + "epoch": 4.700665188470067, + "grad_norm": 0.8055245876312256, + "learning_rate": 0.0002, + "loss": 1.4669, + "step": 2120 + }, + { + "epoch": 4.722838137472284, + "grad_norm": 0.7507025003433228, + "learning_rate": 0.0002, + "loss": 1.4455, + "step": 2130 + }, + { + "epoch": 4.745011086474501, + "grad_norm": 0.7166216969490051, + "learning_rate": 0.0002, + "loss": 1.3974, + "step": 2140 + }, + { + "epoch": 4.767184035476719, + "grad_norm": 0.6826853156089783, + "learning_rate": 0.0002, + "loss": 1.33, + "step": 2150 + }, + { + "epoch": 4.789356984478935, + "grad_norm": 1.1347891092300415, + "learning_rate": 0.0002, + "loss": 1.3907, + "step": 2160 + }, + { + "epoch": 4.811529933481153, + "grad_norm": 0.8205971121788025, + "learning_rate": 0.0002, + "loss": 1.3737, + "step": 2170 + }, + { + "epoch": 4.8337028824833705, + "grad_norm": 0.7861950397491455, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 2180 + }, + { + "epoch": 4.855875831485587, + "grad_norm": 0.839460551738739, + "learning_rate": 0.0002, + "loss": 1.4293, + "step": 2190 + }, + { + "epoch": 4.878048780487805, + "grad_norm": 0.746583878993988, + "learning_rate": 0.0002, + "loss": 1.3881, + "step": 2200 + }, + { + "epoch": 4.900221729490022, + "grad_norm": 0.7805684804916382, + "learning_rate": 0.0002, + "loss": 1.4519, + "step": 2210 + }, + { + "epoch": 4.922394678492239, + "grad_norm": 0.8079700469970703, + "learning_rate": 0.0002, + "loss": 1.4053, + "step": 2220 + }, + { + "epoch": 4.9445676274944566, + "grad_norm": 0.7609502673149109, + "learning_rate": 0.0002, + "loss": 1.353, + "step": 2230 + }, + { + "epoch": 4.966740576496674, + "grad_norm": 0.7862996459007263, + "learning_rate": 0.0002, + "loss": 1.3816, + "step": 2240 + }, + { + "epoch": 4.988913525498892, + "grad_norm": 0.778677225112915, + "learning_rate": 0.0002, + "loss": 1.4249, + "step": 2250 + }, + { + "epoch": 5.0, + "eval_loss": 1.9658271074295044, + "eval_runtime": 108.3717, + "eval_samples_per_second": 4.752, + "eval_steps_per_second": 0.6, + "step": 2255 + }, + { + "epoch": 5.011086474501108, + "grad_norm": 0.7520418167114258, + "learning_rate": 0.0002, + "loss": 1.3395, + "step": 2260 + }, + { + "epoch": 5.033259423503326, + "grad_norm": 1.1831114292144775, + "learning_rate": 0.0002, + "loss": 1.1909, + "step": 2270 + }, + { + "epoch": 5.0554323725055434, + "grad_norm": 0.8718661069869995, + "learning_rate": 0.0002, + "loss": 1.1784, + "step": 2280 + }, + { + "epoch": 5.07760532150776, + "grad_norm": 1.0186705589294434, + "learning_rate": 0.0002, + "loss": 1.2208, + "step": 2290 + }, + { + "epoch": 5.099778270509978, + "grad_norm": 1.0370045900344849, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 2300 + }, + { + "epoch": 5.121951219512195, + "grad_norm": 0.9448253512382507, + "learning_rate": 0.0002, + "loss": 1.1485, + "step": 2310 + }, + { + "epoch": 5.144124168514413, + "grad_norm": 0.988973081111908, + "learning_rate": 0.0002, + "loss": 1.1764, + "step": 2320 + }, + { + "epoch": 5.1662971175166295, + "grad_norm": 0.9368142485618591, + "learning_rate": 0.0002, + "loss": 1.1544, + "step": 2330 + }, + { + "epoch": 5.188470066518847, + "grad_norm": 1.0289298295974731, + "learning_rate": 0.0002, + "loss": 1.2416, + "step": 2340 + }, + { + "epoch": 5.210643015521065, + "grad_norm": 0.9611420035362244, + "learning_rate": 0.0002, + "loss": 1.1982, + "step": 2350 + }, + { + "epoch": 5.232815964523281, + "grad_norm": 0.8490312099456787, + "learning_rate": 0.0002, + "loss": 1.2046, + "step": 2360 + }, + { + "epoch": 5.254988913525499, + "grad_norm": 1.0165891647338867, + "learning_rate": 0.0002, + "loss": 1.2504, + "step": 2370 + }, + { + "epoch": 5.277161862527716, + "grad_norm": 0.9902606010437012, + "learning_rate": 0.0002, + "loss": 1.2749, + "step": 2380 + }, + { + "epoch": 5.299334811529933, + "grad_norm": 0.987205445766449, + "learning_rate": 0.0002, + "loss": 1.2186, + "step": 2390 + }, + { + "epoch": 5.321507760532151, + "grad_norm": 0.7931132316589355, + "learning_rate": 0.0002, + "loss": 1.1962, + "step": 2400 + }, + { + "epoch": 5.343680709534368, + "grad_norm": 1.143110990524292, + "learning_rate": 0.0002, + "loss": 1.1661, + "step": 2410 + }, + { + "epoch": 5.365853658536586, + "grad_norm": 0.9869807362556458, + "learning_rate": 0.0002, + "loss": 1.191, + "step": 2420 + }, + { + "epoch": 5.388026607538802, + "grad_norm": 0.9835564494132996, + "learning_rate": 0.0002, + "loss": 1.208, + "step": 2430 + }, + { + "epoch": 5.41019955654102, + "grad_norm": 0.8321971893310547, + "learning_rate": 0.0002, + "loss": 1.2734, + "step": 2440 + }, + { + "epoch": 5.4323725055432375, + "grad_norm": 0.8379601240158081, + "learning_rate": 0.0002, + "loss": 1.2308, + "step": 2450 + }, + { + "epoch": 5.454545454545454, + "grad_norm": 0.9872745871543884, + "learning_rate": 0.0002, + "loss": 1.2464, + "step": 2460 + }, + { + "epoch": 5.476718403547672, + "grad_norm": 0.9455783367156982, + "learning_rate": 0.0002, + "loss": 1.283, + "step": 2470 + }, + { + "epoch": 5.498891352549889, + "grad_norm": 0.9594705700874329, + "learning_rate": 0.0002, + "loss": 1.2585, + "step": 2480 + }, + { + "epoch": 5.521064301552107, + "grad_norm": 1.036603331565857, + "learning_rate": 0.0002, + "loss": 1.2776, + "step": 2490 + }, + { + "epoch": 5.5432372505543235, + "grad_norm": 1.0329008102416992, + "learning_rate": 0.0002, + "loss": 1.2346, + "step": 2500 + }, + { + "epoch": 5.565410199556541, + "grad_norm": 0.90513014793396, + "learning_rate": 0.0002, + "loss": 1.2202, + "step": 2510 + }, + { + "epoch": 5.587583148558759, + "grad_norm": 1.107680320739746, + "learning_rate": 0.0002, + "loss": 1.2977, + "step": 2520 + }, + { + "epoch": 5.609756097560975, + "grad_norm": 0.8842377662658691, + "learning_rate": 0.0002, + "loss": 1.2117, + "step": 2530 + }, + { + "epoch": 5.631929046563193, + "grad_norm": 0.9856716990470886, + "learning_rate": 0.0002, + "loss": 1.2448, + "step": 2540 + }, + { + "epoch": 5.65410199556541, + "grad_norm": 1.0363198518753052, + "learning_rate": 0.0002, + "loss": 1.2579, + "step": 2550 + }, + { + "epoch": 5.676274944567627, + "grad_norm": 0.9366242289543152, + "learning_rate": 0.0002, + "loss": 1.236, + "step": 2560 + }, + { + "epoch": 5.698447893569845, + "grad_norm": 0.9180609583854675, + "learning_rate": 0.0002, + "loss": 1.2652, + "step": 2570 + }, + { + "epoch": 5.720620842572062, + "grad_norm": 0.96494460105896, + "learning_rate": 0.0002, + "loss": 1.2153, + "step": 2580 + }, + { + "epoch": 5.74279379157428, + "grad_norm": 1.066856861114502, + "learning_rate": 0.0002, + "loss": 1.2596, + "step": 2590 + }, + { + "epoch": 5.764966740576496, + "grad_norm": 1.0576446056365967, + "learning_rate": 0.0002, + "loss": 1.2437, + "step": 2600 + }, + { + "epoch": 5.787139689578714, + "grad_norm": 1.0688375234603882, + "learning_rate": 0.0002, + "loss": 1.2449, + "step": 2610 + }, + { + "epoch": 5.8093126385809315, + "grad_norm": 0.9294432401657104, + "learning_rate": 0.0002, + "loss": 1.2094, + "step": 2620 + }, + { + "epoch": 5.831485587583149, + "grad_norm": 0.9467836618423462, + "learning_rate": 0.0002, + "loss": 1.3705, + "step": 2630 + }, + { + "epoch": 5.853658536585366, + "grad_norm": 1.1947448253631592, + "learning_rate": 0.0002, + "loss": 1.334, + "step": 2640 + }, + { + "epoch": 5.875831485587583, + "grad_norm": 0.9225861430168152, + "learning_rate": 0.0002, + "loss": 1.1952, + "step": 2650 + }, + { + "epoch": 5.898004434589801, + "grad_norm": 0.9499539136886597, + "learning_rate": 0.0002, + "loss": 1.3356, + "step": 2660 + }, + { + "epoch": 5.9201773835920175, + "grad_norm": 0.9666298031806946, + "learning_rate": 0.0002, + "loss": 1.2898, + "step": 2670 + }, + { + "epoch": 5.942350332594235, + "grad_norm": 1.0549718141555786, + "learning_rate": 0.0002, + "loss": 1.1846, + "step": 2680 + }, + { + "epoch": 5.964523281596453, + "grad_norm": 1.1662505865097046, + "learning_rate": 0.0002, + "loss": 1.2132, + "step": 2690 + }, + { + "epoch": 5.986696230598669, + "grad_norm": 0.9200838208198547, + "learning_rate": 0.0002, + "loss": 1.2717, + "step": 2700 + }, + { + "epoch": 6.0, + "eval_loss": 2.089076280593872, + "eval_runtime": 95.2405, + "eval_samples_per_second": 5.407, + "eval_steps_per_second": 0.682, + "step": 2706 + }, + { + "epoch": 6.008869179600887, + "grad_norm": 1.0047595500946045, + "learning_rate": 0.0002, + "loss": 1.2085, + "step": 2710 + }, + { + "epoch": 6.031042128603104, + "grad_norm": 1.5315641164779663, + "learning_rate": 0.0002, + "loss": 1.075, + "step": 2720 + }, + { + "epoch": 6.053215077605321, + "grad_norm": 1.2092695236206055, + "learning_rate": 0.0002, + "loss": 1.0955, + "step": 2730 + }, + { + "epoch": 6.075388026607539, + "grad_norm": 1.1834157705307007, + "learning_rate": 0.0002, + "loss": 1.108, + "step": 2740 + }, + { + "epoch": 6.097560975609756, + "grad_norm": 1.2534542083740234, + "learning_rate": 0.0002, + "loss": 1.0148, + "step": 2750 + }, + { + "epoch": 6.119733924611974, + "grad_norm": 1.2898602485656738, + "learning_rate": 0.0002, + "loss": 1.0422, + "step": 2760 + }, + { + "epoch": 6.14190687361419, + "grad_norm": 1.3397172689437866, + "learning_rate": 0.0002, + "loss": 1.0363, + "step": 2770 + }, + { + "epoch": 6.164079822616408, + "grad_norm": 1.18838632106781, + "learning_rate": 0.0002, + "loss": 1.0651, + "step": 2780 + }, + { + "epoch": 6.1862527716186255, + "grad_norm": 1.2524046897888184, + "learning_rate": 0.0002, + "loss": 1.048, + "step": 2790 + }, + { + "epoch": 6.208425720620842, + "grad_norm": 1.3325964212417603, + "learning_rate": 0.0002, + "loss": 1.0799, + "step": 2800 + }, + { + "epoch": 6.23059866962306, + "grad_norm": 1.3972342014312744, + "learning_rate": 0.0002, + "loss": 1.0768, + "step": 2810 + }, + { + "epoch": 6.252771618625277, + "grad_norm": 1.192122220993042, + "learning_rate": 0.0002, + "loss": 1.0822, + "step": 2820 + }, + { + "epoch": 6.274944567627495, + "grad_norm": 1.2018429040908813, + "learning_rate": 0.0002, + "loss": 1.0274, + "step": 2830 + }, + { + "epoch": 6.2971175166297115, + "grad_norm": 1.2017251253128052, + "learning_rate": 0.0002, + "loss": 1.045, + "step": 2840 + }, + { + "epoch": 6.319290465631929, + "grad_norm": 1.070663332939148, + "learning_rate": 0.0002, + "loss": 1.0522, + "step": 2850 + }, + { + "epoch": 6.341463414634147, + "grad_norm": 1.2376646995544434, + "learning_rate": 0.0002, + "loss": 1.1084, + "step": 2860 + }, + { + "epoch": 6.363636363636363, + "grad_norm": 1.4164553880691528, + "learning_rate": 0.0002, + "loss": 1.0885, + "step": 2870 + }, + { + "epoch": 6.385809312638581, + "grad_norm": 0.9863289594650269, + "learning_rate": 0.0002, + "loss": 1.0519, + "step": 2880 + }, + { + "epoch": 6.407982261640798, + "grad_norm": 1.1530284881591797, + "learning_rate": 0.0002, + "loss": 0.9746, + "step": 2890 + }, + { + "epoch": 6.430155210643015, + "grad_norm": 1.3614071607589722, + "learning_rate": 0.0002, + "loss": 1.0414, + "step": 2900 + }, + { + "epoch": 6.452328159645233, + "grad_norm": 1.4213203191757202, + "learning_rate": 0.0002, + "loss": 1.1097, + "step": 2910 + }, + { + "epoch": 6.47450110864745, + "grad_norm": 1.3584799766540527, + "learning_rate": 0.0002, + "loss": 1.0551, + "step": 2920 + }, + { + "epoch": 6.496674057649668, + "grad_norm": 1.1774920225143433, + "learning_rate": 0.0002, + "loss": 1.0888, + "step": 2930 + }, + { + "epoch": 6.518847006651884, + "grad_norm": 1.5063673257827759, + "learning_rate": 0.0002, + "loss": 1.0806, + "step": 2940 + }, + { + "epoch": 6.541019955654102, + "grad_norm": 1.3073967695236206, + "learning_rate": 0.0002, + "loss": 1.1157, + "step": 2950 + }, + { + "epoch": 6.5631929046563195, + "grad_norm": 1.2877048254013062, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 2960 + }, + { + "epoch": 6.585365853658536, + "grad_norm": 1.4681131839752197, + "learning_rate": 0.0002, + "loss": 1.0518, + "step": 2970 + }, + { + "epoch": 6.607538802660754, + "grad_norm": 1.364174246788025, + "learning_rate": 0.0002, + "loss": 1.1336, + "step": 2980 + }, + { + "epoch": 6.629711751662971, + "grad_norm": 1.3069559335708618, + "learning_rate": 0.0002, + "loss": 1.045, + "step": 2990 + }, + { + "epoch": 6.651884700665189, + "grad_norm": 1.152112364768982, + "learning_rate": 0.0002, + "loss": 1.059, + "step": 3000 + }, + { + "epoch": 6.674057649667406, + "grad_norm": 1.3854167461395264, + "learning_rate": 0.0002, + "loss": 1.1065, + "step": 3010 + }, + { + "epoch": 6.696230598669623, + "grad_norm": 1.3519569635391235, + "learning_rate": 0.0002, + "loss": 1.0792, + "step": 3020 + }, + { + "epoch": 6.718403547671841, + "grad_norm": 1.253912091255188, + "learning_rate": 0.0002, + "loss": 1.0858, + "step": 3030 + }, + { + "epoch": 6.740576496674057, + "grad_norm": 1.3960589170455933, + "learning_rate": 0.0002, + "loss": 1.0902, + "step": 3040 + }, + { + "epoch": 6.762749445676275, + "grad_norm": 1.3538455963134766, + "learning_rate": 0.0002, + "loss": 1.1028, + "step": 3050 + }, + { + "epoch": 6.7849223946784925, + "grad_norm": 1.1728484630584717, + "learning_rate": 0.0002, + "loss": 1.1072, + "step": 3060 + }, + { + "epoch": 6.807095343680709, + "grad_norm": 1.2287765741348267, + "learning_rate": 0.0002, + "loss": 1.138, + "step": 3070 + }, + { + "epoch": 6.829268292682927, + "grad_norm": 1.2122321128845215, + "learning_rate": 0.0002, + "loss": 1.0952, + "step": 3080 + }, + { + "epoch": 6.851441241685144, + "grad_norm": 1.3517614603042603, + "learning_rate": 0.0002, + "loss": 1.1051, + "step": 3090 + }, + { + "epoch": 6.873614190687362, + "grad_norm": 1.186508059501648, + "learning_rate": 0.0002, + "loss": 1.1167, + "step": 3100 + }, + { + "epoch": 6.8957871396895785, + "grad_norm": 1.2658056020736694, + "learning_rate": 0.0002, + "loss": 1.1307, + "step": 3110 + }, + { + "epoch": 6.917960088691796, + "grad_norm": 1.0459643602371216, + "learning_rate": 0.0002, + "loss": 1.0814, + "step": 3120 + }, + { + "epoch": 6.940133037694014, + "grad_norm": 1.1218708753585815, + "learning_rate": 0.0002, + "loss": 1.0667, + "step": 3130 + }, + { + "epoch": 6.96230598669623, + "grad_norm": 1.1161539554595947, + "learning_rate": 0.0002, + "loss": 1.0851, + "step": 3140 + }, + { + "epoch": 6.984478935698448, + "grad_norm": 1.312601923942566, + "learning_rate": 0.0002, + "loss": 1.1627, + "step": 3150 + }, + { + "epoch": 7.0, + "eval_loss": 2.216700315475464, + "eval_runtime": 132.441, + "eval_samples_per_second": 3.889, + "eval_steps_per_second": 0.491, + "step": 3157 + }, + { + "epoch": 7.006651884700665, + "grad_norm": 1.2042810916900635, + "learning_rate": 0.0002, + "loss": 1.0072, + "step": 3160 + }, + { + "epoch": 7.028824833702883, + "grad_norm": 1.298388957977295, + "learning_rate": 0.0002, + "loss": 0.9177, + "step": 3170 + }, + { + "epoch": 7.0509977827051, + "grad_norm": 1.5294439792633057, + "learning_rate": 0.0002, + "loss": 0.8978, + "step": 3180 + }, + { + "epoch": 7.073170731707317, + "grad_norm": 1.3496054410934448, + "learning_rate": 0.0002, + "loss": 0.8597, + "step": 3190 + }, + { + "epoch": 7.095343680709535, + "grad_norm": 1.4232285022735596, + "learning_rate": 0.0002, + "loss": 0.8111, + "step": 3200 + }, + { + "epoch": 7.117516629711751, + "grad_norm": 1.6650644540786743, + "learning_rate": 0.0002, + "loss": 0.9074, + "step": 3210 + }, + { + "epoch": 7.139689578713969, + "grad_norm": 1.4064364433288574, + "learning_rate": 0.0002, + "loss": 0.9157, + "step": 3220 + }, + { + "epoch": 7.1618625277161865, + "grad_norm": 1.6468620300292969, + "learning_rate": 0.0002, + "loss": 0.9173, + "step": 3230 + }, + { + "epoch": 7.184035476718403, + "grad_norm": 1.379271388053894, + "learning_rate": 0.0002, + "loss": 0.8946, + "step": 3240 + }, + { + "epoch": 7.206208425720621, + "grad_norm": 1.4626420736312866, + "learning_rate": 0.0002, + "loss": 0.8994, + "step": 3250 + }, + { + "epoch": 7.228381374722838, + "grad_norm": 1.6427521705627441, + "learning_rate": 0.0002, + "loss": 0.9178, + "step": 3260 + }, + { + "epoch": 7.250554323725056, + "grad_norm": 1.5199066400527954, + "learning_rate": 0.0002, + "loss": 0.9045, + "step": 3270 + }, + { + "epoch": 7.2727272727272725, + "grad_norm": 1.631585717201233, + "learning_rate": 0.0002, + "loss": 1.0129, + "step": 3280 + }, + { + "epoch": 7.29490022172949, + "grad_norm": 1.5489732027053833, + "learning_rate": 0.0002, + "loss": 0.95, + "step": 3290 + }, + { + "epoch": 7.317073170731708, + "grad_norm": 1.2737787961959839, + "learning_rate": 0.0002, + "loss": 0.9094, + "step": 3300 + }, + { + "epoch": 7.339246119733924, + "grad_norm": 1.582791805267334, + "learning_rate": 0.0002, + "loss": 0.872, + "step": 3310 + }, + { + "epoch": 7.361419068736142, + "grad_norm": 1.2628211975097656, + "learning_rate": 0.0002, + "loss": 0.9469, + "step": 3320 + }, + { + "epoch": 7.383592017738359, + "grad_norm": 1.451365351676941, + "learning_rate": 0.0002, + "loss": 0.9144, + "step": 3330 + }, + { + "epoch": 7.405764966740577, + "grad_norm": 1.5257638692855835, + "learning_rate": 0.0002, + "loss": 0.9293, + "step": 3340 + }, + { + "epoch": 7.427937915742794, + "grad_norm": 1.2424229383468628, + "learning_rate": 0.0002, + "loss": 0.9539, + "step": 3350 + }, + { + "epoch": 7.450110864745011, + "grad_norm": 1.503536343574524, + "learning_rate": 0.0002, + "loss": 0.8657, + "step": 3360 + }, + { + "epoch": 7.472283813747229, + "grad_norm": 1.2467454671859741, + "learning_rate": 0.0002, + "loss": 0.8716, + "step": 3370 + }, + { + "epoch": 7.494456762749445, + "grad_norm": 1.6118966341018677, + "learning_rate": 0.0002, + "loss": 0.9344, + "step": 3380 + }, + { + "epoch": 7.516629711751663, + "grad_norm": 1.399969220161438, + "learning_rate": 0.0002, + "loss": 0.9107, + "step": 3390 + }, + { + "epoch": 7.5388026607538805, + "grad_norm": 1.369147777557373, + "learning_rate": 0.0002, + "loss": 0.9315, + "step": 3400 + }, + { + "epoch": 7.560975609756097, + "grad_norm": 1.741153359413147, + "learning_rate": 0.0002, + "loss": 1.0039, + "step": 3410 + }, + { + "epoch": 7.583148558758315, + "grad_norm": 1.436596393585205, + "learning_rate": 0.0002, + "loss": 0.8504, + "step": 3420 + }, + { + "epoch": 7.605321507760532, + "grad_norm": 1.7102857828140259, + "learning_rate": 0.0002, + "loss": 0.9537, + "step": 3430 + }, + { + "epoch": 7.627494456762749, + "grad_norm": 1.3728266954421997, + "learning_rate": 0.0002, + "loss": 0.9977, + "step": 3440 + }, + { + "epoch": 7.6496674057649665, + "grad_norm": 1.4129058122634888, + "learning_rate": 0.0002, + "loss": 1.015, + "step": 3450 + }, + { + "epoch": 7.671840354767184, + "grad_norm": 1.6068756580352783, + "learning_rate": 0.0002, + "loss": 0.9672, + "step": 3460 + }, + { + "epoch": 7.694013303769402, + "grad_norm": 1.376522183418274, + "learning_rate": 0.0002, + "loss": 0.9396, + "step": 3470 + }, + { + "epoch": 7.716186252771618, + "grad_norm": 1.5918605327606201, + "learning_rate": 0.0002, + "loss": 0.9398, + "step": 3480 + }, + { + "epoch": 7.738359201773836, + "grad_norm": 1.3888970613479614, + "learning_rate": 0.0002, + "loss": 0.885, + "step": 3490 + }, + { + "epoch": 7.760532150776053, + "grad_norm": 1.3949130773544312, + "learning_rate": 0.0002, + "loss": 0.9283, + "step": 3500 + }, + { + "epoch": 7.782705099778271, + "grad_norm": 1.6619991064071655, + "learning_rate": 0.0002, + "loss": 0.9194, + "step": 3510 + }, + { + "epoch": 7.804878048780488, + "grad_norm": 1.6583504676818848, + "learning_rate": 0.0002, + "loss": 0.9956, + "step": 3520 + }, + { + "epoch": 7.827050997782705, + "grad_norm": 1.5198252201080322, + "learning_rate": 0.0002, + "loss": 0.9093, + "step": 3530 + }, + { + "epoch": 7.849223946784923, + "grad_norm": 1.5402783155441284, + "learning_rate": 0.0002, + "loss": 0.9916, + "step": 3540 + }, + { + "epoch": 7.871396895787139, + "grad_norm": 1.358048915863037, + "learning_rate": 0.0002, + "loss": 0.9848, + "step": 3550 + }, + { + "epoch": 7.893569844789357, + "grad_norm": 1.3957476615905762, + "learning_rate": 0.0002, + "loss": 0.9484, + "step": 3560 + }, + { + "epoch": 7.9157427937915745, + "grad_norm": 1.381712555885315, + "learning_rate": 0.0002, + "loss": 0.9471, + "step": 3570 + }, + { + "epoch": 7.937915742793791, + "grad_norm": 1.5783199071884155, + "learning_rate": 0.0002, + "loss": 0.9791, + "step": 3580 + }, + { + "epoch": 7.960088691796009, + "grad_norm": 1.5801693201065063, + "learning_rate": 0.0002, + "loss": 0.8974, + "step": 3590 + }, + { + "epoch": 7.982261640798226, + "grad_norm": 1.4844671487808228, + "learning_rate": 0.0002, + "loss": 0.9835, + "step": 3600 + }, + { + "epoch": 8.0, + "eval_loss": 2.4150607585906982, + "eval_runtime": 83.4967, + "eval_samples_per_second": 6.168, + "eval_steps_per_second": 0.778, + "step": 3608 + } + ], + "logging_steps": 10, + "max_steps": 3608, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.669701613607977e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bcdc3b48752889d03e0bc8f748c28d3a3eeb1026 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-3608/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4002b3199d379e5c6c8494417ad0ab221d2f73797ae8bbbb3b91f811953dbe5c +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b14b12ea5144e05f4cf3ff78c79e105015e3dde2 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4ed5678994d9c12d7bbe236f06c54565859b0803ff56b7b8d2f5f2c8d3fe85f +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d69f26777b73f67944e7ad73f577a089d2600383 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7e446dc53f949486c1c79e1194a799ad0374fec0145a525bfa33f2824230812 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3633f1f9ea65dc690a4ca9f762e414fdecc08967 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddf6b3302570570052badd87b10ba67446935e4e7c5bbb25db6fd1fb0f93fd74 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..709c2d6fabb2a4046b3141a35c1e73249fb91993 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4a1018cf730b218cd98db8a19bd4f5cc950e217bcbf2ebe4dd93f97fa1d9a08 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0cb844da95dc98b07405e6a22d751dcf4bcacec8 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/trainer_state.json @@ -0,0 +1,356 @@ +{ + "best_metric": 1.8323718309402466, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451", + "epoch": 1.0, + "eval_steps": 10, + "global_step": 451, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.022172949002217297, + "grad_norm": 0.6454975008964539, + "learning_rate": 0.0002, + "loss": 2.684, + "step": 10 + }, + { + "epoch": 0.04434589800443459, + "grad_norm": 0.5452715158462524, + "learning_rate": 0.0002, + "loss": 2.2694, + "step": 20 + }, + { + "epoch": 0.06651884700665188, + "grad_norm": 0.5502195358276367, + "learning_rate": 0.0002, + "loss": 2.0796, + "step": 30 + }, + { + "epoch": 0.08869179600886919, + "grad_norm": 0.48551198840141296, + "learning_rate": 0.0002, + "loss": 1.9132, + "step": 40 + }, + { + "epoch": 0.11086474501108648, + "grad_norm": 0.47822514176368713, + "learning_rate": 0.0002, + "loss": 2.016, + "step": 50 + }, + { + "epoch": 0.13303769401330376, + "grad_norm": 0.5125395655632019, + "learning_rate": 0.0002, + "loss": 1.9455, + "step": 60 + }, + { + "epoch": 0.15521064301552107, + "grad_norm": 0.4600693881511688, + "learning_rate": 0.0002, + "loss": 1.9017, + "step": 70 + }, + { + "epoch": 0.17738359201773837, + "grad_norm": 0.7180814743041992, + "learning_rate": 0.0002, + "loss": 2.0057, + "step": 80 + }, + { + "epoch": 0.19955654101995565, + "grad_norm": 0.4712974429130554, + "learning_rate": 0.0002, + "loss": 1.8925, + "step": 90 + }, + { + "epoch": 0.22172949002217296, + "grad_norm": 0.4673261344432831, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 100 + }, + { + "epoch": 0.24390243902439024, + "grad_norm": 0.4129070043563843, + "learning_rate": 0.0002, + "loss": 1.9346, + "step": 110 + }, + { + "epoch": 0.2660753880266075, + "grad_norm": 0.3859104812145233, + "learning_rate": 0.0002, + "loss": 1.9018, + "step": 120 + }, + { + "epoch": 0.28824833702882485, + "grad_norm": 0.40966713428497314, + "learning_rate": 0.0002, + "loss": 1.8922, + "step": 130 + }, + { + "epoch": 0.31042128603104213, + "grad_norm": 0.3685867488384247, + "learning_rate": 0.0002, + "loss": 1.846, + "step": 140 + }, + { + "epoch": 0.3325942350332594, + "grad_norm": 0.39279988408088684, + "learning_rate": 0.0002, + "loss": 1.9017, + "step": 150 + }, + { + "epoch": 0.35476718403547675, + "grad_norm": 0.4195398986339569, + "learning_rate": 0.0002, + "loss": 1.8556, + "step": 160 + }, + { + "epoch": 0.376940133037694, + "grad_norm": 0.469802588224411, + "learning_rate": 0.0002, + "loss": 1.7883, + "step": 170 + }, + { + "epoch": 0.3991130820399113, + "grad_norm": 0.4069509208202362, + "learning_rate": 0.0002, + "loss": 1.8135, + "step": 180 + }, + { + "epoch": 0.4212860310421286, + "grad_norm": 0.47832027077674866, + "learning_rate": 0.0002, + "loss": 1.8429, + "step": 190 + }, + { + "epoch": 0.4434589800443459, + "grad_norm": 0.3376411199569702, + "learning_rate": 0.0002, + "loss": 1.781, + "step": 200 + }, + { + "epoch": 0.4656319290465632, + "grad_norm": 0.3787185847759247, + "learning_rate": 0.0002, + "loss": 1.8562, + "step": 210 + }, + { + "epoch": 0.4878048780487805, + "grad_norm": 0.40322697162628174, + "learning_rate": 0.0002, + "loss": 1.8352, + "step": 220 + }, + { + "epoch": 0.5099778270509978, + "grad_norm": 0.3710436522960663, + "learning_rate": 0.0002, + "loss": 1.8007, + "step": 230 + }, + { + "epoch": 0.532150776053215, + "grad_norm": 0.3723200261592865, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 240 + }, + { + "epoch": 0.5543237250554324, + "grad_norm": 0.3457179069519043, + "learning_rate": 0.0002, + "loss": 1.852, + "step": 250 + }, + { + "epoch": 0.5764966740576497, + "grad_norm": 0.35369473695755005, + "learning_rate": 0.0002, + "loss": 1.8175, + "step": 260 + }, + { + "epoch": 0.5986696230598669, + "grad_norm": 0.3667483329772949, + "learning_rate": 0.0002, + "loss": 1.7742, + "step": 270 + }, + { + "epoch": 0.6208425720620843, + "grad_norm": 0.4023273289203644, + "learning_rate": 0.0002, + "loss": 1.8152, + "step": 280 + }, + { + "epoch": 0.6430155210643016, + "grad_norm": 0.3601929843425751, + "learning_rate": 0.0002, + "loss": 1.8382, + "step": 290 + }, + { + "epoch": 0.6651884700665188, + "grad_norm": 0.32610392570495605, + "learning_rate": 0.0002, + "loss": 1.8152, + "step": 300 + }, + { + "epoch": 0.6873614190687362, + "grad_norm": 0.40528756380081177, + "learning_rate": 0.0002, + "loss": 1.8412, + "step": 310 + }, + { + "epoch": 0.7095343680709535, + "grad_norm": 0.34639739990234375, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 320 + }, + { + "epoch": 0.7317073170731707, + "grad_norm": 0.3794991374015808, + "learning_rate": 0.0002, + "loss": 1.807, + "step": 330 + }, + { + "epoch": 0.753880266075388, + "grad_norm": 0.34203875064849854, + "learning_rate": 0.0002, + "loss": 1.8678, + "step": 340 + }, + { + "epoch": 0.7760532150776053, + "grad_norm": 0.36692821979522705, + "learning_rate": 0.0002, + "loss": 1.7605, + "step": 350 + }, + { + "epoch": 0.7982261640798226, + "grad_norm": 0.3701125979423523, + "learning_rate": 0.0002, + "loss": 1.7535, + "step": 360 + }, + { + "epoch": 0.8203991130820399, + "grad_norm": 0.3971416652202606, + "learning_rate": 0.0002, + "loss": 1.7574, + "step": 370 + }, + { + "epoch": 0.8425720620842572, + "grad_norm": 0.3751989006996155, + "learning_rate": 0.0002, + "loss": 1.8476, + "step": 380 + }, + { + "epoch": 0.8647450110864745, + "grad_norm": 0.35116496682167053, + "learning_rate": 0.0002, + "loss": 1.8281, + "step": 390 + }, + { + "epoch": 0.8869179600886918, + "grad_norm": 0.3672674894332886, + "learning_rate": 0.0002, + "loss": 1.7459, + "step": 400 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.34648260474205017, + "learning_rate": 0.0002, + "loss": 1.8293, + "step": 410 + }, + { + "epoch": 0.9312638580931264, + "grad_norm": 0.4497389793395996, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 420 + }, + { + "epoch": 0.9534368070953437, + "grad_norm": 0.33595147728919983, + "learning_rate": 0.0002, + "loss": 1.8405, + "step": 430 + }, + { + "epoch": 0.975609756097561, + "grad_norm": 0.3130456805229187, + "learning_rate": 0.0002, + "loss": 1.8137, + "step": 440 + }, + { + "epoch": 0.9977827050997783, + "grad_norm": 0.36480239033699036, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 450 + }, + { + "epoch": 1.0, + "eval_loss": 1.8323718309402466, + "eval_runtime": 79.9603, + "eval_samples_per_second": 6.441, + "eval_steps_per_second": 0.813, + "step": 451 + } + ], + "logging_steps": 10, + "max_steps": 3608, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.087127017009971e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bcdc3b48752889d03e0bc8f748c28d3a3eeb1026 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4002b3199d379e5c6c8494417ad0ab221d2f73797ae8bbbb3b91f811953dbe5c +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b9cd9d237e2902cda3565e5ce93b96f803222587 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:366052a3942a4a6232fad69aba123e05e8fa724863182bed58bcc2bcc5938c4c +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..be89ac8f1b0603458cc39f3e8dc03c29c2ed9696 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:805175f3bdc6dce7148cd29a2b4ff4c2b18b9106540a4b15b5ed5e6772579b99 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e07ac01ed71a4f200bd866b09a66ec52c8e82f6b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9875dd90f78c85d22815773f84213401b64016431cca2c21e7e7a6133b2ef951 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..aef2fbe1a5ff9cfda3f4daa39f16f8bc41ffc86c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac076fedfed67e01ff5d0ef79efbf3da84bbfc745aa1932d3cea2c023519ecaa +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..88ad2a01af475989c31fc9036d0ae5f22d788ab0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/trainer_state.json @@ -0,0 +1,679 @@ +{ + "best_metric": 1.8243104219436646, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 902, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.022172949002217297, + "grad_norm": 0.6454975008964539, + "learning_rate": 0.0002, + "loss": 2.684, + "step": 10 + }, + { + "epoch": 0.04434589800443459, + "grad_norm": 0.5452715158462524, + "learning_rate": 0.0002, + "loss": 2.2694, + "step": 20 + }, + { + "epoch": 0.06651884700665188, + "grad_norm": 0.5502195358276367, + "learning_rate": 0.0002, + "loss": 2.0796, + "step": 30 + }, + { + "epoch": 0.08869179600886919, + "grad_norm": 0.48551198840141296, + "learning_rate": 0.0002, + "loss": 1.9132, + "step": 40 + }, + { + "epoch": 0.11086474501108648, + "grad_norm": 0.47822514176368713, + "learning_rate": 0.0002, + "loss": 2.016, + "step": 50 + }, + { + "epoch": 0.13303769401330376, + "grad_norm": 0.5125395655632019, + "learning_rate": 0.0002, + "loss": 1.9455, + "step": 60 + }, + { + "epoch": 0.15521064301552107, + "grad_norm": 0.4600693881511688, + "learning_rate": 0.0002, + "loss": 1.9017, + "step": 70 + }, + { + "epoch": 0.17738359201773837, + "grad_norm": 0.7180814743041992, + "learning_rate": 0.0002, + "loss": 2.0057, + "step": 80 + }, + { + "epoch": 0.19955654101995565, + "grad_norm": 0.4712974429130554, + "learning_rate": 0.0002, + "loss": 1.8925, + "step": 90 + }, + { + "epoch": 0.22172949002217296, + "grad_norm": 0.4673261344432831, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 100 + }, + { + "epoch": 0.24390243902439024, + "grad_norm": 0.4129070043563843, + "learning_rate": 0.0002, + "loss": 1.9346, + "step": 110 + }, + { + "epoch": 0.2660753880266075, + "grad_norm": 0.3859104812145233, + "learning_rate": 0.0002, + "loss": 1.9018, + "step": 120 + }, + { + "epoch": 0.28824833702882485, + "grad_norm": 0.40966713428497314, + "learning_rate": 0.0002, + "loss": 1.8922, + "step": 130 + }, + { + "epoch": 0.31042128603104213, + "grad_norm": 0.3685867488384247, + "learning_rate": 0.0002, + "loss": 1.846, + "step": 140 + }, + { + "epoch": 0.3325942350332594, + "grad_norm": 0.39279988408088684, + "learning_rate": 0.0002, + "loss": 1.9017, + "step": 150 + }, + { + "epoch": 0.35476718403547675, + "grad_norm": 0.4195398986339569, + "learning_rate": 0.0002, + "loss": 1.8556, + "step": 160 + }, + { + "epoch": 0.376940133037694, + "grad_norm": 0.469802588224411, + "learning_rate": 0.0002, + "loss": 1.7883, + "step": 170 + }, + { + "epoch": 0.3991130820399113, + "grad_norm": 0.4069509208202362, + "learning_rate": 0.0002, + "loss": 1.8135, + "step": 180 + }, + { + "epoch": 0.4212860310421286, + "grad_norm": 0.47832027077674866, + "learning_rate": 0.0002, + "loss": 1.8429, + "step": 190 + }, + { + "epoch": 0.4434589800443459, + "grad_norm": 0.3376411199569702, + "learning_rate": 0.0002, + "loss": 1.781, + "step": 200 + }, + { + "epoch": 0.4656319290465632, + "grad_norm": 0.3787185847759247, + "learning_rate": 0.0002, + "loss": 1.8562, + "step": 210 + }, + { + "epoch": 0.4878048780487805, + "grad_norm": 0.40322697162628174, + "learning_rate": 0.0002, + "loss": 1.8352, + "step": 220 + }, + { + "epoch": 0.5099778270509978, + "grad_norm": 0.3710436522960663, + "learning_rate": 0.0002, + "loss": 1.8007, + "step": 230 + }, + { + "epoch": 0.532150776053215, + "grad_norm": 0.3723200261592865, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 240 + }, + { + "epoch": 0.5543237250554324, + "grad_norm": 0.3457179069519043, + "learning_rate": 0.0002, + "loss": 1.852, + "step": 250 + }, + { + "epoch": 0.5764966740576497, + "grad_norm": 0.35369473695755005, + "learning_rate": 0.0002, + "loss": 1.8175, + "step": 260 + }, + { + "epoch": 0.5986696230598669, + "grad_norm": 0.3667483329772949, + "learning_rate": 0.0002, + "loss": 1.7742, + "step": 270 + }, + { + "epoch": 0.6208425720620843, + "grad_norm": 0.4023273289203644, + "learning_rate": 0.0002, + "loss": 1.8152, + "step": 280 + }, + { + "epoch": 0.6430155210643016, + "grad_norm": 0.3601929843425751, + "learning_rate": 0.0002, + "loss": 1.8382, + "step": 290 + }, + { + "epoch": 0.6651884700665188, + "grad_norm": 0.32610392570495605, + "learning_rate": 0.0002, + "loss": 1.8152, + "step": 300 + }, + { + "epoch": 0.6873614190687362, + "grad_norm": 0.40528756380081177, + "learning_rate": 0.0002, + "loss": 1.8412, + "step": 310 + }, + { + "epoch": 0.7095343680709535, + "grad_norm": 0.34639739990234375, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 320 + }, + { + "epoch": 0.7317073170731707, + "grad_norm": 0.3794991374015808, + "learning_rate": 0.0002, + "loss": 1.807, + "step": 330 + }, + { + "epoch": 0.753880266075388, + "grad_norm": 0.34203875064849854, + "learning_rate": 0.0002, + "loss": 1.8678, + "step": 340 + }, + { + "epoch": 0.7760532150776053, + "grad_norm": 0.36692821979522705, + "learning_rate": 0.0002, + "loss": 1.7605, + "step": 350 + }, + { + "epoch": 0.7982261640798226, + "grad_norm": 0.3701125979423523, + "learning_rate": 0.0002, + "loss": 1.7535, + "step": 360 + }, + { + "epoch": 0.8203991130820399, + "grad_norm": 0.3971416652202606, + "learning_rate": 0.0002, + "loss": 1.7574, + "step": 370 + }, + { + "epoch": 0.8425720620842572, + "grad_norm": 0.3751989006996155, + "learning_rate": 0.0002, + "loss": 1.8476, + "step": 380 + }, + { + "epoch": 0.8647450110864745, + "grad_norm": 0.35116496682167053, + "learning_rate": 0.0002, + "loss": 1.8281, + "step": 390 + }, + { + "epoch": 0.8869179600886918, + "grad_norm": 0.3672674894332886, + "learning_rate": 0.0002, + "loss": 1.7459, + "step": 400 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.34648260474205017, + "learning_rate": 0.0002, + "loss": 1.8293, + "step": 410 + }, + { + "epoch": 0.9312638580931264, + "grad_norm": 0.4497389793395996, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 420 + }, + { + "epoch": 0.9534368070953437, + "grad_norm": 0.33595147728919983, + "learning_rate": 0.0002, + "loss": 1.8405, + "step": 430 + }, + { + "epoch": 0.975609756097561, + "grad_norm": 0.3130456805229187, + "learning_rate": 0.0002, + "loss": 1.8137, + "step": 440 + }, + { + "epoch": 0.9977827050997783, + "grad_norm": 0.36480239033699036, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 450 + }, + { + "epoch": 1.0, + "eval_loss": 1.8323718309402466, + "eval_runtime": 79.9603, + "eval_samples_per_second": 6.441, + "eval_steps_per_second": 0.813, + "step": 451 + }, + { + "epoch": 1.0199556541019956, + "grad_norm": 0.3840029835700989, + "learning_rate": 0.0002, + "loss": 1.7921, + "step": 460 + }, + { + "epoch": 1.042128603104213, + "grad_norm": 0.33457425236701965, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 470 + }, + { + "epoch": 1.06430155210643, + "grad_norm": 0.35766592621803284, + "learning_rate": 0.0002, + "loss": 1.7028, + "step": 480 + }, + { + "epoch": 1.0864745011086474, + "grad_norm": 0.38070937991142273, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 490 + }, + { + "epoch": 1.1086474501108647, + "grad_norm": 0.38546547293663025, + "learning_rate": 0.0002, + "loss": 1.72, + "step": 500 + }, + { + "epoch": 1.130820399113082, + "grad_norm": 0.384104460477829, + "learning_rate": 0.0002, + "loss": 1.7088, + "step": 510 + }, + { + "epoch": 1.1529933481152994, + "grad_norm": 0.3556116819381714, + "learning_rate": 0.0002, + "loss": 1.7779, + "step": 520 + }, + { + "epoch": 1.1751662971175167, + "grad_norm": 0.4110541343688965, + "learning_rate": 0.0002, + "loss": 1.7538, + "step": 530 + }, + { + "epoch": 1.1973392461197339, + "grad_norm": 0.46503177285194397, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 540 + }, + { + "epoch": 1.2195121951219512, + "grad_norm": 0.4366816580295563, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 550 + }, + { + "epoch": 1.2416851441241685, + "grad_norm": 0.379986047744751, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 560 + }, + { + "epoch": 1.2638580931263859, + "grad_norm": 0.3920869529247284, + "learning_rate": 0.0002, + "loss": 1.6852, + "step": 570 + }, + { + "epoch": 1.2860310421286032, + "grad_norm": 0.4013986587524414, + "learning_rate": 0.0002, + "loss": 1.8265, + "step": 580 + }, + { + "epoch": 1.3082039911308203, + "grad_norm": 0.39104390144348145, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 590 + }, + { + "epoch": 1.3303769401330376, + "grad_norm": 0.40515613555908203, + "learning_rate": 0.0002, + "loss": 1.7822, + "step": 600 + }, + { + "epoch": 1.352549889135255, + "grad_norm": 0.4212331473827362, + "learning_rate": 0.0002, + "loss": 1.7614, + "step": 610 + }, + { + "epoch": 1.3747228381374723, + "grad_norm": 0.36040815711021423, + "learning_rate": 0.0002, + "loss": 1.7883, + "step": 620 + }, + { + "epoch": 1.3968957871396896, + "grad_norm": 0.3950865864753723, + "learning_rate": 0.0002, + "loss": 1.7467, + "step": 630 + }, + { + "epoch": 1.4190687361419068, + "grad_norm": 0.3934709131717682, + "learning_rate": 0.0002, + "loss": 1.7242, + "step": 640 + }, + { + "epoch": 1.441241685144124, + "grad_norm": 0.3905350863933563, + "learning_rate": 0.0002, + "loss": 1.7783, + "step": 650 + }, + { + "epoch": 1.4634146341463414, + "grad_norm": 0.4322686493396759, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 660 + }, + { + "epoch": 1.4855875831485588, + "grad_norm": 0.35697034001350403, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 670 + }, + { + "epoch": 1.507760532150776, + "grad_norm": 0.38570451736450195, + "learning_rate": 0.0002, + "loss": 1.6764, + "step": 680 + }, + { + "epoch": 1.5299334811529932, + "grad_norm": 0.3804517090320587, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 690 + }, + { + "epoch": 1.5521064301552108, + "grad_norm": 0.4938165247440338, + "learning_rate": 0.0002, + "loss": 1.7725, + "step": 700 + }, + { + "epoch": 1.5742793791574279, + "grad_norm": 0.43075236678123474, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 710 + }, + { + "epoch": 1.5964523281596452, + "grad_norm": 0.40434643626213074, + "learning_rate": 0.0002, + "loss": 1.7741, + "step": 720 + }, + { + "epoch": 1.6186252771618626, + "grad_norm": 0.3874157667160034, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 730 + }, + { + "epoch": 1.6407982261640797, + "grad_norm": 0.3645969331264496, + "learning_rate": 0.0002, + "loss": 1.8141, + "step": 740 + }, + { + "epoch": 1.6629711751662972, + "grad_norm": 0.38588255643844604, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 750 + }, + { + "epoch": 1.6851441241685143, + "grad_norm": 0.39252519607543945, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 760 + }, + { + "epoch": 1.7073170731707317, + "grad_norm": 0.36165162920951843, + "learning_rate": 0.0002, + "loss": 1.7424, + "step": 770 + }, + { + "epoch": 1.729490022172949, + "grad_norm": 0.36677947640419006, + "learning_rate": 0.0002, + "loss": 1.7172, + "step": 780 + }, + { + "epoch": 1.7516629711751663, + "grad_norm": 0.374881774187088, + "learning_rate": 0.0002, + "loss": 1.7737, + "step": 790 + }, + { + "epoch": 1.7738359201773837, + "grad_norm": 0.4530802369117737, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 800 + }, + { + "epoch": 1.7960088691796008, + "grad_norm": 0.3879568576812744, + "learning_rate": 0.0002, + "loss": 1.7799, + "step": 810 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.3710079789161682, + "learning_rate": 0.0002, + "loss": 1.7562, + "step": 820 + }, + { + "epoch": 1.8403547671840355, + "grad_norm": 0.3831799030303955, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 830 + }, + { + "epoch": 1.8625277161862528, + "grad_norm": 0.3958432376384735, + "learning_rate": 0.0002, + "loss": 1.7605, + "step": 840 + }, + { + "epoch": 1.8847006651884701, + "grad_norm": 0.4129294157028198, + "learning_rate": 0.0002, + "loss": 1.8187, + "step": 850 + }, + { + "epoch": 1.9068736141906872, + "grad_norm": 0.3714745044708252, + "learning_rate": 0.0002, + "loss": 1.6506, + "step": 860 + }, + { + "epoch": 1.9290465631929048, + "grad_norm": 0.40176868438720703, + "learning_rate": 0.0002, + "loss": 1.707, + "step": 870 + }, + { + "epoch": 1.951219512195122, + "grad_norm": 0.36937767267227173, + "learning_rate": 0.0002, + "loss": 1.7557, + "step": 880 + }, + { + "epoch": 1.9733924611973392, + "grad_norm": 0.40242597460746765, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 890 + }, + { + "epoch": 1.9955654101995566, + "grad_norm": 0.3515510559082031, + "learning_rate": 0.0002, + "loss": 1.7402, + "step": 900 + }, + { + "epoch": 2.0, + "eval_loss": 1.8243104219436646, + "eval_runtime": 107.8856, + "eval_samples_per_second": 4.774, + "eval_steps_per_second": 0.602, + "step": 902 + } + ], + "logging_steps": 10, + "max_steps": 3608, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.174254034019942e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bcdc3b48752889d03e0bc8f748c28d3a3eeb1026 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4002b3199d379e5c6c8494417ad0ab221d2f73797ae8bbbb3b91f811953dbe5c +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bcdc3b48752889d03e0bc8f748c28d3a3eeb1026 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4002b3199d379e5c6c8494417ad0ab221d2f73797ae8bbbb3b91f811953dbe5c +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/training_log.jsonl b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..06eedacf32f4436fd54ff2ec620a1da091f88013 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/training_log.jsonl @@ -0,0 +1,8 @@ +{"epoch": 1.0, "step": 451, "epoch_duration": 1132.8208858966827, "total_accumulated_duration": 1132.8208858966827, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 9688.99365234375}, "avg_memory_reserved": {"GPU_0": 10406.0}, "peak_memory_reserved": {"GPU_0": 10406.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.684, "grad_norm": 0.6454975008964539, "learning_rate": 0.0002, "epoch": 0.022172949002217297, "step": 10}, {"loss": 2.2694, "grad_norm": 0.5452715158462524, "learning_rate": 0.0002, "epoch": 0.04434589800443459, "step": 20}, {"loss": 2.0796, "grad_norm": 0.5502195358276367, "learning_rate": 0.0002, "epoch": 0.06651884700665188, "step": 30}, {"loss": 1.9132, "grad_norm": 0.48551198840141296, "learning_rate": 0.0002, "epoch": 0.08869179600886919, "step": 40}, {"loss": 2.016, "grad_norm": 0.47822514176368713, "learning_rate": 0.0002, "epoch": 0.11086474501108648, "step": 50}, {"loss": 1.9455, "grad_norm": 0.5125395655632019, "learning_rate": 0.0002, "epoch": 0.13303769401330376, "step": 60}, {"loss": 1.9017, "grad_norm": 0.4600693881511688, "learning_rate": 0.0002, "epoch": 0.15521064301552107, "step": 70}, {"loss": 2.0057, "grad_norm": 0.7180814743041992, "learning_rate": 0.0002, "epoch": 0.17738359201773837, "step": 80}, {"loss": 1.8925, "grad_norm": 0.4712974429130554, "learning_rate": 0.0002, "epoch": 0.19955654101995565, "step": 90}, {"loss": 1.8386, "grad_norm": 0.4673261344432831, "learning_rate": 0.0002, "epoch": 0.22172949002217296, "step": 100}, {"loss": 1.9346, "grad_norm": 0.4129070043563843, "learning_rate": 0.0002, "epoch": 0.24390243902439024, "step": 110}, {"loss": 1.9018, "grad_norm": 0.3859104812145233, "learning_rate": 0.0002, "epoch": 0.2660753880266075, "step": 120}, {"loss": 1.8922, "grad_norm": 0.40966713428497314, "learning_rate": 0.0002, "epoch": 0.28824833702882485, "step": 130}, {"loss": 1.846, "grad_norm": 0.3685867488384247, "learning_rate": 0.0002, "epoch": 0.31042128603104213, "step": 140}, {"loss": 1.9017, "grad_norm": 0.39279988408088684, "learning_rate": 0.0002, "epoch": 0.3325942350332594, "step": 150}, {"loss": 1.8556, "grad_norm": 0.4195398986339569, "learning_rate": 0.0002, "epoch": 0.35476718403547675, "step": 160}, {"loss": 1.7883, "grad_norm": 0.469802588224411, "learning_rate": 0.0002, "epoch": 0.376940133037694, "step": 170}, {"loss": 1.8135, "grad_norm": 0.4069509208202362, "learning_rate": 0.0002, "epoch": 0.3991130820399113, "step": 180}, {"loss": 1.8429, "grad_norm": 0.47832027077674866, "learning_rate": 0.0002, "epoch": 0.4212860310421286, "step": 190}, {"loss": 1.781, "grad_norm": 0.3376411199569702, "learning_rate": 0.0002, "epoch": 0.4434589800443459, "step": 200}, {"loss": 1.8562, "grad_norm": 0.3787185847759247, "learning_rate": 0.0002, "epoch": 0.4656319290465632, "step": 210}, {"loss": 1.8352, "grad_norm": 0.40322697162628174, "learning_rate": 0.0002, "epoch": 0.4878048780487805, "step": 220}, {"loss": 1.8007, "grad_norm": 0.3710436522960663, "learning_rate": 0.0002, "epoch": 0.5099778270509978, "step": 230}, {"loss": 1.8528, "grad_norm": 0.3723200261592865, "learning_rate": 0.0002, "epoch": 0.532150776053215, "step": 240}, {"loss": 1.852, "grad_norm": 0.3457179069519043, "learning_rate": 0.0002, "epoch": 0.5543237250554324, "step": 250}, {"loss": 1.8175, "grad_norm": 0.35369473695755005, "learning_rate": 0.0002, "epoch": 0.5764966740576497, "step": 260}, {"loss": 1.7742, "grad_norm": 0.3667483329772949, "learning_rate": 0.0002, "epoch": 0.5986696230598669, "step": 270}, {"loss": 1.8152, "grad_norm": 0.4023273289203644, "learning_rate": 0.0002, "epoch": 0.6208425720620843, "step": 280}, {"loss": 1.8382, "grad_norm": 0.3601929843425751, "learning_rate": 0.0002, "epoch": 0.6430155210643016, "step": 290}, {"loss": 1.8152, "grad_norm": 0.32610392570495605, "learning_rate": 0.0002, "epoch": 0.6651884700665188, "step": 300}, {"loss": 1.8412, "grad_norm": 0.40528756380081177, "learning_rate": 0.0002, "epoch": 0.6873614190687362, "step": 310}, {"loss": 1.8528, "grad_norm": 0.34639739990234375, "learning_rate": 0.0002, "epoch": 0.7095343680709535, "step": 320}, {"loss": 1.807, "grad_norm": 0.3794991374015808, "learning_rate": 0.0002, "epoch": 0.7317073170731707, "step": 330}, {"loss": 1.8678, "grad_norm": 0.34203875064849854, "learning_rate": 0.0002, "epoch": 0.753880266075388, "step": 340}, {"loss": 1.7605, "grad_norm": 0.36692821979522705, "learning_rate": 0.0002, "epoch": 0.7760532150776053, "step": 350}, {"loss": 1.7535, "grad_norm": 0.3701125979423523, "learning_rate": 0.0002, "epoch": 0.7982261640798226, "step": 360}, {"loss": 1.7574, "grad_norm": 0.3971416652202606, "learning_rate": 0.0002, "epoch": 0.8203991130820399, "step": 370}, {"loss": 1.8476, "grad_norm": 0.3751989006996155, "learning_rate": 0.0002, "epoch": 0.8425720620842572, "step": 380}, {"loss": 1.8281, "grad_norm": 0.35116496682167053, "learning_rate": 0.0002, "epoch": 0.8647450110864745, "step": 390}, {"loss": 1.7459, "grad_norm": 0.3672674894332886, "learning_rate": 0.0002, "epoch": 0.8869179600886918, "step": 400}, {"loss": 1.8293, "grad_norm": 0.34648260474205017, "learning_rate": 0.0002, "epoch": 0.9090909090909091, "step": 410}, {"loss": 1.7601, "grad_norm": 0.4497389793395996, "learning_rate": 0.0002, "epoch": 0.9312638580931264, "step": 420}, {"loss": 1.8405, "grad_norm": 0.33595147728919983, "learning_rate": 0.0002, "epoch": 0.9534368070953437, "step": 430}, {"loss": 1.8137, "grad_norm": 0.3130456805229187, "learning_rate": 0.0002, "epoch": 0.975609756097561, "step": 440}, {"loss": 1.8226, "grad_norm": 0.36480239033699036, "learning_rate": 0.0002, "epoch": 0.9977827050997783, "step": 450}]} +{"epoch": 2.0, "step": 902, "epoch_duration": 1181.283695936203, "total_accumulated_duration": 2314.1045818328857, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-451", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.684, "grad_norm": 0.6454975008964539, "learning_rate": 0.0002, "epoch": 0.022172949002217297, "step": 10}, {"loss": 2.2694, "grad_norm": 0.5452715158462524, "learning_rate": 0.0002, "epoch": 0.04434589800443459, "step": 20}, {"loss": 2.0796, "grad_norm": 0.5502195358276367, "learning_rate": 0.0002, "epoch": 0.06651884700665188, "step": 30}, {"loss": 1.9132, "grad_norm": 0.48551198840141296, "learning_rate": 0.0002, "epoch": 0.08869179600886919, "step": 40}, {"loss": 2.016, "grad_norm": 0.47822514176368713, "learning_rate": 0.0002, "epoch": 0.11086474501108648, "step": 50}, {"loss": 1.9455, "grad_norm": 0.5125395655632019, "learning_rate": 0.0002, "epoch": 0.13303769401330376, "step": 60}, {"loss": 1.9017, "grad_norm": 0.4600693881511688, "learning_rate": 0.0002, "epoch": 0.15521064301552107, "step": 70}, {"loss": 2.0057, "grad_norm": 0.7180814743041992, "learning_rate": 0.0002, "epoch": 0.17738359201773837, "step": 80}, {"loss": 1.8925, "grad_norm": 0.4712974429130554, "learning_rate": 0.0002, "epoch": 0.19955654101995565, "step": 90}, {"loss": 1.8386, "grad_norm": 0.4673261344432831, "learning_rate": 0.0002, "epoch": 0.22172949002217296, "step": 100}, {"loss": 1.9346, "grad_norm": 0.4129070043563843, "learning_rate": 0.0002, "epoch": 0.24390243902439024, "step": 110}, {"loss": 1.9018, "grad_norm": 0.3859104812145233, "learning_rate": 0.0002, "epoch": 0.2660753880266075, "step": 120}, {"loss": 1.8922, "grad_norm": 0.40966713428497314, "learning_rate": 0.0002, "epoch": 0.28824833702882485, "step": 130}, {"loss": 1.846, "grad_norm": 0.3685867488384247, "learning_rate": 0.0002, "epoch": 0.31042128603104213, "step": 140}, {"loss": 1.9017, "grad_norm": 0.39279988408088684, "learning_rate": 0.0002, "epoch": 0.3325942350332594, "step": 150}, {"loss": 1.8556, "grad_norm": 0.4195398986339569, "learning_rate": 0.0002, "epoch": 0.35476718403547675, "step": 160}, {"loss": 1.7883, "grad_norm": 0.469802588224411, "learning_rate": 0.0002, "epoch": 0.376940133037694, "step": 170}, {"loss": 1.8135, "grad_norm": 0.4069509208202362, "learning_rate": 0.0002, "epoch": 0.3991130820399113, "step": 180}, {"loss": 1.8429, "grad_norm": 0.47832027077674866, "learning_rate": 0.0002, "epoch": 0.4212860310421286, "step": 190}, {"loss": 1.781, "grad_norm": 0.3376411199569702, "learning_rate": 0.0002, "epoch": 0.4434589800443459, "step": 200}, {"loss": 1.8562, "grad_norm": 0.3787185847759247, "learning_rate": 0.0002, "epoch": 0.4656319290465632, "step": 210}, {"loss": 1.8352, "grad_norm": 0.40322697162628174, "learning_rate": 0.0002, "epoch": 0.4878048780487805, "step": 220}, {"loss": 1.8007, "grad_norm": 0.3710436522960663, "learning_rate": 0.0002, "epoch": 0.5099778270509978, "step": 230}, {"loss": 1.8528, "grad_norm": 0.3723200261592865, "learning_rate": 0.0002, "epoch": 0.532150776053215, "step": 240}, {"loss": 1.852, "grad_norm": 0.3457179069519043, "learning_rate": 0.0002, "epoch": 0.5543237250554324, "step": 250}, {"loss": 1.8175, "grad_norm": 0.35369473695755005, "learning_rate": 0.0002, "epoch": 0.5764966740576497, "step": 260}, {"loss": 1.7742, "grad_norm": 0.3667483329772949, "learning_rate": 0.0002, "epoch": 0.5986696230598669, "step": 270}, {"loss": 1.8152, "grad_norm": 0.4023273289203644, "learning_rate": 0.0002, "epoch": 0.6208425720620843, "step": 280}, {"loss": 1.8382, "grad_norm": 0.3601929843425751, "learning_rate": 0.0002, "epoch": 0.6430155210643016, "step": 290}, {"loss": 1.8152, "grad_norm": 0.32610392570495605, "learning_rate": 0.0002, "epoch": 0.6651884700665188, "step": 300}, {"loss": 1.8412, "grad_norm": 0.40528756380081177, "learning_rate": 0.0002, "epoch": 0.6873614190687362, "step": 310}, {"loss": 1.8528, "grad_norm": 0.34639739990234375, "learning_rate": 0.0002, "epoch": 0.7095343680709535, "step": 320}, {"loss": 1.807, "grad_norm": 0.3794991374015808, "learning_rate": 0.0002, "epoch": 0.7317073170731707, "step": 330}, {"loss": 1.8678, "grad_norm": 0.34203875064849854, "learning_rate": 0.0002, "epoch": 0.753880266075388, "step": 340}, {"loss": 1.7605, "grad_norm": 0.36692821979522705, "learning_rate": 0.0002, "epoch": 0.7760532150776053, "step": 350}, {"loss": 1.7535, "grad_norm": 0.3701125979423523, "learning_rate": 0.0002, "epoch": 0.7982261640798226, "step": 360}, {"loss": 1.7574, "grad_norm": 0.3971416652202606, "learning_rate": 0.0002, "epoch": 0.8203991130820399, "step": 370}, {"loss": 1.8476, "grad_norm": 0.3751989006996155, "learning_rate": 0.0002, "epoch": 0.8425720620842572, "step": 380}, {"loss": 1.8281, "grad_norm": 0.35116496682167053, "learning_rate": 0.0002, "epoch": 0.8647450110864745, "step": 390}, {"loss": 1.7459, "grad_norm": 0.3672674894332886, "learning_rate": 0.0002, "epoch": 0.8869179600886918, "step": 400}, {"loss": 1.8293, "grad_norm": 0.34648260474205017, "learning_rate": 0.0002, "epoch": 0.9090909090909091, "step": 410}, {"loss": 1.7601, "grad_norm": 0.4497389793395996, "learning_rate": 0.0002, "epoch": 0.9312638580931264, "step": 420}, {"loss": 1.8405, "grad_norm": 0.33595147728919983, "learning_rate": 0.0002, "epoch": 0.9534368070953437, "step": 430}, {"loss": 1.8137, "grad_norm": 0.3130456805229187, "learning_rate": 0.0002, "epoch": 0.975609756097561, "step": 440}, {"loss": 1.8226, "grad_norm": 0.36480239033699036, "learning_rate": 0.0002, "epoch": 0.9977827050997783, "step": 450}, {"eval_loss": 1.8323718309402466, "eval_runtime": 79.9603, "eval_samples_per_second": 6.441, "eval_steps_per_second": 0.813, "epoch": 1.0, "step": 451}, {"loss": 1.7921, "grad_norm": 0.3840029835700989, "learning_rate": 0.0002, "epoch": 1.0199556541019956, "step": 460}, {"loss": 1.7057, "grad_norm": 0.33457425236701965, "learning_rate": 0.0002, "epoch": 1.042128603104213, "step": 470}, {"loss": 1.7028, "grad_norm": 0.35766592621803284, "learning_rate": 0.0002, "epoch": 1.06430155210643, "step": 480}, {"loss": 1.7868, "grad_norm": 0.38070937991142273, "learning_rate": 0.0002, "epoch": 1.0864745011086474, "step": 490}, {"loss": 1.72, "grad_norm": 0.38546547293663025, "learning_rate": 0.0002, "epoch": 1.1086474501108647, "step": 500}, {"loss": 1.7088, "grad_norm": 0.384104460477829, "learning_rate": 0.0002, "epoch": 1.130820399113082, "step": 510}, {"loss": 1.7779, "grad_norm": 0.3556116819381714, "learning_rate": 0.0002, "epoch": 1.1529933481152994, "step": 520}, {"loss": 1.7538, "grad_norm": 0.4110541343688965, "learning_rate": 0.0002, "epoch": 1.1751662971175167, "step": 530}, {"loss": 1.7608, "grad_norm": 0.46503177285194397, "learning_rate": 0.0002, "epoch": 1.1973392461197339, "step": 540}, {"loss": 1.729, "grad_norm": 0.4366816580295563, "learning_rate": 0.0002, "epoch": 1.2195121951219512, "step": 550}, {"loss": 1.7954, "grad_norm": 0.379986047744751, "learning_rate": 0.0002, "epoch": 1.2416851441241685, "step": 560}, {"loss": 1.6852, "grad_norm": 0.3920869529247284, "learning_rate": 0.0002, "epoch": 1.2638580931263859, "step": 570}, {"loss": 1.8265, "grad_norm": 0.4013986587524414, "learning_rate": 0.0002, "epoch": 1.2860310421286032, "step": 580}, {"loss": 1.7294, "grad_norm": 0.39104390144348145, "learning_rate": 0.0002, "epoch": 1.3082039911308203, "step": 590}, {"loss": 1.7822, "grad_norm": 0.40515613555908203, "learning_rate": 0.0002, "epoch": 1.3303769401330376, "step": 600}, {"loss": 1.7614, "grad_norm": 0.4212331473827362, "learning_rate": 0.0002, "epoch": 1.352549889135255, "step": 610}, {"loss": 1.7883, "grad_norm": 0.36040815711021423, "learning_rate": 0.0002, "epoch": 1.3747228381374723, "step": 620}, {"loss": 1.7467, "grad_norm": 0.3950865864753723, "learning_rate": 0.0002, "epoch": 1.3968957871396896, "step": 630}, {"loss": 1.7242, "grad_norm": 0.3934709131717682, "learning_rate": 0.0002, "epoch": 1.4190687361419068, "step": 640}, {"loss": 1.7783, "grad_norm": 0.3905350863933563, "learning_rate": 0.0002, "epoch": 1.441241685144124, "step": 650}, {"loss": 1.7612, "grad_norm": 0.4322686493396759, "learning_rate": 0.0002, "epoch": 1.4634146341463414, "step": 660}, {"loss": 1.7357, "grad_norm": 0.35697034001350403, "learning_rate": 0.0002, "epoch": 1.4855875831485588, "step": 670}, {"loss": 1.6764, "grad_norm": 0.38570451736450195, "learning_rate": 0.0002, "epoch": 1.507760532150776, "step": 680}, {"loss": 1.7054, "grad_norm": 0.3804517090320587, "learning_rate": 0.0002, "epoch": 1.5299334811529932, "step": 690}, {"loss": 1.7725, "grad_norm": 0.4938165247440338, "learning_rate": 0.0002, "epoch": 1.5521064301552108, "step": 700}, {"loss": 1.7982, "grad_norm": 0.43075236678123474, "learning_rate": 0.0002, "epoch": 1.5742793791574279, "step": 710}, {"loss": 1.7741, "grad_norm": 0.40434643626213074, "learning_rate": 0.0002, "epoch": 1.5964523281596452, "step": 720}, {"loss": 1.7122, "grad_norm": 0.3874157667160034, "learning_rate": 0.0002, "epoch": 1.6186252771618626, "step": 730}, {"loss": 1.8141, "grad_norm": 0.3645969331264496, "learning_rate": 0.0002, "epoch": 1.6407982261640797, "step": 740}, {"loss": 1.7868, "grad_norm": 0.38588255643844604, "learning_rate": 0.0002, "epoch": 1.6629711751662972, "step": 750}, {"loss": 1.7847, "grad_norm": 0.39252519607543945, "learning_rate": 0.0002, "epoch": 1.6851441241685143, "step": 760}, {"loss": 1.7424, "grad_norm": 0.36165162920951843, "learning_rate": 0.0002, "epoch": 1.7073170731707317, "step": 770}, {"loss": 1.7172, "grad_norm": 0.36677947640419006, "learning_rate": 0.0002, "epoch": 1.729490022172949, "step": 780}, {"loss": 1.7737, "grad_norm": 0.374881774187088, "learning_rate": 0.0002, "epoch": 1.7516629711751663, "step": 790}, {"loss": 1.7331, "grad_norm": 0.4530802369117737, "learning_rate": 0.0002, "epoch": 1.7738359201773837, "step": 800}, {"loss": 1.7799, "grad_norm": 0.3879568576812744, "learning_rate": 0.0002, "epoch": 1.7960088691796008, "step": 810}, {"loss": 1.7562, "grad_norm": 0.3710079789161682, "learning_rate": 0.0002, "epoch": 1.8181818181818183, "step": 820}, {"loss": 1.7728, "grad_norm": 0.3831799030303955, "learning_rate": 0.0002, "epoch": 1.8403547671840355, "step": 830}, {"loss": 1.7605, "grad_norm": 0.3958432376384735, "learning_rate": 0.0002, "epoch": 1.8625277161862528, "step": 840}, {"loss": 1.8187, "grad_norm": 0.4129294157028198, "learning_rate": 0.0002, "epoch": 1.8847006651884701, "step": 850}, {"loss": 1.6506, "grad_norm": 0.3714745044708252, "learning_rate": 0.0002, "epoch": 1.9068736141906872, "step": 860}, {"loss": 1.707, "grad_norm": 0.40176868438720703, "learning_rate": 0.0002, "epoch": 1.9290465631929048, "step": 870}, {"loss": 1.7557, "grad_norm": 0.36937767267227173, "learning_rate": 0.0002, "epoch": 1.951219512195122, "step": 880}, {"loss": 1.7353, "grad_norm": 0.40242597460746765, "learning_rate": 0.0002, "epoch": 1.9733924611973392, "step": 890}, {"loss": 1.7402, "grad_norm": 0.3515510559082031, "learning_rate": 0.0002, "epoch": 1.9955654101995566, "step": 900}]} +{"epoch": 3.0, "step": 1353, "epoch_duration": 1507.103675365448, "total_accumulated_duration": 3821.2082571983337, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.684, "grad_norm": 0.6454975008964539, "learning_rate": 0.0002, "epoch": 0.022172949002217297, "step": 10}, {"loss": 2.2694, "grad_norm": 0.5452715158462524, "learning_rate": 0.0002, "epoch": 0.04434589800443459, "step": 20}, {"loss": 2.0796, "grad_norm": 0.5502195358276367, "learning_rate": 0.0002, "epoch": 0.06651884700665188, "step": 30}, {"loss": 1.9132, "grad_norm": 0.48551198840141296, "learning_rate": 0.0002, "epoch": 0.08869179600886919, "step": 40}, {"loss": 2.016, "grad_norm": 0.47822514176368713, "learning_rate": 0.0002, "epoch": 0.11086474501108648, "step": 50}, {"loss": 1.9455, "grad_norm": 0.5125395655632019, "learning_rate": 0.0002, "epoch": 0.13303769401330376, "step": 60}, {"loss": 1.9017, "grad_norm": 0.4600693881511688, "learning_rate": 0.0002, "epoch": 0.15521064301552107, "step": 70}, {"loss": 2.0057, "grad_norm": 0.7180814743041992, "learning_rate": 0.0002, "epoch": 0.17738359201773837, "step": 80}, {"loss": 1.8925, "grad_norm": 0.4712974429130554, "learning_rate": 0.0002, "epoch": 0.19955654101995565, "step": 90}, {"loss": 1.8386, "grad_norm": 0.4673261344432831, "learning_rate": 0.0002, "epoch": 0.22172949002217296, "step": 100}, {"loss": 1.9346, "grad_norm": 0.4129070043563843, "learning_rate": 0.0002, "epoch": 0.24390243902439024, "step": 110}, {"loss": 1.9018, "grad_norm": 0.3859104812145233, "learning_rate": 0.0002, "epoch": 0.2660753880266075, "step": 120}, {"loss": 1.8922, "grad_norm": 0.40966713428497314, "learning_rate": 0.0002, "epoch": 0.28824833702882485, "step": 130}, {"loss": 1.846, "grad_norm": 0.3685867488384247, "learning_rate": 0.0002, "epoch": 0.31042128603104213, "step": 140}, {"loss": 1.9017, "grad_norm": 0.39279988408088684, "learning_rate": 0.0002, "epoch": 0.3325942350332594, "step": 150}, {"loss": 1.8556, "grad_norm": 0.4195398986339569, "learning_rate": 0.0002, "epoch": 0.35476718403547675, "step": 160}, {"loss": 1.7883, "grad_norm": 0.469802588224411, "learning_rate": 0.0002, "epoch": 0.376940133037694, "step": 170}, {"loss": 1.8135, "grad_norm": 0.4069509208202362, "learning_rate": 0.0002, "epoch": 0.3991130820399113, "step": 180}, {"loss": 1.8429, "grad_norm": 0.47832027077674866, "learning_rate": 0.0002, "epoch": 0.4212860310421286, "step": 190}, {"loss": 1.781, "grad_norm": 0.3376411199569702, "learning_rate": 0.0002, "epoch": 0.4434589800443459, "step": 200}, {"loss": 1.8562, "grad_norm": 0.3787185847759247, "learning_rate": 0.0002, "epoch": 0.4656319290465632, "step": 210}, {"loss": 1.8352, "grad_norm": 0.40322697162628174, "learning_rate": 0.0002, "epoch": 0.4878048780487805, "step": 220}, {"loss": 1.8007, "grad_norm": 0.3710436522960663, "learning_rate": 0.0002, "epoch": 0.5099778270509978, "step": 230}, {"loss": 1.8528, "grad_norm": 0.3723200261592865, "learning_rate": 0.0002, "epoch": 0.532150776053215, "step": 240}, {"loss": 1.852, "grad_norm": 0.3457179069519043, "learning_rate": 0.0002, "epoch": 0.5543237250554324, "step": 250}, {"loss": 1.8175, "grad_norm": 0.35369473695755005, "learning_rate": 0.0002, "epoch": 0.5764966740576497, "step": 260}, {"loss": 1.7742, "grad_norm": 0.3667483329772949, "learning_rate": 0.0002, "epoch": 0.5986696230598669, "step": 270}, {"loss": 1.8152, "grad_norm": 0.4023273289203644, "learning_rate": 0.0002, "epoch": 0.6208425720620843, "step": 280}, {"loss": 1.8382, "grad_norm": 0.3601929843425751, "learning_rate": 0.0002, "epoch": 0.6430155210643016, "step": 290}, {"loss": 1.8152, "grad_norm": 0.32610392570495605, "learning_rate": 0.0002, "epoch": 0.6651884700665188, "step": 300}, {"loss": 1.8412, "grad_norm": 0.40528756380081177, "learning_rate": 0.0002, "epoch": 0.6873614190687362, "step": 310}, {"loss": 1.8528, "grad_norm": 0.34639739990234375, "learning_rate": 0.0002, "epoch": 0.7095343680709535, "step": 320}, {"loss": 1.807, "grad_norm": 0.3794991374015808, "learning_rate": 0.0002, "epoch": 0.7317073170731707, "step": 330}, {"loss": 1.8678, "grad_norm": 0.34203875064849854, "learning_rate": 0.0002, "epoch": 0.753880266075388, "step": 340}, {"loss": 1.7605, "grad_norm": 0.36692821979522705, "learning_rate": 0.0002, "epoch": 0.7760532150776053, "step": 350}, {"loss": 1.7535, "grad_norm": 0.3701125979423523, "learning_rate": 0.0002, "epoch": 0.7982261640798226, "step": 360}, {"loss": 1.7574, "grad_norm": 0.3971416652202606, "learning_rate": 0.0002, "epoch": 0.8203991130820399, "step": 370}, {"loss": 1.8476, "grad_norm": 0.3751989006996155, "learning_rate": 0.0002, "epoch": 0.8425720620842572, "step": 380}, {"loss": 1.8281, "grad_norm": 0.35116496682167053, "learning_rate": 0.0002, "epoch": 0.8647450110864745, "step": 390}, {"loss": 1.7459, "grad_norm": 0.3672674894332886, "learning_rate": 0.0002, "epoch": 0.8869179600886918, "step": 400}, {"loss": 1.8293, "grad_norm": 0.34648260474205017, "learning_rate": 0.0002, "epoch": 0.9090909090909091, "step": 410}, {"loss": 1.7601, "grad_norm": 0.4497389793395996, "learning_rate": 0.0002, "epoch": 0.9312638580931264, "step": 420}, {"loss": 1.8405, "grad_norm": 0.33595147728919983, "learning_rate": 0.0002, "epoch": 0.9534368070953437, "step": 430}, {"loss": 1.8137, "grad_norm": 0.3130456805229187, "learning_rate": 0.0002, "epoch": 0.975609756097561, "step": 440}, {"loss": 1.8226, "grad_norm": 0.36480239033699036, "learning_rate": 0.0002, "epoch": 0.9977827050997783, "step": 450}, {"eval_loss": 1.8323718309402466, "eval_runtime": 79.9603, "eval_samples_per_second": 6.441, "eval_steps_per_second": 0.813, "epoch": 1.0, "step": 451}, {"loss": 1.7921, "grad_norm": 0.3840029835700989, "learning_rate": 0.0002, "epoch": 1.0199556541019956, "step": 460}, {"loss": 1.7057, "grad_norm": 0.33457425236701965, "learning_rate": 0.0002, "epoch": 1.042128603104213, "step": 470}, {"loss": 1.7028, "grad_norm": 0.35766592621803284, "learning_rate": 0.0002, "epoch": 1.06430155210643, "step": 480}, {"loss": 1.7868, "grad_norm": 0.38070937991142273, "learning_rate": 0.0002, "epoch": 1.0864745011086474, "step": 490}, {"loss": 1.72, "grad_norm": 0.38546547293663025, "learning_rate": 0.0002, "epoch": 1.1086474501108647, "step": 500}, {"loss": 1.7088, "grad_norm": 0.384104460477829, "learning_rate": 0.0002, "epoch": 1.130820399113082, "step": 510}, {"loss": 1.7779, "grad_norm": 0.3556116819381714, "learning_rate": 0.0002, "epoch": 1.1529933481152994, "step": 520}, {"loss": 1.7538, "grad_norm": 0.4110541343688965, "learning_rate": 0.0002, "epoch": 1.1751662971175167, "step": 530}, {"loss": 1.7608, "grad_norm": 0.46503177285194397, "learning_rate": 0.0002, "epoch": 1.1973392461197339, "step": 540}, {"loss": 1.729, "grad_norm": 0.4366816580295563, "learning_rate": 0.0002, "epoch": 1.2195121951219512, "step": 550}, {"loss": 1.7954, "grad_norm": 0.379986047744751, "learning_rate": 0.0002, "epoch": 1.2416851441241685, "step": 560}, {"loss": 1.6852, "grad_norm": 0.3920869529247284, "learning_rate": 0.0002, "epoch": 1.2638580931263859, "step": 570}, {"loss": 1.8265, "grad_norm": 0.4013986587524414, "learning_rate": 0.0002, "epoch": 1.2860310421286032, "step": 580}, {"loss": 1.7294, "grad_norm": 0.39104390144348145, "learning_rate": 0.0002, "epoch": 1.3082039911308203, "step": 590}, {"loss": 1.7822, "grad_norm": 0.40515613555908203, "learning_rate": 0.0002, "epoch": 1.3303769401330376, "step": 600}, {"loss": 1.7614, "grad_norm": 0.4212331473827362, "learning_rate": 0.0002, "epoch": 1.352549889135255, "step": 610}, {"loss": 1.7883, "grad_norm": 0.36040815711021423, "learning_rate": 0.0002, "epoch": 1.3747228381374723, "step": 620}, {"loss": 1.7467, "grad_norm": 0.3950865864753723, "learning_rate": 0.0002, "epoch": 1.3968957871396896, "step": 630}, {"loss": 1.7242, "grad_norm": 0.3934709131717682, "learning_rate": 0.0002, "epoch": 1.4190687361419068, "step": 640}, {"loss": 1.7783, "grad_norm": 0.3905350863933563, "learning_rate": 0.0002, "epoch": 1.441241685144124, "step": 650}, {"loss": 1.7612, "grad_norm": 0.4322686493396759, "learning_rate": 0.0002, "epoch": 1.4634146341463414, "step": 660}, {"loss": 1.7357, "grad_norm": 0.35697034001350403, "learning_rate": 0.0002, "epoch": 1.4855875831485588, "step": 670}, {"loss": 1.6764, "grad_norm": 0.38570451736450195, "learning_rate": 0.0002, "epoch": 1.507760532150776, "step": 680}, {"loss": 1.7054, "grad_norm": 0.3804517090320587, "learning_rate": 0.0002, "epoch": 1.5299334811529932, "step": 690}, {"loss": 1.7725, "grad_norm": 0.4938165247440338, "learning_rate": 0.0002, "epoch": 1.5521064301552108, "step": 700}, {"loss": 1.7982, "grad_norm": 0.43075236678123474, "learning_rate": 0.0002, "epoch": 1.5742793791574279, "step": 710}, {"loss": 1.7741, "grad_norm": 0.40434643626213074, "learning_rate": 0.0002, "epoch": 1.5964523281596452, "step": 720}, {"loss": 1.7122, "grad_norm": 0.3874157667160034, "learning_rate": 0.0002, "epoch": 1.6186252771618626, "step": 730}, {"loss": 1.8141, "grad_norm": 0.3645969331264496, "learning_rate": 0.0002, "epoch": 1.6407982261640797, "step": 740}, {"loss": 1.7868, "grad_norm": 0.38588255643844604, "learning_rate": 0.0002, "epoch": 1.6629711751662972, "step": 750}, {"loss": 1.7847, "grad_norm": 0.39252519607543945, "learning_rate": 0.0002, "epoch": 1.6851441241685143, "step": 760}, {"loss": 1.7424, "grad_norm": 0.36165162920951843, "learning_rate": 0.0002, "epoch": 1.7073170731707317, "step": 770}, {"loss": 1.7172, "grad_norm": 0.36677947640419006, "learning_rate": 0.0002, "epoch": 1.729490022172949, "step": 780}, {"loss": 1.7737, "grad_norm": 0.374881774187088, "learning_rate": 0.0002, "epoch": 1.7516629711751663, "step": 790}, {"loss": 1.7331, "grad_norm": 0.4530802369117737, "learning_rate": 0.0002, "epoch": 1.7738359201773837, "step": 800}, {"loss": 1.7799, "grad_norm": 0.3879568576812744, "learning_rate": 0.0002, "epoch": 1.7960088691796008, "step": 810}, {"loss": 1.7562, "grad_norm": 0.3710079789161682, "learning_rate": 0.0002, "epoch": 1.8181818181818183, "step": 820}, {"loss": 1.7728, "grad_norm": 0.3831799030303955, "learning_rate": 0.0002, "epoch": 1.8403547671840355, "step": 830}, {"loss": 1.7605, "grad_norm": 0.3958432376384735, "learning_rate": 0.0002, "epoch": 1.8625277161862528, "step": 840}, {"loss": 1.8187, "grad_norm": 0.4129294157028198, "learning_rate": 0.0002, "epoch": 1.8847006651884701, "step": 850}, {"loss": 1.6506, "grad_norm": 0.3714745044708252, "learning_rate": 0.0002, "epoch": 1.9068736141906872, "step": 860}, {"loss": 1.707, "grad_norm": 0.40176868438720703, "learning_rate": 0.0002, "epoch": 1.9290465631929048, "step": 870}, {"loss": 1.7557, "grad_norm": 0.36937767267227173, "learning_rate": 0.0002, "epoch": 1.951219512195122, "step": 880}, {"loss": 1.7353, "grad_norm": 0.40242597460746765, "learning_rate": 0.0002, "epoch": 1.9733924611973392, "step": 890}, {"loss": 1.7402, "grad_norm": 0.3515510559082031, "learning_rate": 0.0002, "epoch": 1.9955654101995566, "step": 900}, {"eval_loss": 1.8243104219436646, "eval_runtime": 107.8856, "eval_samples_per_second": 4.774, "eval_steps_per_second": 0.602, "epoch": 2.0, "step": 902}, {"loss": 1.6631, "grad_norm": 0.4145216643810272, "learning_rate": 0.0002, "epoch": 2.0177383592017737, "step": 910}, {"loss": 1.7227, "grad_norm": 0.42423519492149353, "learning_rate": 0.0002, "epoch": 2.0399113082039912, "step": 920}, {"loss": 1.6357, "grad_norm": 0.4773229956626892, "learning_rate": 0.0002, "epoch": 2.0620842572062084, "step": 930}, {"loss": 1.671, "grad_norm": 0.4144791066646576, "learning_rate": 0.0002, "epoch": 2.084257206208426, "step": 940}, {"loss": 1.6433, "grad_norm": 0.42704132199287415, "learning_rate": 0.0002, "epoch": 2.106430155210643, "step": 950}, {"loss": 1.6767, "grad_norm": 0.4479042589664459, "learning_rate": 0.0002, "epoch": 2.12860310421286, "step": 960}, {"loss": 1.6122, "grad_norm": 0.4810638129711151, "learning_rate": 0.0002, "epoch": 2.1507760532150777, "step": 970}, {"loss": 1.6613, "grad_norm": 0.48669910430908203, "learning_rate": 0.0002, "epoch": 2.172949002217295, "step": 980}, {"loss": 1.6274, "grad_norm": 0.4252761900424957, "learning_rate": 0.0002, "epoch": 2.1951219512195124, "step": 990}, {"loss": 1.6514, "grad_norm": 0.42342790961265564, "learning_rate": 0.0002, "epoch": 2.2172949002217295, "step": 1000}, {"loss": 1.637, "grad_norm": 0.43432456254959106, "learning_rate": 0.0002, "epoch": 2.2394678492239466, "step": 1010}, {"loss": 1.6856, "grad_norm": 0.45556965470314026, "learning_rate": 0.0002, "epoch": 2.261640798226164, "step": 1020}, {"loss": 1.6554, "grad_norm": 0.48035719990730286, "learning_rate": 0.0002, "epoch": 2.2838137472283813, "step": 1030}, {"loss": 1.6947, "grad_norm": 0.4233241081237793, "learning_rate": 0.0002, "epoch": 2.305986696230599, "step": 1040}, {"loss": 1.6501, "grad_norm": 0.3918434679508209, "learning_rate": 0.0002, "epoch": 2.328159645232816, "step": 1050}, {"loss": 1.6903, "grad_norm": 0.44049757719039917, "learning_rate": 0.0002, "epoch": 2.3503325942350335, "step": 1060}, {"loss": 1.6337, "grad_norm": 0.4730056822299957, "learning_rate": 0.0002, "epoch": 2.3725055432372506, "step": 1070}, {"loss": 1.6104, "grad_norm": 0.4354589581489563, "learning_rate": 0.0002, "epoch": 2.3946784922394677, "step": 1080}, {"loss": 1.6496, "grad_norm": 0.4837590456008911, "learning_rate": 0.0002, "epoch": 2.4168514412416853, "step": 1090}, {"loss": 1.6672, "grad_norm": 0.4842571020126343, "learning_rate": 0.0002, "epoch": 2.4390243902439024, "step": 1100}, {"loss": 1.6091, "grad_norm": 0.46398279070854187, "learning_rate": 0.0002, "epoch": 2.4611973392461195, "step": 1110}, {"loss": 1.6393, "grad_norm": 0.4587327539920807, "learning_rate": 0.0002, "epoch": 2.483370288248337, "step": 1120}, {"loss": 1.621, "grad_norm": 0.4336528480052948, "learning_rate": 0.0002, "epoch": 2.505543237250554, "step": 1130}, {"loss": 1.6199, "grad_norm": 0.6162153482437134, "learning_rate": 0.0002, "epoch": 2.5277161862527717, "step": 1140}, {"loss": 1.7244, "grad_norm": 0.48175573348999023, "learning_rate": 0.0002, "epoch": 2.549889135254989, "step": 1150}, {"loss": 1.6098, "grad_norm": 0.448272705078125, "learning_rate": 0.0002, "epoch": 2.5720620842572064, "step": 1160}, {"loss": 1.6987, "grad_norm": 0.5189200639724731, "learning_rate": 0.0002, "epoch": 2.5942350332594235, "step": 1170}, {"loss": 1.6503, "grad_norm": 0.45032963156700134, "learning_rate": 0.0002, "epoch": 2.6164079822616406, "step": 1180}, {"loss": 1.6508, "grad_norm": 0.4417729377746582, "learning_rate": 0.0002, "epoch": 2.638580931263858, "step": 1190}, {"loss": 1.6084, "grad_norm": 0.5219636559486389, "learning_rate": 0.0002, "epoch": 2.6607538802660753, "step": 1200}, {"loss": 1.6121, "grad_norm": 0.47702011466026306, "learning_rate": 0.0002, "epoch": 2.682926829268293, "step": 1210}, {"loss": 1.6942, "grad_norm": 0.4328458607196808, "learning_rate": 0.0002, "epoch": 2.70509977827051, "step": 1220}, {"loss": 1.6099, "grad_norm": 0.46762076020240784, "learning_rate": 0.0002, "epoch": 2.7272727272727275, "step": 1230}, {"loss": 1.7287, "grad_norm": 0.4592697322368622, "learning_rate": 0.0002, "epoch": 2.7494456762749446, "step": 1240}, {"loss": 1.617, "grad_norm": 0.5519265532493591, "learning_rate": 0.0002, "epoch": 2.7716186252771617, "step": 1250}, {"loss": 1.6868, "grad_norm": 0.47169506549835205, "learning_rate": 0.0002, "epoch": 2.7937915742793793, "step": 1260}, {"loss": 1.658, "grad_norm": 0.47231653332710266, "learning_rate": 0.0002, "epoch": 2.8159645232815964, "step": 1270}, {"loss": 1.6738, "grad_norm": 0.49081969261169434, "learning_rate": 0.0002, "epoch": 2.8381374722838135, "step": 1280}, {"loss": 1.7248, "grad_norm": 0.4483231008052826, "learning_rate": 0.0002, "epoch": 2.860310421286031, "step": 1290}, {"loss": 1.6428, "grad_norm": 0.5310035943984985, "learning_rate": 0.0002, "epoch": 2.882483370288248, "step": 1300}, {"loss": 1.6515, "grad_norm": 0.4419795572757721, "learning_rate": 0.0002, "epoch": 2.9046563192904657, "step": 1310}, {"loss": 1.6956, "grad_norm": 0.44630762934684753, "learning_rate": 0.0002, "epoch": 2.926829268292683, "step": 1320}, {"loss": 1.6464, "grad_norm": 0.39774850010871887, "learning_rate": 0.0002, "epoch": 2.9490022172949004, "step": 1330}, {"loss": 1.7007, "grad_norm": 0.441727876663208, "learning_rate": 0.0002, "epoch": 2.9711751662971175, "step": 1340}, {"loss": 1.5968, "grad_norm": 0.43773892521858215, "learning_rate": 0.0002, "epoch": 2.9933481152993346, "step": 1350}]} +{"epoch": 4.0, "step": 1804, "epoch_duration": 1185.2760462760925, "total_accumulated_duration": 5006.484303474426, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.684, "grad_norm": 0.6454975008964539, "learning_rate": 0.0002, "epoch": 0.022172949002217297, "step": 10}, {"loss": 2.2694, "grad_norm": 0.5452715158462524, "learning_rate": 0.0002, "epoch": 0.04434589800443459, "step": 20}, {"loss": 2.0796, "grad_norm": 0.5502195358276367, "learning_rate": 0.0002, "epoch": 0.06651884700665188, "step": 30}, {"loss": 1.9132, "grad_norm": 0.48551198840141296, "learning_rate": 0.0002, "epoch": 0.08869179600886919, "step": 40}, {"loss": 2.016, "grad_norm": 0.47822514176368713, "learning_rate": 0.0002, "epoch": 0.11086474501108648, "step": 50}, {"loss": 1.9455, "grad_norm": 0.5125395655632019, "learning_rate": 0.0002, "epoch": 0.13303769401330376, "step": 60}, {"loss": 1.9017, "grad_norm": 0.4600693881511688, "learning_rate": 0.0002, "epoch": 0.15521064301552107, "step": 70}, {"loss": 2.0057, "grad_norm": 0.7180814743041992, "learning_rate": 0.0002, "epoch": 0.17738359201773837, "step": 80}, {"loss": 1.8925, "grad_norm": 0.4712974429130554, "learning_rate": 0.0002, "epoch": 0.19955654101995565, "step": 90}, {"loss": 1.8386, "grad_norm": 0.4673261344432831, "learning_rate": 0.0002, "epoch": 0.22172949002217296, "step": 100}, {"loss": 1.9346, "grad_norm": 0.4129070043563843, "learning_rate": 0.0002, "epoch": 0.24390243902439024, "step": 110}, {"loss": 1.9018, "grad_norm": 0.3859104812145233, "learning_rate": 0.0002, "epoch": 0.2660753880266075, "step": 120}, {"loss": 1.8922, "grad_norm": 0.40966713428497314, "learning_rate": 0.0002, "epoch": 0.28824833702882485, "step": 130}, {"loss": 1.846, "grad_norm": 0.3685867488384247, "learning_rate": 0.0002, "epoch": 0.31042128603104213, "step": 140}, {"loss": 1.9017, "grad_norm": 0.39279988408088684, "learning_rate": 0.0002, "epoch": 0.3325942350332594, "step": 150}, {"loss": 1.8556, "grad_norm": 0.4195398986339569, "learning_rate": 0.0002, "epoch": 0.35476718403547675, "step": 160}, {"loss": 1.7883, "grad_norm": 0.469802588224411, "learning_rate": 0.0002, "epoch": 0.376940133037694, "step": 170}, {"loss": 1.8135, "grad_norm": 0.4069509208202362, "learning_rate": 0.0002, "epoch": 0.3991130820399113, "step": 180}, {"loss": 1.8429, "grad_norm": 0.47832027077674866, "learning_rate": 0.0002, "epoch": 0.4212860310421286, "step": 190}, {"loss": 1.781, "grad_norm": 0.3376411199569702, "learning_rate": 0.0002, "epoch": 0.4434589800443459, "step": 200}, {"loss": 1.8562, "grad_norm": 0.3787185847759247, "learning_rate": 0.0002, "epoch": 0.4656319290465632, "step": 210}, {"loss": 1.8352, "grad_norm": 0.40322697162628174, "learning_rate": 0.0002, "epoch": 0.4878048780487805, "step": 220}, {"loss": 1.8007, "grad_norm": 0.3710436522960663, "learning_rate": 0.0002, "epoch": 0.5099778270509978, "step": 230}, {"loss": 1.8528, "grad_norm": 0.3723200261592865, "learning_rate": 0.0002, "epoch": 0.532150776053215, "step": 240}, {"loss": 1.852, "grad_norm": 0.3457179069519043, "learning_rate": 0.0002, "epoch": 0.5543237250554324, "step": 250}, {"loss": 1.8175, "grad_norm": 0.35369473695755005, "learning_rate": 0.0002, "epoch": 0.5764966740576497, "step": 260}, {"loss": 1.7742, "grad_norm": 0.3667483329772949, "learning_rate": 0.0002, "epoch": 0.5986696230598669, "step": 270}, {"loss": 1.8152, "grad_norm": 0.4023273289203644, "learning_rate": 0.0002, "epoch": 0.6208425720620843, "step": 280}, {"loss": 1.8382, "grad_norm": 0.3601929843425751, "learning_rate": 0.0002, "epoch": 0.6430155210643016, "step": 290}, {"loss": 1.8152, "grad_norm": 0.32610392570495605, "learning_rate": 0.0002, "epoch": 0.6651884700665188, "step": 300}, {"loss": 1.8412, "grad_norm": 0.40528756380081177, "learning_rate": 0.0002, "epoch": 0.6873614190687362, "step": 310}, {"loss": 1.8528, "grad_norm": 0.34639739990234375, "learning_rate": 0.0002, "epoch": 0.7095343680709535, "step": 320}, {"loss": 1.807, "grad_norm": 0.3794991374015808, "learning_rate": 0.0002, "epoch": 0.7317073170731707, "step": 330}, {"loss": 1.8678, "grad_norm": 0.34203875064849854, "learning_rate": 0.0002, "epoch": 0.753880266075388, "step": 340}, {"loss": 1.7605, "grad_norm": 0.36692821979522705, "learning_rate": 0.0002, "epoch": 0.7760532150776053, "step": 350}, {"loss": 1.7535, "grad_norm": 0.3701125979423523, "learning_rate": 0.0002, "epoch": 0.7982261640798226, "step": 360}, {"loss": 1.7574, "grad_norm": 0.3971416652202606, "learning_rate": 0.0002, "epoch": 0.8203991130820399, "step": 370}, {"loss": 1.8476, "grad_norm": 0.3751989006996155, "learning_rate": 0.0002, "epoch": 0.8425720620842572, "step": 380}, {"loss": 1.8281, "grad_norm": 0.35116496682167053, "learning_rate": 0.0002, "epoch": 0.8647450110864745, "step": 390}, {"loss": 1.7459, "grad_norm": 0.3672674894332886, "learning_rate": 0.0002, "epoch": 0.8869179600886918, "step": 400}, {"loss": 1.8293, "grad_norm": 0.34648260474205017, "learning_rate": 0.0002, "epoch": 0.9090909090909091, "step": 410}, {"loss": 1.7601, "grad_norm": 0.4497389793395996, "learning_rate": 0.0002, "epoch": 0.9312638580931264, "step": 420}, {"loss": 1.8405, "grad_norm": 0.33595147728919983, "learning_rate": 0.0002, "epoch": 0.9534368070953437, "step": 430}, {"loss": 1.8137, "grad_norm": 0.3130456805229187, "learning_rate": 0.0002, "epoch": 0.975609756097561, "step": 440}, {"loss": 1.8226, "grad_norm": 0.36480239033699036, "learning_rate": 0.0002, "epoch": 0.9977827050997783, "step": 450}, {"eval_loss": 1.8323718309402466, "eval_runtime": 79.9603, "eval_samples_per_second": 6.441, "eval_steps_per_second": 0.813, "epoch": 1.0, "step": 451}, {"loss": 1.7921, "grad_norm": 0.3840029835700989, "learning_rate": 0.0002, "epoch": 1.0199556541019956, "step": 460}, {"loss": 1.7057, "grad_norm": 0.33457425236701965, "learning_rate": 0.0002, "epoch": 1.042128603104213, "step": 470}, {"loss": 1.7028, "grad_norm": 0.35766592621803284, "learning_rate": 0.0002, "epoch": 1.06430155210643, "step": 480}, {"loss": 1.7868, "grad_norm": 0.38070937991142273, "learning_rate": 0.0002, "epoch": 1.0864745011086474, "step": 490}, {"loss": 1.72, "grad_norm": 0.38546547293663025, "learning_rate": 0.0002, "epoch": 1.1086474501108647, "step": 500}, {"loss": 1.7088, "grad_norm": 0.384104460477829, "learning_rate": 0.0002, "epoch": 1.130820399113082, "step": 510}, {"loss": 1.7779, "grad_norm": 0.3556116819381714, "learning_rate": 0.0002, "epoch": 1.1529933481152994, "step": 520}, {"loss": 1.7538, "grad_norm": 0.4110541343688965, "learning_rate": 0.0002, "epoch": 1.1751662971175167, "step": 530}, {"loss": 1.7608, "grad_norm": 0.46503177285194397, "learning_rate": 0.0002, "epoch": 1.1973392461197339, "step": 540}, {"loss": 1.729, "grad_norm": 0.4366816580295563, "learning_rate": 0.0002, "epoch": 1.2195121951219512, "step": 550}, {"loss": 1.7954, "grad_norm": 0.379986047744751, "learning_rate": 0.0002, "epoch": 1.2416851441241685, "step": 560}, {"loss": 1.6852, "grad_norm": 0.3920869529247284, "learning_rate": 0.0002, "epoch": 1.2638580931263859, "step": 570}, {"loss": 1.8265, "grad_norm": 0.4013986587524414, "learning_rate": 0.0002, "epoch": 1.2860310421286032, "step": 580}, {"loss": 1.7294, "grad_norm": 0.39104390144348145, "learning_rate": 0.0002, "epoch": 1.3082039911308203, "step": 590}, {"loss": 1.7822, "grad_norm": 0.40515613555908203, "learning_rate": 0.0002, "epoch": 1.3303769401330376, "step": 600}, {"loss": 1.7614, "grad_norm": 0.4212331473827362, "learning_rate": 0.0002, "epoch": 1.352549889135255, "step": 610}, {"loss": 1.7883, "grad_norm": 0.36040815711021423, "learning_rate": 0.0002, "epoch": 1.3747228381374723, "step": 620}, {"loss": 1.7467, "grad_norm": 0.3950865864753723, "learning_rate": 0.0002, "epoch": 1.3968957871396896, "step": 630}, {"loss": 1.7242, "grad_norm": 0.3934709131717682, "learning_rate": 0.0002, "epoch": 1.4190687361419068, "step": 640}, {"loss": 1.7783, "grad_norm": 0.3905350863933563, "learning_rate": 0.0002, "epoch": 1.441241685144124, "step": 650}, {"loss": 1.7612, "grad_norm": 0.4322686493396759, "learning_rate": 0.0002, "epoch": 1.4634146341463414, "step": 660}, {"loss": 1.7357, "grad_norm": 0.35697034001350403, "learning_rate": 0.0002, "epoch": 1.4855875831485588, "step": 670}, {"loss": 1.6764, "grad_norm": 0.38570451736450195, "learning_rate": 0.0002, "epoch": 1.507760532150776, "step": 680}, {"loss": 1.7054, "grad_norm": 0.3804517090320587, "learning_rate": 0.0002, "epoch": 1.5299334811529932, "step": 690}, {"loss": 1.7725, "grad_norm": 0.4938165247440338, "learning_rate": 0.0002, "epoch": 1.5521064301552108, "step": 700}, {"loss": 1.7982, "grad_norm": 0.43075236678123474, "learning_rate": 0.0002, "epoch": 1.5742793791574279, "step": 710}, {"loss": 1.7741, "grad_norm": 0.40434643626213074, "learning_rate": 0.0002, "epoch": 1.5964523281596452, "step": 720}, {"loss": 1.7122, "grad_norm": 0.3874157667160034, "learning_rate": 0.0002, "epoch": 1.6186252771618626, "step": 730}, {"loss": 1.8141, "grad_norm": 0.3645969331264496, "learning_rate": 0.0002, "epoch": 1.6407982261640797, "step": 740}, {"loss": 1.7868, "grad_norm": 0.38588255643844604, "learning_rate": 0.0002, "epoch": 1.6629711751662972, "step": 750}, {"loss": 1.7847, "grad_norm": 0.39252519607543945, "learning_rate": 0.0002, "epoch": 1.6851441241685143, "step": 760}, {"loss": 1.7424, "grad_norm": 0.36165162920951843, "learning_rate": 0.0002, "epoch": 1.7073170731707317, "step": 770}, {"loss": 1.7172, "grad_norm": 0.36677947640419006, "learning_rate": 0.0002, "epoch": 1.729490022172949, "step": 780}, {"loss": 1.7737, "grad_norm": 0.374881774187088, "learning_rate": 0.0002, "epoch": 1.7516629711751663, "step": 790}, {"loss": 1.7331, "grad_norm": 0.4530802369117737, "learning_rate": 0.0002, "epoch": 1.7738359201773837, "step": 800}, {"loss": 1.7799, "grad_norm": 0.3879568576812744, "learning_rate": 0.0002, "epoch": 1.7960088691796008, "step": 810}, {"loss": 1.7562, "grad_norm": 0.3710079789161682, "learning_rate": 0.0002, "epoch": 1.8181818181818183, "step": 820}, {"loss": 1.7728, "grad_norm": 0.3831799030303955, "learning_rate": 0.0002, "epoch": 1.8403547671840355, "step": 830}, {"loss": 1.7605, "grad_norm": 0.3958432376384735, "learning_rate": 0.0002, "epoch": 1.8625277161862528, "step": 840}, {"loss": 1.8187, "grad_norm": 0.4129294157028198, "learning_rate": 0.0002, "epoch": 1.8847006651884701, "step": 850}, {"loss": 1.6506, "grad_norm": 0.3714745044708252, "learning_rate": 0.0002, "epoch": 1.9068736141906872, "step": 860}, {"loss": 1.707, "grad_norm": 0.40176868438720703, "learning_rate": 0.0002, "epoch": 1.9290465631929048, "step": 870}, {"loss": 1.7557, "grad_norm": 0.36937767267227173, "learning_rate": 0.0002, "epoch": 1.951219512195122, "step": 880}, {"loss": 1.7353, "grad_norm": 0.40242597460746765, "learning_rate": 0.0002, "epoch": 1.9733924611973392, "step": 890}, {"loss": 1.7402, "grad_norm": 0.3515510559082031, "learning_rate": 0.0002, "epoch": 1.9955654101995566, "step": 900}, {"eval_loss": 1.8243104219436646, "eval_runtime": 107.8856, "eval_samples_per_second": 4.774, "eval_steps_per_second": 0.602, "epoch": 2.0, "step": 902}, {"loss": 1.6631, "grad_norm": 0.4145216643810272, "learning_rate": 0.0002, "epoch": 2.0177383592017737, "step": 910}, {"loss": 1.7227, "grad_norm": 0.42423519492149353, "learning_rate": 0.0002, "epoch": 2.0399113082039912, "step": 920}, {"loss": 1.6357, "grad_norm": 0.4773229956626892, "learning_rate": 0.0002, "epoch": 2.0620842572062084, "step": 930}, {"loss": 1.671, "grad_norm": 0.4144791066646576, "learning_rate": 0.0002, "epoch": 2.084257206208426, "step": 940}, {"loss": 1.6433, "grad_norm": 0.42704132199287415, "learning_rate": 0.0002, "epoch": 2.106430155210643, "step": 950}, {"loss": 1.6767, "grad_norm": 0.4479042589664459, "learning_rate": 0.0002, "epoch": 2.12860310421286, "step": 960}, {"loss": 1.6122, "grad_norm": 0.4810638129711151, "learning_rate": 0.0002, "epoch": 2.1507760532150777, "step": 970}, {"loss": 1.6613, "grad_norm": 0.48669910430908203, "learning_rate": 0.0002, "epoch": 2.172949002217295, "step": 980}, {"loss": 1.6274, "grad_norm": 0.4252761900424957, "learning_rate": 0.0002, "epoch": 2.1951219512195124, "step": 990}, {"loss": 1.6514, "grad_norm": 0.42342790961265564, "learning_rate": 0.0002, "epoch": 2.2172949002217295, "step": 1000}, {"loss": 1.637, "grad_norm": 0.43432456254959106, "learning_rate": 0.0002, "epoch": 2.2394678492239466, "step": 1010}, {"loss": 1.6856, "grad_norm": 0.45556965470314026, "learning_rate": 0.0002, "epoch": 2.261640798226164, "step": 1020}, {"loss": 1.6554, "grad_norm": 0.48035719990730286, "learning_rate": 0.0002, "epoch": 2.2838137472283813, "step": 1030}, {"loss": 1.6947, "grad_norm": 0.4233241081237793, "learning_rate": 0.0002, "epoch": 2.305986696230599, "step": 1040}, {"loss": 1.6501, "grad_norm": 0.3918434679508209, "learning_rate": 0.0002, "epoch": 2.328159645232816, "step": 1050}, {"loss": 1.6903, "grad_norm": 0.44049757719039917, "learning_rate": 0.0002, "epoch": 2.3503325942350335, "step": 1060}, {"loss": 1.6337, "grad_norm": 0.4730056822299957, "learning_rate": 0.0002, "epoch": 2.3725055432372506, "step": 1070}, {"loss": 1.6104, "grad_norm": 0.4354589581489563, "learning_rate": 0.0002, "epoch": 2.3946784922394677, "step": 1080}, {"loss": 1.6496, "grad_norm": 0.4837590456008911, "learning_rate": 0.0002, "epoch": 2.4168514412416853, "step": 1090}, {"loss": 1.6672, "grad_norm": 0.4842571020126343, "learning_rate": 0.0002, "epoch": 2.4390243902439024, "step": 1100}, {"loss": 1.6091, "grad_norm": 0.46398279070854187, "learning_rate": 0.0002, "epoch": 2.4611973392461195, "step": 1110}, {"loss": 1.6393, "grad_norm": 0.4587327539920807, "learning_rate": 0.0002, "epoch": 2.483370288248337, "step": 1120}, {"loss": 1.621, "grad_norm": 0.4336528480052948, "learning_rate": 0.0002, "epoch": 2.505543237250554, "step": 1130}, {"loss": 1.6199, "grad_norm": 0.6162153482437134, "learning_rate": 0.0002, "epoch": 2.5277161862527717, "step": 1140}, {"loss": 1.7244, "grad_norm": 0.48175573348999023, "learning_rate": 0.0002, "epoch": 2.549889135254989, "step": 1150}, {"loss": 1.6098, "grad_norm": 0.448272705078125, "learning_rate": 0.0002, "epoch": 2.5720620842572064, "step": 1160}, {"loss": 1.6987, "grad_norm": 0.5189200639724731, "learning_rate": 0.0002, "epoch": 2.5942350332594235, "step": 1170}, {"loss": 1.6503, "grad_norm": 0.45032963156700134, "learning_rate": 0.0002, "epoch": 2.6164079822616406, "step": 1180}, {"loss": 1.6508, "grad_norm": 0.4417729377746582, "learning_rate": 0.0002, "epoch": 2.638580931263858, "step": 1190}, {"loss": 1.6084, "grad_norm": 0.5219636559486389, "learning_rate": 0.0002, "epoch": 2.6607538802660753, "step": 1200}, {"loss": 1.6121, "grad_norm": 0.47702011466026306, "learning_rate": 0.0002, "epoch": 2.682926829268293, "step": 1210}, {"loss": 1.6942, "grad_norm": 0.4328458607196808, "learning_rate": 0.0002, "epoch": 2.70509977827051, "step": 1220}, {"loss": 1.6099, "grad_norm": 0.46762076020240784, "learning_rate": 0.0002, "epoch": 2.7272727272727275, "step": 1230}, {"loss": 1.7287, "grad_norm": 0.4592697322368622, "learning_rate": 0.0002, "epoch": 2.7494456762749446, "step": 1240}, {"loss": 1.617, "grad_norm": 0.5519265532493591, "learning_rate": 0.0002, "epoch": 2.7716186252771617, "step": 1250}, {"loss": 1.6868, "grad_norm": 0.47169506549835205, "learning_rate": 0.0002, "epoch": 2.7937915742793793, "step": 1260}, {"loss": 1.658, "grad_norm": 0.47231653332710266, "learning_rate": 0.0002, "epoch": 2.8159645232815964, "step": 1270}, {"loss": 1.6738, "grad_norm": 0.49081969261169434, "learning_rate": 0.0002, "epoch": 2.8381374722838135, "step": 1280}, {"loss": 1.7248, "grad_norm": 0.4483231008052826, "learning_rate": 0.0002, "epoch": 2.860310421286031, "step": 1290}, {"loss": 1.6428, "grad_norm": 0.5310035943984985, "learning_rate": 0.0002, "epoch": 2.882483370288248, "step": 1300}, {"loss": 1.6515, "grad_norm": 0.4419795572757721, "learning_rate": 0.0002, "epoch": 2.9046563192904657, "step": 1310}, {"loss": 1.6956, "grad_norm": 0.44630762934684753, "learning_rate": 0.0002, "epoch": 2.926829268292683, "step": 1320}, {"loss": 1.6464, "grad_norm": 0.39774850010871887, "learning_rate": 0.0002, "epoch": 2.9490022172949004, "step": 1330}, {"loss": 1.7007, "grad_norm": 0.441727876663208, "learning_rate": 0.0002, "epoch": 2.9711751662971175, "step": 1340}, {"loss": 1.5968, "grad_norm": 0.43773892521858215, "learning_rate": 0.0002, "epoch": 2.9933481152993346, "step": 1350}, {"eval_loss": 1.8514760732650757, "eval_runtime": 131.9812, "eval_samples_per_second": 3.902, "eval_steps_per_second": 0.492, "epoch": 3.0, "step": 1353}, {"loss": 1.553, "grad_norm": 0.5274150371551514, "learning_rate": 0.0002, "epoch": 3.015521064301552, "step": 1360}, {"loss": 1.4784, "grad_norm": 0.5724489092826843, "learning_rate": 0.0002, "epoch": 3.0376940133037693, "step": 1370}, {"loss": 1.5365, "grad_norm": 0.6182316541671753, "learning_rate": 0.0002, "epoch": 3.059866962305987, "step": 1380}, {"loss": 1.4824, "grad_norm": 0.5709688067436218, "learning_rate": 0.0002, "epoch": 3.082039911308204, "step": 1390}, {"loss": 1.534, "grad_norm": 0.6368464231491089, "learning_rate": 0.0002, "epoch": 3.104212860310421, "step": 1400}, {"loss": 1.5191, "grad_norm": 0.5680432319641113, "learning_rate": 0.0002, "epoch": 3.1263858093126387, "step": 1410}, {"loss": 1.5258, "grad_norm": 0.5805315375328064, "learning_rate": 0.0002, "epoch": 3.1485587583148558, "step": 1420}, {"loss": 1.612, "grad_norm": 0.5782836675643921, "learning_rate": 0.0002, "epoch": 3.1707317073170733, "step": 1430}, {"loss": 1.4852, "grad_norm": 0.627159595489502, "learning_rate": 0.0002, "epoch": 3.1929046563192904, "step": 1440}, {"loss": 1.5398, "grad_norm": 0.6136751174926758, "learning_rate": 0.0002, "epoch": 3.2150776053215075, "step": 1450}, {"loss": 1.5254, "grad_norm": 0.6319093108177185, "learning_rate": 0.0002, "epoch": 3.237250554323725, "step": 1460}, {"loss": 1.5789, "grad_norm": 0.7641780972480774, "learning_rate": 0.0002, "epoch": 3.259423503325942, "step": 1470}, {"loss": 1.5514, "grad_norm": 0.6116001605987549, "learning_rate": 0.0002, "epoch": 3.2815964523281598, "step": 1480}, {"loss": 1.4647, "grad_norm": 0.6024722456932068, "learning_rate": 0.0002, "epoch": 3.303769401330377, "step": 1490}, {"loss": 1.5561, "grad_norm": 0.5941570997238159, "learning_rate": 0.0002, "epoch": 3.3259423503325944, "step": 1500}, {"loss": 1.5104, "grad_norm": 0.608369767665863, "learning_rate": 0.0002, "epoch": 3.3481152993348116, "step": 1510}, {"loss": 1.5494, "grad_norm": 0.5942065715789795, "learning_rate": 0.0002, "epoch": 3.3702882483370287, "step": 1520}, {"loss": 1.5426, "grad_norm": 0.6382330656051636, "learning_rate": 0.0002, "epoch": 3.3924611973392462, "step": 1530}, {"loss": 1.5479, "grad_norm": 0.5839648842811584, "learning_rate": 0.0002, "epoch": 3.4146341463414633, "step": 1540}, {"loss": 1.5241, "grad_norm": 0.5627358555793762, "learning_rate": 0.0002, "epoch": 3.436807095343681, "step": 1550}, {"loss": 1.5679, "grad_norm": 0.6342151761054993, "learning_rate": 0.0002, "epoch": 3.458980044345898, "step": 1560}, {"loss": 1.5005, "grad_norm": 0.6370542645454407, "learning_rate": 0.0002, "epoch": 3.481152993348115, "step": 1570}, {"loss": 1.541, "grad_norm": 0.5974680185317993, "learning_rate": 0.0002, "epoch": 3.5033259423503327, "step": 1580}, {"loss": 1.553, "grad_norm": 0.6197021007537842, "learning_rate": 0.0002, "epoch": 3.52549889135255, "step": 1590}, {"loss": 1.5287, "grad_norm": 0.6413024067878723, "learning_rate": 0.0002, "epoch": 3.5476718403547673, "step": 1600}, {"loss": 1.5301, "grad_norm": 0.5878410339355469, "learning_rate": 0.0002, "epoch": 3.5698447893569845, "step": 1610}, {"loss": 1.4625, "grad_norm": 0.6485083103179932, "learning_rate": 0.0002, "epoch": 3.5920177383592016, "step": 1620}, {"loss": 1.5373, "grad_norm": 0.5826634764671326, "learning_rate": 0.0002, "epoch": 3.614190687361419, "step": 1630}, {"loss": 1.4952, "grad_norm": 0.8906663656234741, "learning_rate": 0.0002, "epoch": 3.6363636363636362, "step": 1640}, {"loss": 1.5208, "grad_norm": 0.6288479566574097, "learning_rate": 0.0002, "epoch": 3.658536585365854, "step": 1650}, {"loss": 1.6086, "grad_norm": 0.6191049218177795, "learning_rate": 0.0002, "epoch": 3.680709534368071, "step": 1660}, {"loss": 1.5043, "grad_norm": 0.5997978448867798, "learning_rate": 0.0002, "epoch": 3.7028824833702885, "step": 1670}, {"loss": 1.5654, "grad_norm": 0.6003038287162781, "learning_rate": 0.0002, "epoch": 3.7250554323725056, "step": 1680}, {"loss": 1.4941, "grad_norm": 0.5417194962501526, "learning_rate": 0.0002, "epoch": 3.7472283813747227, "step": 1690}, {"loss": 1.5541, "grad_norm": 0.6367442607879639, "learning_rate": 0.0002, "epoch": 3.7694013303769403, "step": 1700}, {"loss": 1.5483, "grad_norm": 0.6613120436668396, "learning_rate": 0.0002, "epoch": 3.7915742793791574, "step": 1710}, {"loss": 1.5999, "grad_norm": 0.6506749391555786, "learning_rate": 0.0002, "epoch": 3.8137472283813745, "step": 1720}, {"loss": 1.5207, "grad_norm": 0.5478500723838806, "learning_rate": 0.0002, "epoch": 3.835920177383592, "step": 1730}, {"loss": 1.5619, "grad_norm": 0.7313215732574463, "learning_rate": 0.0002, "epoch": 3.858093126385809, "step": 1740}, {"loss": 1.4486, "grad_norm": 0.5453857183456421, "learning_rate": 0.0002, "epoch": 3.8802660753880267, "step": 1750}, {"loss": 1.4857, "grad_norm": 0.5983547568321228, "learning_rate": 0.0002, "epoch": 3.902439024390244, "step": 1760}, {"loss": 1.651, "grad_norm": 0.6471580266952515, "learning_rate": 0.0002, "epoch": 3.9246119733924614, "step": 1770}, {"loss": 1.461, "grad_norm": 0.5833685398101807, "learning_rate": 0.0002, "epoch": 3.9467849223946785, "step": 1780}, {"loss": 1.5014, "grad_norm": 0.5509327054023743, "learning_rate": 0.0002, "epoch": 3.9689578713968956, "step": 1790}, {"loss": 1.6225, "grad_norm": 0.6021352410316467, "learning_rate": 0.0002, "epoch": 3.991130820399113, "step": 1800}]} +{"epoch": 5.0, "step": 2255, "epoch_duration": 1176.9944846630096, "total_accumulated_duration": 6183.478788137436, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.684, "grad_norm": 0.6454975008964539, "learning_rate": 0.0002, "epoch": 0.022172949002217297, "step": 10}, {"loss": 2.2694, "grad_norm": 0.5452715158462524, "learning_rate": 0.0002, "epoch": 0.04434589800443459, "step": 20}, {"loss": 2.0796, "grad_norm": 0.5502195358276367, "learning_rate": 0.0002, "epoch": 0.06651884700665188, "step": 30}, {"loss": 1.9132, "grad_norm": 0.48551198840141296, "learning_rate": 0.0002, "epoch": 0.08869179600886919, "step": 40}, {"loss": 2.016, "grad_norm": 0.47822514176368713, "learning_rate": 0.0002, "epoch": 0.11086474501108648, "step": 50}, {"loss": 1.9455, "grad_norm": 0.5125395655632019, "learning_rate": 0.0002, "epoch": 0.13303769401330376, "step": 60}, {"loss": 1.9017, "grad_norm": 0.4600693881511688, "learning_rate": 0.0002, "epoch": 0.15521064301552107, "step": 70}, {"loss": 2.0057, "grad_norm": 0.7180814743041992, "learning_rate": 0.0002, "epoch": 0.17738359201773837, "step": 80}, {"loss": 1.8925, "grad_norm": 0.4712974429130554, "learning_rate": 0.0002, "epoch": 0.19955654101995565, "step": 90}, {"loss": 1.8386, "grad_norm": 0.4673261344432831, "learning_rate": 0.0002, "epoch": 0.22172949002217296, "step": 100}, {"loss": 1.9346, "grad_norm": 0.4129070043563843, "learning_rate": 0.0002, "epoch": 0.24390243902439024, "step": 110}, {"loss": 1.9018, "grad_norm": 0.3859104812145233, "learning_rate": 0.0002, "epoch": 0.2660753880266075, "step": 120}, {"loss": 1.8922, "grad_norm": 0.40966713428497314, "learning_rate": 0.0002, "epoch": 0.28824833702882485, "step": 130}, {"loss": 1.846, "grad_norm": 0.3685867488384247, "learning_rate": 0.0002, "epoch": 0.31042128603104213, "step": 140}, {"loss": 1.9017, "grad_norm": 0.39279988408088684, "learning_rate": 0.0002, "epoch": 0.3325942350332594, "step": 150}, {"loss": 1.8556, "grad_norm": 0.4195398986339569, "learning_rate": 0.0002, "epoch": 0.35476718403547675, "step": 160}, {"loss": 1.7883, "grad_norm": 0.469802588224411, "learning_rate": 0.0002, "epoch": 0.376940133037694, "step": 170}, {"loss": 1.8135, "grad_norm": 0.4069509208202362, "learning_rate": 0.0002, "epoch": 0.3991130820399113, "step": 180}, {"loss": 1.8429, "grad_norm": 0.47832027077674866, "learning_rate": 0.0002, "epoch": 0.4212860310421286, "step": 190}, {"loss": 1.781, "grad_norm": 0.3376411199569702, "learning_rate": 0.0002, "epoch": 0.4434589800443459, "step": 200}, {"loss": 1.8562, "grad_norm": 0.3787185847759247, "learning_rate": 0.0002, "epoch": 0.4656319290465632, "step": 210}, {"loss": 1.8352, "grad_norm": 0.40322697162628174, "learning_rate": 0.0002, "epoch": 0.4878048780487805, "step": 220}, {"loss": 1.8007, "grad_norm": 0.3710436522960663, "learning_rate": 0.0002, "epoch": 0.5099778270509978, "step": 230}, {"loss": 1.8528, "grad_norm": 0.3723200261592865, "learning_rate": 0.0002, "epoch": 0.532150776053215, "step": 240}, {"loss": 1.852, "grad_norm": 0.3457179069519043, "learning_rate": 0.0002, "epoch": 0.5543237250554324, "step": 250}, {"loss": 1.8175, "grad_norm": 0.35369473695755005, "learning_rate": 0.0002, "epoch": 0.5764966740576497, "step": 260}, {"loss": 1.7742, "grad_norm": 0.3667483329772949, "learning_rate": 0.0002, "epoch": 0.5986696230598669, "step": 270}, {"loss": 1.8152, "grad_norm": 0.4023273289203644, "learning_rate": 0.0002, "epoch": 0.6208425720620843, "step": 280}, {"loss": 1.8382, "grad_norm": 0.3601929843425751, "learning_rate": 0.0002, "epoch": 0.6430155210643016, "step": 290}, {"loss": 1.8152, "grad_norm": 0.32610392570495605, "learning_rate": 0.0002, "epoch": 0.6651884700665188, "step": 300}, {"loss": 1.8412, "grad_norm": 0.40528756380081177, "learning_rate": 0.0002, "epoch": 0.6873614190687362, "step": 310}, {"loss": 1.8528, "grad_norm": 0.34639739990234375, "learning_rate": 0.0002, "epoch": 0.7095343680709535, "step": 320}, {"loss": 1.807, "grad_norm": 0.3794991374015808, "learning_rate": 0.0002, "epoch": 0.7317073170731707, "step": 330}, {"loss": 1.8678, "grad_norm": 0.34203875064849854, "learning_rate": 0.0002, "epoch": 0.753880266075388, "step": 340}, {"loss": 1.7605, "grad_norm": 0.36692821979522705, "learning_rate": 0.0002, "epoch": 0.7760532150776053, "step": 350}, {"loss": 1.7535, "grad_norm": 0.3701125979423523, "learning_rate": 0.0002, "epoch": 0.7982261640798226, "step": 360}, {"loss": 1.7574, "grad_norm": 0.3971416652202606, "learning_rate": 0.0002, "epoch": 0.8203991130820399, "step": 370}, {"loss": 1.8476, "grad_norm": 0.3751989006996155, "learning_rate": 0.0002, "epoch": 0.8425720620842572, "step": 380}, {"loss": 1.8281, "grad_norm": 0.35116496682167053, "learning_rate": 0.0002, "epoch": 0.8647450110864745, "step": 390}, {"loss": 1.7459, "grad_norm": 0.3672674894332886, "learning_rate": 0.0002, "epoch": 0.8869179600886918, "step": 400}, {"loss": 1.8293, "grad_norm": 0.34648260474205017, "learning_rate": 0.0002, "epoch": 0.9090909090909091, "step": 410}, {"loss": 1.7601, "grad_norm": 0.4497389793395996, "learning_rate": 0.0002, "epoch": 0.9312638580931264, "step": 420}, {"loss": 1.8405, "grad_norm": 0.33595147728919983, "learning_rate": 0.0002, "epoch": 0.9534368070953437, "step": 430}, {"loss": 1.8137, "grad_norm": 0.3130456805229187, "learning_rate": 0.0002, "epoch": 0.975609756097561, "step": 440}, {"loss": 1.8226, "grad_norm": 0.36480239033699036, "learning_rate": 0.0002, "epoch": 0.9977827050997783, "step": 450}, {"eval_loss": 1.8323718309402466, "eval_runtime": 79.9603, "eval_samples_per_second": 6.441, "eval_steps_per_second": 0.813, "epoch": 1.0, "step": 451}, {"loss": 1.7921, "grad_norm": 0.3840029835700989, "learning_rate": 0.0002, "epoch": 1.0199556541019956, "step": 460}, {"loss": 1.7057, "grad_norm": 0.33457425236701965, "learning_rate": 0.0002, "epoch": 1.042128603104213, "step": 470}, {"loss": 1.7028, "grad_norm": 0.35766592621803284, "learning_rate": 0.0002, "epoch": 1.06430155210643, "step": 480}, {"loss": 1.7868, "grad_norm": 0.38070937991142273, "learning_rate": 0.0002, "epoch": 1.0864745011086474, "step": 490}, {"loss": 1.72, "grad_norm": 0.38546547293663025, "learning_rate": 0.0002, "epoch": 1.1086474501108647, "step": 500}, {"loss": 1.7088, "grad_norm": 0.384104460477829, "learning_rate": 0.0002, "epoch": 1.130820399113082, "step": 510}, {"loss": 1.7779, "grad_norm": 0.3556116819381714, "learning_rate": 0.0002, "epoch": 1.1529933481152994, "step": 520}, {"loss": 1.7538, "grad_norm": 0.4110541343688965, "learning_rate": 0.0002, "epoch": 1.1751662971175167, "step": 530}, {"loss": 1.7608, "grad_norm": 0.46503177285194397, "learning_rate": 0.0002, "epoch": 1.1973392461197339, "step": 540}, {"loss": 1.729, "grad_norm": 0.4366816580295563, "learning_rate": 0.0002, "epoch": 1.2195121951219512, "step": 550}, {"loss": 1.7954, "grad_norm": 0.379986047744751, "learning_rate": 0.0002, "epoch": 1.2416851441241685, "step": 560}, {"loss": 1.6852, "grad_norm": 0.3920869529247284, "learning_rate": 0.0002, "epoch": 1.2638580931263859, "step": 570}, {"loss": 1.8265, "grad_norm": 0.4013986587524414, "learning_rate": 0.0002, "epoch": 1.2860310421286032, "step": 580}, {"loss": 1.7294, "grad_norm": 0.39104390144348145, "learning_rate": 0.0002, "epoch": 1.3082039911308203, "step": 590}, {"loss": 1.7822, "grad_norm": 0.40515613555908203, "learning_rate": 0.0002, "epoch": 1.3303769401330376, "step": 600}, {"loss": 1.7614, "grad_norm": 0.4212331473827362, "learning_rate": 0.0002, "epoch": 1.352549889135255, "step": 610}, {"loss": 1.7883, "grad_norm": 0.36040815711021423, "learning_rate": 0.0002, "epoch": 1.3747228381374723, "step": 620}, {"loss": 1.7467, "grad_norm": 0.3950865864753723, "learning_rate": 0.0002, "epoch": 1.3968957871396896, "step": 630}, {"loss": 1.7242, "grad_norm": 0.3934709131717682, "learning_rate": 0.0002, "epoch": 1.4190687361419068, "step": 640}, {"loss": 1.7783, "grad_norm": 0.3905350863933563, "learning_rate": 0.0002, "epoch": 1.441241685144124, "step": 650}, {"loss": 1.7612, "grad_norm": 0.4322686493396759, "learning_rate": 0.0002, "epoch": 1.4634146341463414, "step": 660}, {"loss": 1.7357, "grad_norm": 0.35697034001350403, "learning_rate": 0.0002, "epoch": 1.4855875831485588, "step": 670}, {"loss": 1.6764, "grad_norm": 0.38570451736450195, "learning_rate": 0.0002, "epoch": 1.507760532150776, "step": 680}, {"loss": 1.7054, "grad_norm": 0.3804517090320587, "learning_rate": 0.0002, "epoch": 1.5299334811529932, "step": 690}, {"loss": 1.7725, "grad_norm": 0.4938165247440338, "learning_rate": 0.0002, "epoch": 1.5521064301552108, "step": 700}, {"loss": 1.7982, "grad_norm": 0.43075236678123474, "learning_rate": 0.0002, "epoch": 1.5742793791574279, "step": 710}, {"loss": 1.7741, "grad_norm": 0.40434643626213074, "learning_rate": 0.0002, "epoch": 1.5964523281596452, "step": 720}, {"loss": 1.7122, "grad_norm": 0.3874157667160034, "learning_rate": 0.0002, "epoch": 1.6186252771618626, "step": 730}, {"loss": 1.8141, "grad_norm": 0.3645969331264496, "learning_rate": 0.0002, "epoch": 1.6407982261640797, "step": 740}, {"loss": 1.7868, "grad_norm": 0.38588255643844604, "learning_rate": 0.0002, "epoch": 1.6629711751662972, "step": 750}, {"loss": 1.7847, "grad_norm": 0.39252519607543945, "learning_rate": 0.0002, "epoch": 1.6851441241685143, "step": 760}, {"loss": 1.7424, "grad_norm": 0.36165162920951843, "learning_rate": 0.0002, "epoch": 1.7073170731707317, "step": 770}, {"loss": 1.7172, "grad_norm": 0.36677947640419006, "learning_rate": 0.0002, "epoch": 1.729490022172949, "step": 780}, {"loss": 1.7737, "grad_norm": 0.374881774187088, "learning_rate": 0.0002, "epoch": 1.7516629711751663, "step": 790}, {"loss": 1.7331, "grad_norm": 0.4530802369117737, "learning_rate": 0.0002, "epoch": 1.7738359201773837, "step": 800}, {"loss": 1.7799, "grad_norm": 0.3879568576812744, "learning_rate": 0.0002, "epoch": 1.7960088691796008, "step": 810}, {"loss": 1.7562, "grad_norm": 0.3710079789161682, "learning_rate": 0.0002, "epoch": 1.8181818181818183, "step": 820}, {"loss": 1.7728, "grad_norm": 0.3831799030303955, "learning_rate": 0.0002, "epoch": 1.8403547671840355, "step": 830}, {"loss": 1.7605, "grad_norm": 0.3958432376384735, "learning_rate": 0.0002, "epoch": 1.8625277161862528, "step": 840}, {"loss": 1.8187, "grad_norm": 0.4129294157028198, "learning_rate": 0.0002, "epoch": 1.8847006651884701, "step": 850}, {"loss": 1.6506, "grad_norm": 0.3714745044708252, "learning_rate": 0.0002, "epoch": 1.9068736141906872, "step": 860}, {"loss": 1.707, "grad_norm": 0.40176868438720703, "learning_rate": 0.0002, "epoch": 1.9290465631929048, "step": 870}, {"loss": 1.7557, "grad_norm": 0.36937767267227173, "learning_rate": 0.0002, "epoch": 1.951219512195122, "step": 880}, {"loss": 1.7353, "grad_norm": 0.40242597460746765, "learning_rate": 0.0002, "epoch": 1.9733924611973392, "step": 890}, {"loss": 1.7402, "grad_norm": 0.3515510559082031, "learning_rate": 0.0002, "epoch": 1.9955654101995566, "step": 900}, {"eval_loss": 1.8243104219436646, "eval_runtime": 107.8856, "eval_samples_per_second": 4.774, "eval_steps_per_second": 0.602, "epoch": 2.0, "step": 902}, {"loss": 1.6631, "grad_norm": 0.4145216643810272, "learning_rate": 0.0002, "epoch": 2.0177383592017737, "step": 910}, {"loss": 1.7227, "grad_norm": 0.42423519492149353, "learning_rate": 0.0002, "epoch": 2.0399113082039912, "step": 920}, {"loss": 1.6357, "grad_norm": 0.4773229956626892, "learning_rate": 0.0002, "epoch": 2.0620842572062084, "step": 930}, {"loss": 1.671, "grad_norm": 0.4144791066646576, "learning_rate": 0.0002, "epoch": 2.084257206208426, "step": 940}, {"loss": 1.6433, "grad_norm": 0.42704132199287415, "learning_rate": 0.0002, "epoch": 2.106430155210643, "step": 950}, {"loss": 1.6767, "grad_norm": 0.4479042589664459, "learning_rate": 0.0002, "epoch": 2.12860310421286, "step": 960}, {"loss": 1.6122, "grad_norm": 0.4810638129711151, "learning_rate": 0.0002, "epoch": 2.1507760532150777, "step": 970}, {"loss": 1.6613, "grad_norm": 0.48669910430908203, "learning_rate": 0.0002, "epoch": 2.172949002217295, "step": 980}, {"loss": 1.6274, "grad_norm": 0.4252761900424957, "learning_rate": 0.0002, "epoch": 2.1951219512195124, "step": 990}, {"loss": 1.6514, "grad_norm": 0.42342790961265564, "learning_rate": 0.0002, "epoch": 2.2172949002217295, "step": 1000}, {"loss": 1.637, "grad_norm": 0.43432456254959106, "learning_rate": 0.0002, "epoch": 2.2394678492239466, "step": 1010}, {"loss": 1.6856, "grad_norm": 0.45556965470314026, "learning_rate": 0.0002, "epoch": 2.261640798226164, "step": 1020}, {"loss": 1.6554, "grad_norm": 0.48035719990730286, "learning_rate": 0.0002, "epoch": 2.2838137472283813, "step": 1030}, {"loss": 1.6947, "grad_norm": 0.4233241081237793, "learning_rate": 0.0002, "epoch": 2.305986696230599, "step": 1040}, {"loss": 1.6501, "grad_norm": 0.3918434679508209, "learning_rate": 0.0002, "epoch": 2.328159645232816, "step": 1050}, {"loss": 1.6903, "grad_norm": 0.44049757719039917, "learning_rate": 0.0002, "epoch": 2.3503325942350335, "step": 1060}, {"loss": 1.6337, "grad_norm": 0.4730056822299957, "learning_rate": 0.0002, "epoch": 2.3725055432372506, "step": 1070}, {"loss": 1.6104, "grad_norm": 0.4354589581489563, "learning_rate": 0.0002, "epoch": 2.3946784922394677, "step": 1080}, {"loss": 1.6496, "grad_norm": 0.4837590456008911, "learning_rate": 0.0002, "epoch": 2.4168514412416853, "step": 1090}, {"loss": 1.6672, "grad_norm": 0.4842571020126343, "learning_rate": 0.0002, "epoch": 2.4390243902439024, "step": 1100}, {"loss": 1.6091, "grad_norm": 0.46398279070854187, "learning_rate": 0.0002, "epoch": 2.4611973392461195, "step": 1110}, {"loss": 1.6393, "grad_norm": 0.4587327539920807, "learning_rate": 0.0002, "epoch": 2.483370288248337, "step": 1120}, {"loss": 1.621, "grad_norm": 0.4336528480052948, "learning_rate": 0.0002, "epoch": 2.505543237250554, "step": 1130}, {"loss": 1.6199, "grad_norm": 0.6162153482437134, "learning_rate": 0.0002, "epoch": 2.5277161862527717, "step": 1140}, {"loss": 1.7244, "grad_norm": 0.48175573348999023, "learning_rate": 0.0002, "epoch": 2.549889135254989, "step": 1150}, {"loss": 1.6098, "grad_norm": 0.448272705078125, "learning_rate": 0.0002, "epoch": 2.5720620842572064, "step": 1160}, {"loss": 1.6987, "grad_norm": 0.5189200639724731, "learning_rate": 0.0002, "epoch": 2.5942350332594235, "step": 1170}, {"loss": 1.6503, "grad_norm": 0.45032963156700134, "learning_rate": 0.0002, "epoch": 2.6164079822616406, "step": 1180}, {"loss": 1.6508, "grad_norm": 0.4417729377746582, "learning_rate": 0.0002, "epoch": 2.638580931263858, "step": 1190}, {"loss": 1.6084, "grad_norm": 0.5219636559486389, "learning_rate": 0.0002, "epoch": 2.6607538802660753, "step": 1200}, {"loss": 1.6121, "grad_norm": 0.47702011466026306, "learning_rate": 0.0002, "epoch": 2.682926829268293, "step": 1210}, {"loss": 1.6942, "grad_norm": 0.4328458607196808, "learning_rate": 0.0002, "epoch": 2.70509977827051, "step": 1220}, {"loss": 1.6099, "grad_norm": 0.46762076020240784, "learning_rate": 0.0002, "epoch": 2.7272727272727275, "step": 1230}, {"loss": 1.7287, "grad_norm": 0.4592697322368622, "learning_rate": 0.0002, "epoch": 2.7494456762749446, "step": 1240}, {"loss": 1.617, "grad_norm": 0.5519265532493591, "learning_rate": 0.0002, "epoch": 2.7716186252771617, "step": 1250}, {"loss": 1.6868, "grad_norm": 0.47169506549835205, "learning_rate": 0.0002, "epoch": 2.7937915742793793, "step": 1260}, {"loss": 1.658, "grad_norm": 0.47231653332710266, "learning_rate": 0.0002, "epoch": 2.8159645232815964, "step": 1270}, {"loss": 1.6738, "grad_norm": 0.49081969261169434, "learning_rate": 0.0002, "epoch": 2.8381374722838135, "step": 1280}, {"loss": 1.7248, "grad_norm": 0.4483231008052826, "learning_rate": 0.0002, "epoch": 2.860310421286031, "step": 1290}, {"loss": 1.6428, "grad_norm": 0.5310035943984985, "learning_rate": 0.0002, "epoch": 2.882483370288248, "step": 1300}, {"loss": 1.6515, "grad_norm": 0.4419795572757721, "learning_rate": 0.0002, "epoch": 2.9046563192904657, "step": 1310}, {"loss": 1.6956, "grad_norm": 0.44630762934684753, "learning_rate": 0.0002, "epoch": 2.926829268292683, "step": 1320}, {"loss": 1.6464, "grad_norm": 0.39774850010871887, "learning_rate": 0.0002, "epoch": 2.9490022172949004, "step": 1330}, {"loss": 1.7007, "grad_norm": 0.441727876663208, "learning_rate": 0.0002, "epoch": 2.9711751662971175, "step": 1340}, {"loss": 1.5968, "grad_norm": 0.43773892521858215, "learning_rate": 0.0002, "epoch": 2.9933481152993346, "step": 1350}, {"eval_loss": 1.8514760732650757, "eval_runtime": 131.9812, "eval_samples_per_second": 3.902, "eval_steps_per_second": 0.492, "epoch": 3.0, "step": 1353}, {"loss": 1.553, "grad_norm": 0.5274150371551514, "learning_rate": 0.0002, "epoch": 3.015521064301552, "step": 1360}, {"loss": 1.4784, "grad_norm": 0.5724489092826843, "learning_rate": 0.0002, "epoch": 3.0376940133037693, "step": 1370}, {"loss": 1.5365, "grad_norm": 0.6182316541671753, "learning_rate": 0.0002, "epoch": 3.059866962305987, "step": 1380}, {"loss": 1.4824, "grad_norm": 0.5709688067436218, "learning_rate": 0.0002, "epoch": 3.082039911308204, "step": 1390}, {"loss": 1.534, "grad_norm": 0.6368464231491089, "learning_rate": 0.0002, "epoch": 3.104212860310421, "step": 1400}, {"loss": 1.5191, "grad_norm": 0.5680432319641113, "learning_rate": 0.0002, "epoch": 3.1263858093126387, "step": 1410}, {"loss": 1.5258, "grad_norm": 0.5805315375328064, "learning_rate": 0.0002, "epoch": 3.1485587583148558, "step": 1420}, {"loss": 1.612, "grad_norm": 0.5782836675643921, "learning_rate": 0.0002, "epoch": 3.1707317073170733, "step": 1430}, {"loss": 1.4852, "grad_norm": 0.627159595489502, "learning_rate": 0.0002, "epoch": 3.1929046563192904, "step": 1440}, {"loss": 1.5398, "grad_norm": 0.6136751174926758, "learning_rate": 0.0002, "epoch": 3.2150776053215075, "step": 1450}, {"loss": 1.5254, "grad_norm": 0.6319093108177185, "learning_rate": 0.0002, "epoch": 3.237250554323725, "step": 1460}, {"loss": 1.5789, "grad_norm": 0.7641780972480774, "learning_rate": 0.0002, "epoch": 3.259423503325942, "step": 1470}, {"loss": 1.5514, "grad_norm": 0.6116001605987549, "learning_rate": 0.0002, "epoch": 3.2815964523281598, "step": 1480}, {"loss": 1.4647, "grad_norm": 0.6024722456932068, "learning_rate": 0.0002, "epoch": 3.303769401330377, "step": 1490}, {"loss": 1.5561, "grad_norm": 0.5941570997238159, "learning_rate": 0.0002, "epoch": 3.3259423503325944, "step": 1500}, {"loss": 1.5104, "grad_norm": 0.608369767665863, "learning_rate": 0.0002, "epoch": 3.3481152993348116, "step": 1510}, {"loss": 1.5494, "grad_norm": 0.5942065715789795, "learning_rate": 0.0002, "epoch": 3.3702882483370287, "step": 1520}, {"loss": 1.5426, "grad_norm": 0.6382330656051636, "learning_rate": 0.0002, "epoch": 3.3924611973392462, "step": 1530}, {"loss": 1.5479, "grad_norm": 0.5839648842811584, "learning_rate": 0.0002, "epoch": 3.4146341463414633, "step": 1540}, {"loss": 1.5241, "grad_norm": 0.5627358555793762, "learning_rate": 0.0002, "epoch": 3.436807095343681, "step": 1550}, {"loss": 1.5679, "grad_norm": 0.6342151761054993, "learning_rate": 0.0002, "epoch": 3.458980044345898, "step": 1560}, {"loss": 1.5005, "grad_norm": 0.6370542645454407, "learning_rate": 0.0002, "epoch": 3.481152993348115, "step": 1570}, {"loss": 1.541, "grad_norm": 0.5974680185317993, "learning_rate": 0.0002, "epoch": 3.5033259423503327, "step": 1580}, {"loss": 1.553, "grad_norm": 0.6197021007537842, "learning_rate": 0.0002, "epoch": 3.52549889135255, "step": 1590}, {"loss": 1.5287, "grad_norm": 0.6413024067878723, "learning_rate": 0.0002, "epoch": 3.5476718403547673, "step": 1600}, {"loss": 1.5301, "grad_norm": 0.5878410339355469, "learning_rate": 0.0002, "epoch": 3.5698447893569845, "step": 1610}, {"loss": 1.4625, "grad_norm": 0.6485083103179932, "learning_rate": 0.0002, "epoch": 3.5920177383592016, "step": 1620}, {"loss": 1.5373, "grad_norm": 0.5826634764671326, "learning_rate": 0.0002, "epoch": 3.614190687361419, "step": 1630}, {"loss": 1.4952, "grad_norm": 0.8906663656234741, "learning_rate": 0.0002, "epoch": 3.6363636363636362, "step": 1640}, {"loss": 1.5208, "grad_norm": 0.6288479566574097, "learning_rate": 0.0002, "epoch": 3.658536585365854, "step": 1650}, {"loss": 1.6086, "grad_norm": 0.6191049218177795, "learning_rate": 0.0002, "epoch": 3.680709534368071, "step": 1660}, {"loss": 1.5043, "grad_norm": 0.5997978448867798, "learning_rate": 0.0002, "epoch": 3.7028824833702885, "step": 1670}, {"loss": 1.5654, "grad_norm": 0.6003038287162781, "learning_rate": 0.0002, "epoch": 3.7250554323725056, "step": 1680}, {"loss": 1.4941, "grad_norm": 0.5417194962501526, "learning_rate": 0.0002, "epoch": 3.7472283813747227, "step": 1690}, {"loss": 1.5541, "grad_norm": 0.6367442607879639, "learning_rate": 0.0002, "epoch": 3.7694013303769403, "step": 1700}, {"loss": 1.5483, "grad_norm": 0.6613120436668396, "learning_rate": 0.0002, "epoch": 3.7915742793791574, "step": 1710}, {"loss": 1.5999, "grad_norm": 0.6506749391555786, "learning_rate": 0.0002, "epoch": 3.8137472283813745, "step": 1720}, {"loss": 1.5207, "grad_norm": 0.5478500723838806, "learning_rate": 0.0002, "epoch": 3.835920177383592, "step": 1730}, {"loss": 1.5619, "grad_norm": 0.7313215732574463, "learning_rate": 0.0002, "epoch": 3.858093126385809, "step": 1740}, {"loss": 1.4486, "grad_norm": 0.5453857183456421, "learning_rate": 0.0002, "epoch": 3.8802660753880267, "step": 1750}, {"loss": 1.4857, "grad_norm": 0.5983547568321228, "learning_rate": 0.0002, "epoch": 3.902439024390244, "step": 1760}, {"loss": 1.651, "grad_norm": 0.6471580266952515, "learning_rate": 0.0002, "epoch": 3.9246119733924614, "step": 1770}, {"loss": 1.461, "grad_norm": 0.5833685398101807, "learning_rate": 0.0002, "epoch": 3.9467849223946785, "step": 1780}, {"loss": 1.5014, "grad_norm": 0.5509327054023743, "learning_rate": 0.0002, "epoch": 3.9689578713968956, "step": 1790}, {"loss": 1.6225, "grad_norm": 0.6021352410316467, "learning_rate": 0.0002, "epoch": 3.991130820399113, "step": 1800}, {"eval_loss": 1.901047945022583, "eval_runtime": 82.2708, "eval_samples_per_second": 6.26, "eval_steps_per_second": 0.79, "epoch": 4.0, "step": 1804}, {"loss": 1.422, "grad_norm": 0.6232016682624817, "learning_rate": 0.0002, "epoch": 4.013303769401331, "step": 1810}, {"loss": 1.3769, "grad_norm": 0.7521207928657532, "learning_rate": 0.0002, "epoch": 4.035476718403547, "step": 1820}, {"loss": 1.4481, "grad_norm": 0.7839062213897705, "learning_rate": 0.0002, "epoch": 4.057649667405765, "step": 1830}, {"loss": 1.4147, "grad_norm": 0.8654165863990784, "learning_rate": 0.0002, "epoch": 4.0798226164079825, "step": 1840}, {"loss": 1.2983, "grad_norm": 0.6872738599777222, "learning_rate": 0.0002, "epoch": 4.101995565410199, "step": 1850}, {"loss": 1.3115, "grad_norm": 0.7529677748680115, "learning_rate": 0.0002, "epoch": 4.124168514412417, "step": 1860}, {"loss": 1.3869, "grad_norm": 0.835027277469635, "learning_rate": 0.0002, "epoch": 4.146341463414634, "step": 1870}, {"loss": 1.3273, "grad_norm": 0.7457721829414368, "learning_rate": 0.0002, "epoch": 4.168514412416852, "step": 1880}, {"loss": 1.2893, "grad_norm": 0.7366040349006653, "learning_rate": 0.0002, "epoch": 4.1906873614190685, "step": 1890}, {"loss": 1.3615, "grad_norm": 0.7802833914756775, "learning_rate": 0.0002, "epoch": 4.212860310421286, "step": 1900}, {"loss": 1.3607, "grad_norm": 0.7526614665985107, "learning_rate": 0.0002, "epoch": 4.235033259423504, "step": 1910}, {"loss": 1.4384, "grad_norm": 0.7531310319900513, "learning_rate": 0.0002, "epoch": 4.25720620842572, "step": 1920}, {"loss": 1.4074, "grad_norm": 0.8899626135826111, "learning_rate": 0.0002, "epoch": 4.279379157427938, "step": 1930}, {"loss": 1.328, "grad_norm": 0.7591356635093689, "learning_rate": 0.0002, "epoch": 4.301552106430155, "step": 1940}, {"loss": 1.4114, "grad_norm": 0.7126884460449219, "learning_rate": 0.0002, "epoch": 4.323725055432373, "step": 1950}, {"loss": 1.4259, "grad_norm": 0.7907777428627014, "learning_rate": 0.0002, "epoch": 4.34589800443459, "step": 1960}, {"loss": 1.3982, "grad_norm": 0.7854869961738586, "learning_rate": 0.0002, "epoch": 4.368070953436807, "step": 1970}, {"loss": 1.4126, "grad_norm": 0.6982123851776123, "learning_rate": 0.0002, "epoch": 4.390243902439025, "step": 1980}, {"loss": 1.3683, "grad_norm": 0.7551925182342529, "learning_rate": 0.0002, "epoch": 4.412416851441241, "step": 1990}, {"loss": 1.4551, "grad_norm": 0.864078164100647, "learning_rate": 0.0002, "epoch": 4.434589800443459, "step": 2000}, {"loss": 1.3982, "grad_norm": 0.8406776189804077, "learning_rate": 0.0002, "epoch": 4.4567627494456765, "step": 2010}, {"loss": 1.3543, "grad_norm": 0.7706766724586487, "learning_rate": 0.0002, "epoch": 4.478935698447893, "step": 2020}, {"loss": 1.386, "grad_norm": 0.7703949213027954, "learning_rate": 0.0002, "epoch": 4.501108647450111, "step": 2030}, {"loss": 1.4059, "grad_norm": 0.8654166460037231, "learning_rate": 0.0002, "epoch": 4.523281596452328, "step": 2040}, {"loss": 1.4067, "grad_norm": 0.7800114750862122, "learning_rate": 0.0002, "epoch": 4.545454545454545, "step": 2050}, {"loss": 1.3578, "grad_norm": 0.7553898692131042, "learning_rate": 0.0002, "epoch": 4.5676274944567625, "step": 2060}, {"loss": 1.3845, "grad_norm": 0.8689188957214355, "learning_rate": 0.0002, "epoch": 4.58980044345898, "step": 2070}, {"loss": 1.3851, "grad_norm": 0.7244092226028442, "learning_rate": 0.0002, "epoch": 4.611973392461198, "step": 2080}, {"loss": 1.3627, "grad_norm": 0.9829743504524231, "learning_rate": 0.0002, "epoch": 4.634146341463414, "step": 2090}, {"loss": 1.4059, "grad_norm": 0.8026102185249329, "learning_rate": 0.0002, "epoch": 4.656319290465632, "step": 2100}, {"loss": 1.3676, "grad_norm": 0.6725143194198608, "learning_rate": 0.0002, "epoch": 4.678492239467849, "step": 2110}, {"loss": 1.4669, "grad_norm": 0.8055245876312256, "learning_rate": 0.0002, "epoch": 4.700665188470067, "step": 2120}, {"loss": 1.4455, "grad_norm": 0.7507025003433228, "learning_rate": 0.0002, "epoch": 4.722838137472284, "step": 2130}, {"loss": 1.3974, "grad_norm": 0.7166216969490051, "learning_rate": 0.0002, "epoch": 4.745011086474501, "step": 2140}, {"loss": 1.33, "grad_norm": 0.6826853156089783, "learning_rate": 0.0002, "epoch": 4.767184035476719, "step": 2150}, {"loss": 1.3907, "grad_norm": 1.1347891092300415, "learning_rate": 0.0002, "epoch": 4.789356984478935, "step": 2160}, {"loss": 1.3737, "grad_norm": 0.8205971121788025, "learning_rate": 0.0002, "epoch": 4.811529933481153, "step": 2170}, {"loss": 1.3886, "grad_norm": 0.7861950397491455, "learning_rate": 0.0002, "epoch": 4.8337028824833705, "step": 2180}, {"loss": 1.4293, "grad_norm": 0.839460551738739, "learning_rate": 0.0002, "epoch": 4.855875831485587, "step": 2190}, {"loss": 1.3881, "grad_norm": 0.746583878993988, "learning_rate": 0.0002, "epoch": 4.878048780487805, "step": 2200}, {"loss": 1.4519, "grad_norm": 0.7805684804916382, "learning_rate": 0.0002, "epoch": 4.900221729490022, "step": 2210}, {"loss": 1.4053, "grad_norm": 0.8079700469970703, "learning_rate": 0.0002, "epoch": 4.922394678492239, "step": 2220}, {"loss": 1.353, "grad_norm": 0.7609502673149109, "learning_rate": 0.0002, "epoch": 4.9445676274944566, "step": 2230}, {"loss": 1.3816, "grad_norm": 0.7862996459007263, "learning_rate": 0.0002, "epoch": 4.966740576496674, "step": 2240}, {"loss": 1.4249, "grad_norm": 0.778677225112915, "learning_rate": 0.0002, "epoch": 4.988913525498892, "step": 2250}]} +{"epoch": 6.0, "step": 2706, "epoch_duration": 1554.9798872470856, "total_accumulated_duration": 7738.4586753845215, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.684, "grad_norm": 0.6454975008964539, "learning_rate": 0.0002, "epoch": 0.022172949002217297, "step": 10}, {"loss": 2.2694, "grad_norm": 0.5452715158462524, "learning_rate": 0.0002, "epoch": 0.04434589800443459, "step": 20}, {"loss": 2.0796, "grad_norm": 0.5502195358276367, "learning_rate": 0.0002, "epoch": 0.06651884700665188, "step": 30}, {"loss": 1.9132, "grad_norm": 0.48551198840141296, "learning_rate": 0.0002, "epoch": 0.08869179600886919, "step": 40}, {"loss": 2.016, "grad_norm": 0.47822514176368713, "learning_rate": 0.0002, "epoch": 0.11086474501108648, "step": 50}, {"loss": 1.9455, "grad_norm": 0.5125395655632019, "learning_rate": 0.0002, "epoch": 0.13303769401330376, "step": 60}, {"loss": 1.9017, "grad_norm": 0.4600693881511688, "learning_rate": 0.0002, "epoch": 0.15521064301552107, "step": 70}, {"loss": 2.0057, "grad_norm": 0.7180814743041992, "learning_rate": 0.0002, "epoch": 0.17738359201773837, "step": 80}, {"loss": 1.8925, "grad_norm": 0.4712974429130554, "learning_rate": 0.0002, "epoch": 0.19955654101995565, "step": 90}, {"loss": 1.8386, "grad_norm": 0.4673261344432831, "learning_rate": 0.0002, "epoch": 0.22172949002217296, "step": 100}, {"loss": 1.9346, "grad_norm": 0.4129070043563843, "learning_rate": 0.0002, "epoch": 0.24390243902439024, "step": 110}, {"loss": 1.9018, "grad_norm": 0.3859104812145233, "learning_rate": 0.0002, "epoch": 0.2660753880266075, "step": 120}, {"loss": 1.8922, "grad_norm": 0.40966713428497314, "learning_rate": 0.0002, "epoch": 0.28824833702882485, "step": 130}, {"loss": 1.846, "grad_norm": 0.3685867488384247, "learning_rate": 0.0002, "epoch": 0.31042128603104213, "step": 140}, {"loss": 1.9017, "grad_norm": 0.39279988408088684, "learning_rate": 0.0002, "epoch": 0.3325942350332594, "step": 150}, {"loss": 1.8556, "grad_norm": 0.4195398986339569, "learning_rate": 0.0002, "epoch": 0.35476718403547675, "step": 160}, {"loss": 1.7883, "grad_norm": 0.469802588224411, "learning_rate": 0.0002, "epoch": 0.376940133037694, "step": 170}, {"loss": 1.8135, "grad_norm": 0.4069509208202362, "learning_rate": 0.0002, "epoch": 0.3991130820399113, "step": 180}, {"loss": 1.8429, "grad_norm": 0.47832027077674866, "learning_rate": 0.0002, "epoch": 0.4212860310421286, "step": 190}, {"loss": 1.781, "grad_norm": 0.3376411199569702, "learning_rate": 0.0002, "epoch": 0.4434589800443459, "step": 200}, {"loss": 1.8562, "grad_norm": 0.3787185847759247, "learning_rate": 0.0002, "epoch": 0.4656319290465632, "step": 210}, {"loss": 1.8352, "grad_norm": 0.40322697162628174, "learning_rate": 0.0002, "epoch": 0.4878048780487805, "step": 220}, {"loss": 1.8007, "grad_norm": 0.3710436522960663, "learning_rate": 0.0002, "epoch": 0.5099778270509978, "step": 230}, {"loss": 1.8528, "grad_norm": 0.3723200261592865, "learning_rate": 0.0002, "epoch": 0.532150776053215, "step": 240}, {"loss": 1.852, "grad_norm": 0.3457179069519043, "learning_rate": 0.0002, "epoch": 0.5543237250554324, "step": 250}, {"loss": 1.8175, "grad_norm": 0.35369473695755005, "learning_rate": 0.0002, "epoch": 0.5764966740576497, "step": 260}, {"loss": 1.7742, "grad_norm": 0.3667483329772949, "learning_rate": 0.0002, "epoch": 0.5986696230598669, "step": 270}, {"loss": 1.8152, "grad_norm": 0.4023273289203644, "learning_rate": 0.0002, "epoch": 0.6208425720620843, "step": 280}, {"loss": 1.8382, "grad_norm": 0.3601929843425751, "learning_rate": 0.0002, "epoch": 0.6430155210643016, "step": 290}, {"loss": 1.8152, "grad_norm": 0.32610392570495605, "learning_rate": 0.0002, "epoch": 0.6651884700665188, "step": 300}, {"loss": 1.8412, "grad_norm": 0.40528756380081177, "learning_rate": 0.0002, "epoch": 0.6873614190687362, "step": 310}, {"loss": 1.8528, "grad_norm": 0.34639739990234375, "learning_rate": 0.0002, "epoch": 0.7095343680709535, "step": 320}, {"loss": 1.807, "grad_norm": 0.3794991374015808, "learning_rate": 0.0002, "epoch": 0.7317073170731707, "step": 330}, {"loss": 1.8678, "grad_norm": 0.34203875064849854, "learning_rate": 0.0002, "epoch": 0.753880266075388, "step": 340}, {"loss": 1.7605, "grad_norm": 0.36692821979522705, "learning_rate": 0.0002, "epoch": 0.7760532150776053, "step": 350}, {"loss": 1.7535, "grad_norm": 0.3701125979423523, "learning_rate": 0.0002, "epoch": 0.7982261640798226, "step": 360}, {"loss": 1.7574, "grad_norm": 0.3971416652202606, "learning_rate": 0.0002, "epoch": 0.8203991130820399, "step": 370}, {"loss": 1.8476, "grad_norm": 0.3751989006996155, "learning_rate": 0.0002, "epoch": 0.8425720620842572, "step": 380}, {"loss": 1.8281, "grad_norm": 0.35116496682167053, "learning_rate": 0.0002, "epoch": 0.8647450110864745, "step": 390}, {"loss": 1.7459, "grad_norm": 0.3672674894332886, "learning_rate": 0.0002, "epoch": 0.8869179600886918, "step": 400}, {"loss": 1.8293, "grad_norm": 0.34648260474205017, "learning_rate": 0.0002, "epoch": 0.9090909090909091, "step": 410}, {"loss": 1.7601, "grad_norm": 0.4497389793395996, "learning_rate": 0.0002, "epoch": 0.9312638580931264, "step": 420}, {"loss": 1.8405, "grad_norm": 0.33595147728919983, "learning_rate": 0.0002, "epoch": 0.9534368070953437, "step": 430}, {"loss": 1.8137, "grad_norm": 0.3130456805229187, "learning_rate": 0.0002, "epoch": 0.975609756097561, "step": 440}, {"loss": 1.8226, "grad_norm": 0.36480239033699036, "learning_rate": 0.0002, "epoch": 0.9977827050997783, "step": 450}, {"eval_loss": 1.8323718309402466, "eval_runtime": 79.9603, "eval_samples_per_second": 6.441, "eval_steps_per_second": 0.813, "epoch": 1.0, "step": 451}, {"loss": 1.7921, "grad_norm": 0.3840029835700989, "learning_rate": 0.0002, "epoch": 1.0199556541019956, "step": 460}, {"loss": 1.7057, "grad_norm": 0.33457425236701965, "learning_rate": 0.0002, "epoch": 1.042128603104213, "step": 470}, {"loss": 1.7028, "grad_norm": 0.35766592621803284, "learning_rate": 0.0002, "epoch": 1.06430155210643, "step": 480}, {"loss": 1.7868, "grad_norm": 0.38070937991142273, "learning_rate": 0.0002, "epoch": 1.0864745011086474, "step": 490}, {"loss": 1.72, "grad_norm": 0.38546547293663025, "learning_rate": 0.0002, "epoch": 1.1086474501108647, "step": 500}, {"loss": 1.7088, "grad_norm": 0.384104460477829, "learning_rate": 0.0002, "epoch": 1.130820399113082, "step": 510}, {"loss": 1.7779, "grad_norm": 0.3556116819381714, "learning_rate": 0.0002, "epoch": 1.1529933481152994, "step": 520}, {"loss": 1.7538, "grad_norm": 0.4110541343688965, "learning_rate": 0.0002, "epoch": 1.1751662971175167, "step": 530}, {"loss": 1.7608, "grad_norm": 0.46503177285194397, "learning_rate": 0.0002, "epoch": 1.1973392461197339, "step": 540}, {"loss": 1.729, "grad_norm": 0.4366816580295563, "learning_rate": 0.0002, "epoch": 1.2195121951219512, "step": 550}, {"loss": 1.7954, "grad_norm": 0.379986047744751, "learning_rate": 0.0002, "epoch": 1.2416851441241685, "step": 560}, {"loss": 1.6852, "grad_norm": 0.3920869529247284, "learning_rate": 0.0002, "epoch": 1.2638580931263859, "step": 570}, {"loss": 1.8265, "grad_norm": 0.4013986587524414, "learning_rate": 0.0002, "epoch": 1.2860310421286032, "step": 580}, {"loss": 1.7294, "grad_norm": 0.39104390144348145, "learning_rate": 0.0002, "epoch": 1.3082039911308203, "step": 590}, {"loss": 1.7822, "grad_norm": 0.40515613555908203, "learning_rate": 0.0002, "epoch": 1.3303769401330376, "step": 600}, {"loss": 1.7614, "grad_norm": 0.4212331473827362, "learning_rate": 0.0002, "epoch": 1.352549889135255, "step": 610}, {"loss": 1.7883, "grad_norm": 0.36040815711021423, "learning_rate": 0.0002, "epoch": 1.3747228381374723, "step": 620}, {"loss": 1.7467, "grad_norm": 0.3950865864753723, "learning_rate": 0.0002, "epoch": 1.3968957871396896, "step": 630}, {"loss": 1.7242, "grad_norm": 0.3934709131717682, "learning_rate": 0.0002, "epoch": 1.4190687361419068, "step": 640}, {"loss": 1.7783, "grad_norm": 0.3905350863933563, "learning_rate": 0.0002, "epoch": 1.441241685144124, "step": 650}, {"loss": 1.7612, "grad_norm": 0.4322686493396759, "learning_rate": 0.0002, "epoch": 1.4634146341463414, "step": 660}, {"loss": 1.7357, "grad_norm": 0.35697034001350403, "learning_rate": 0.0002, "epoch": 1.4855875831485588, "step": 670}, {"loss": 1.6764, "grad_norm": 0.38570451736450195, "learning_rate": 0.0002, "epoch": 1.507760532150776, "step": 680}, {"loss": 1.7054, "grad_norm": 0.3804517090320587, "learning_rate": 0.0002, "epoch": 1.5299334811529932, "step": 690}, {"loss": 1.7725, "grad_norm": 0.4938165247440338, "learning_rate": 0.0002, "epoch": 1.5521064301552108, "step": 700}, {"loss": 1.7982, "grad_norm": 0.43075236678123474, "learning_rate": 0.0002, "epoch": 1.5742793791574279, "step": 710}, {"loss": 1.7741, "grad_norm": 0.40434643626213074, "learning_rate": 0.0002, "epoch": 1.5964523281596452, "step": 720}, {"loss": 1.7122, "grad_norm": 0.3874157667160034, "learning_rate": 0.0002, "epoch": 1.6186252771618626, "step": 730}, {"loss": 1.8141, "grad_norm": 0.3645969331264496, "learning_rate": 0.0002, "epoch": 1.6407982261640797, "step": 740}, {"loss": 1.7868, "grad_norm": 0.38588255643844604, "learning_rate": 0.0002, "epoch": 1.6629711751662972, "step": 750}, {"loss": 1.7847, "grad_norm": 0.39252519607543945, "learning_rate": 0.0002, "epoch": 1.6851441241685143, "step": 760}, {"loss": 1.7424, "grad_norm": 0.36165162920951843, "learning_rate": 0.0002, "epoch": 1.7073170731707317, "step": 770}, {"loss": 1.7172, "grad_norm": 0.36677947640419006, "learning_rate": 0.0002, "epoch": 1.729490022172949, "step": 780}, {"loss": 1.7737, "grad_norm": 0.374881774187088, "learning_rate": 0.0002, "epoch": 1.7516629711751663, "step": 790}, {"loss": 1.7331, "grad_norm": 0.4530802369117737, "learning_rate": 0.0002, "epoch": 1.7738359201773837, "step": 800}, {"loss": 1.7799, "grad_norm": 0.3879568576812744, "learning_rate": 0.0002, "epoch": 1.7960088691796008, "step": 810}, {"loss": 1.7562, "grad_norm": 0.3710079789161682, "learning_rate": 0.0002, "epoch": 1.8181818181818183, "step": 820}, {"loss": 1.7728, "grad_norm": 0.3831799030303955, "learning_rate": 0.0002, "epoch": 1.8403547671840355, "step": 830}, {"loss": 1.7605, "grad_norm": 0.3958432376384735, "learning_rate": 0.0002, "epoch": 1.8625277161862528, "step": 840}, {"loss": 1.8187, "grad_norm": 0.4129294157028198, "learning_rate": 0.0002, "epoch": 1.8847006651884701, "step": 850}, {"loss": 1.6506, "grad_norm": 0.3714745044708252, "learning_rate": 0.0002, "epoch": 1.9068736141906872, "step": 860}, {"loss": 1.707, "grad_norm": 0.40176868438720703, "learning_rate": 0.0002, "epoch": 1.9290465631929048, "step": 870}, {"loss": 1.7557, "grad_norm": 0.36937767267227173, "learning_rate": 0.0002, "epoch": 1.951219512195122, "step": 880}, {"loss": 1.7353, "grad_norm": 0.40242597460746765, "learning_rate": 0.0002, "epoch": 1.9733924611973392, "step": 890}, {"loss": 1.7402, "grad_norm": 0.3515510559082031, "learning_rate": 0.0002, "epoch": 1.9955654101995566, "step": 900}, {"eval_loss": 1.8243104219436646, "eval_runtime": 107.8856, "eval_samples_per_second": 4.774, "eval_steps_per_second": 0.602, "epoch": 2.0, "step": 902}, {"loss": 1.6631, "grad_norm": 0.4145216643810272, "learning_rate": 0.0002, "epoch": 2.0177383592017737, "step": 910}, {"loss": 1.7227, "grad_norm": 0.42423519492149353, "learning_rate": 0.0002, "epoch": 2.0399113082039912, "step": 920}, {"loss": 1.6357, "grad_norm": 0.4773229956626892, "learning_rate": 0.0002, "epoch": 2.0620842572062084, "step": 930}, {"loss": 1.671, "grad_norm": 0.4144791066646576, "learning_rate": 0.0002, "epoch": 2.084257206208426, "step": 940}, {"loss": 1.6433, "grad_norm": 0.42704132199287415, "learning_rate": 0.0002, "epoch": 2.106430155210643, "step": 950}, {"loss": 1.6767, "grad_norm": 0.4479042589664459, "learning_rate": 0.0002, "epoch": 2.12860310421286, "step": 960}, {"loss": 1.6122, "grad_norm": 0.4810638129711151, "learning_rate": 0.0002, "epoch": 2.1507760532150777, "step": 970}, {"loss": 1.6613, "grad_norm": 0.48669910430908203, "learning_rate": 0.0002, "epoch": 2.172949002217295, "step": 980}, {"loss": 1.6274, "grad_norm": 0.4252761900424957, "learning_rate": 0.0002, "epoch": 2.1951219512195124, "step": 990}, {"loss": 1.6514, "grad_norm": 0.42342790961265564, "learning_rate": 0.0002, "epoch": 2.2172949002217295, "step": 1000}, {"loss": 1.637, "grad_norm": 0.43432456254959106, "learning_rate": 0.0002, "epoch": 2.2394678492239466, "step": 1010}, {"loss": 1.6856, "grad_norm": 0.45556965470314026, "learning_rate": 0.0002, "epoch": 2.261640798226164, "step": 1020}, {"loss": 1.6554, "grad_norm": 0.48035719990730286, "learning_rate": 0.0002, "epoch": 2.2838137472283813, "step": 1030}, {"loss": 1.6947, "grad_norm": 0.4233241081237793, "learning_rate": 0.0002, "epoch": 2.305986696230599, "step": 1040}, {"loss": 1.6501, "grad_norm": 0.3918434679508209, "learning_rate": 0.0002, "epoch": 2.328159645232816, "step": 1050}, {"loss": 1.6903, "grad_norm": 0.44049757719039917, "learning_rate": 0.0002, "epoch": 2.3503325942350335, "step": 1060}, {"loss": 1.6337, "grad_norm": 0.4730056822299957, "learning_rate": 0.0002, "epoch": 2.3725055432372506, "step": 1070}, {"loss": 1.6104, "grad_norm": 0.4354589581489563, "learning_rate": 0.0002, "epoch": 2.3946784922394677, "step": 1080}, {"loss": 1.6496, "grad_norm": 0.4837590456008911, "learning_rate": 0.0002, "epoch": 2.4168514412416853, "step": 1090}, {"loss": 1.6672, "grad_norm": 0.4842571020126343, "learning_rate": 0.0002, "epoch": 2.4390243902439024, "step": 1100}, {"loss": 1.6091, "grad_norm": 0.46398279070854187, "learning_rate": 0.0002, "epoch": 2.4611973392461195, "step": 1110}, {"loss": 1.6393, "grad_norm": 0.4587327539920807, "learning_rate": 0.0002, "epoch": 2.483370288248337, "step": 1120}, {"loss": 1.621, "grad_norm": 0.4336528480052948, "learning_rate": 0.0002, "epoch": 2.505543237250554, "step": 1130}, {"loss": 1.6199, "grad_norm": 0.6162153482437134, "learning_rate": 0.0002, "epoch": 2.5277161862527717, "step": 1140}, {"loss": 1.7244, "grad_norm": 0.48175573348999023, "learning_rate": 0.0002, "epoch": 2.549889135254989, "step": 1150}, {"loss": 1.6098, "grad_norm": 0.448272705078125, "learning_rate": 0.0002, "epoch": 2.5720620842572064, "step": 1160}, {"loss": 1.6987, "grad_norm": 0.5189200639724731, "learning_rate": 0.0002, "epoch": 2.5942350332594235, "step": 1170}, {"loss": 1.6503, "grad_norm": 0.45032963156700134, "learning_rate": 0.0002, "epoch": 2.6164079822616406, "step": 1180}, {"loss": 1.6508, "grad_norm": 0.4417729377746582, "learning_rate": 0.0002, "epoch": 2.638580931263858, "step": 1190}, {"loss": 1.6084, "grad_norm": 0.5219636559486389, "learning_rate": 0.0002, "epoch": 2.6607538802660753, "step": 1200}, {"loss": 1.6121, "grad_norm": 0.47702011466026306, "learning_rate": 0.0002, "epoch": 2.682926829268293, "step": 1210}, {"loss": 1.6942, "grad_norm": 0.4328458607196808, "learning_rate": 0.0002, "epoch": 2.70509977827051, "step": 1220}, {"loss": 1.6099, "grad_norm": 0.46762076020240784, "learning_rate": 0.0002, "epoch": 2.7272727272727275, "step": 1230}, {"loss": 1.7287, "grad_norm": 0.4592697322368622, "learning_rate": 0.0002, "epoch": 2.7494456762749446, "step": 1240}, {"loss": 1.617, "grad_norm": 0.5519265532493591, "learning_rate": 0.0002, "epoch": 2.7716186252771617, "step": 1250}, {"loss": 1.6868, "grad_norm": 0.47169506549835205, "learning_rate": 0.0002, "epoch": 2.7937915742793793, "step": 1260}, {"loss": 1.658, "grad_norm": 0.47231653332710266, "learning_rate": 0.0002, "epoch": 2.8159645232815964, "step": 1270}, {"loss": 1.6738, "grad_norm": 0.49081969261169434, "learning_rate": 0.0002, "epoch": 2.8381374722838135, "step": 1280}, {"loss": 1.7248, "grad_norm": 0.4483231008052826, "learning_rate": 0.0002, "epoch": 2.860310421286031, "step": 1290}, {"loss": 1.6428, "grad_norm": 0.5310035943984985, "learning_rate": 0.0002, "epoch": 2.882483370288248, "step": 1300}, {"loss": 1.6515, "grad_norm": 0.4419795572757721, "learning_rate": 0.0002, "epoch": 2.9046563192904657, "step": 1310}, {"loss": 1.6956, "grad_norm": 0.44630762934684753, "learning_rate": 0.0002, "epoch": 2.926829268292683, "step": 1320}, {"loss": 1.6464, "grad_norm": 0.39774850010871887, "learning_rate": 0.0002, "epoch": 2.9490022172949004, "step": 1330}, {"loss": 1.7007, "grad_norm": 0.441727876663208, "learning_rate": 0.0002, "epoch": 2.9711751662971175, "step": 1340}, {"loss": 1.5968, "grad_norm": 0.43773892521858215, "learning_rate": 0.0002, "epoch": 2.9933481152993346, "step": 1350}, {"eval_loss": 1.8514760732650757, "eval_runtime": 131.9812, "eval_samples_per_second": 3.902, "eval_steps_per_second": 0.492, "epoch": 3.0, "step": 1353}, {"loss": 1.553, "grad_norm": 0.5274150371551514, "learning_rate": 0.0002, "epoch": 3.015521064301552, "step": 1360}, {"loss": 1.4784, "grad_norm": 0.5724489092826843, "learning_rate": 0.0002, "epoch": 3.0376940133037693, "step": 1370}, {"loss": 1.5365, "grad_norm": 0.6182316541671753, "learning_rate": 0.0002, "epoch": 3.059866962305987, "step": 1380}, {"loss": 1.4824, "grad_norm": 0.5709688067436218, "learning_rate": 0.0002, "epoch": 3.082039911308204, "step": 1390}, {"loss": 1.534, "grad_norm": 0.6368464231491089, "learning_rate": 0.0002, "epoch": 3.104212860310421, "step": 1400}, {"loss": 1.5191, "grad_norm": 0.5680432319641113, "learning_rate": 0.0002, "epoch": 3.1263858093126387, "step": 1410}, {"loss": 1.5258, "grad_norm": 0.5805315375328064, "learning_rate": 0.0002, "epoch": 3.1485587583148558, "step": 1420}, {"loss": 1.612, "grad_norm": 0.5782836675643921, "learning_rate": 0.0002, "epoch": 3.1707317073170733, "step": 1430}, {"loss": 1.4852, "grad_norm": 0.627159595489502, "learning_rate": 0.0002, "epoch": 3.1929046563192904, "step": 1440}, {"loss": 1.5398, "grad_norm": 0.6136751174926758, "learning_rate": 0.0002, "epoch": 3.2150776053215075, "step": 1450}, {"loss": 1.5254, "grad_norm": 0.6319093108177185, "learning_rate": 0.0002, "epoch": 3.237250554323725, "step": 1460}, {"loss": 1.5789, "grad_norm": 0.7641780972480774, "learning_rate": 0.0002, "epoch": 3.259423503325942, "step": 1470}, {"loss": 1.5514, "grad_norm": 0.6116001605987549, "learning_rate": 0.0002, "epoch": 3.2815964523281598, "step": 1480}, {"loss": 1.4647, "grad_norm": 0.6024722456932068, "learning_rate": 0.0002, "epoch": 3.303769401330377, "step": 1490}, {"loss": 1.5561, "grad_norm": 0.5941570997238159, "learning_rate": 0.0002, "epoch": 3.3259423503325944, "step": 1500}, {"loss": 1.5104, "grad_norm": 0.608369767665863, "learning_rate": 0.0002, "epoch": 3.3481152993348116, "step": 1510}, {"loss": 1.5494, "grad_norm": 0.5942065715789795, "learning_rate": 0.0002, "epoch": 3.3702882483370287, "step": 1520}, {"loss": 1.5426, "grad_norm": 0.6382330656051636, "learning_rate": 0.0002, "epoch": 3.3924611973392462, "step": 1530}, {"loss": 1.5479, "grad_norm": 0.5839648842811584, "learning_rate": 0.0002, "epoch": 3.4146341463414633, "step": 1540}, {"loss": 1.5241, "grad_norm": 0.5627358555793762, "learning_rate": 0.0002, "epoch": 3.436807095343681, "step": 1550}, {"loss": 1.5679, "grad_norm": 0.6342151761054993, "learning_rate": 0.0002, "epoch": 3.458980044345898, "step": 1560}, {"loss": 1.5005, "grad_norm": 0.6370542645454407, "learning_rate": 0.0002, "epoch": 3.481152993348115, "step": 1570}, {"loss": 1.541, "grad_norm": 0.5974680185317993, "learning_rate": 0.0002, "epoch": 3.5033259423503327, "step": 1580}, {"loss": 1.553, "grad_norm": 0.6197021007537842, "learning_rate": 0.0002, "epoch": 3.52549889135255, "step": 1590}, {"loss": 1.5287, "grad_norm": 0.6413024067878723, "learning_rate": 0.0002, "epoch": 3.5476718403547673, "step": 1600}, {"loss": 1.5301, "grad_norm": 0.5878410339355469, "learning_rate": 0.0002, "epoch": 3.5698447893569845, "step": 1610}, {"loss": 1.4625, "grad_norm": 0.6485083103179932, "learning_rate": 0.0002, "epoch": 3.5920177383592016, "step": 1620}, {"loss": 1.5373, "grad_norm": 0.5826634764671326, "learning_rate": 0.0002, "epoch": 3.614190687361419, "step": 1630}, {"loss": 1.4952, "grad_norm": 0.8906663656234741, "learning_rate": 0.0002, "epoch": 3.6363636363636362, "step": 1640}, {"loss": 1.5208, "grad_norm": 0.6288479566574097, "learning_rate": 0.0002, "epoch": 3.658536585365854, "step": 1650}, {"loss": 1.6086, "grad_norm": 0.6191049218177795, "learning_rate": 0.0002, "epoch": 3.680709534368071, "step": 1660}, {"loss": 1.5043, "grad_norm": 0.5997978448867798, "learning_rate": 0.0002, "epoch": 3.7028824833702885, "step": 1670}, {"loss": 1.5654, "grad_norm": 0.6003038287162781, "learning_rate": 0.0002, "epoch": 3.7250554323725056, "step": 1680}, {"loss": 1.4941, "grad_norm": 0.5417194962501526, "learning_rate": 0.0002, "epoch": 3.7472283813747227, "step": 1690}, {"loss": 1.5541, "grad_norm": 0.6367442607879639, "learning_rate": 0.0002, "epoch": 3.7694013303769403, "step": 1700}, {"loss": 1.5483, "grad_norm": 0.6613120436668396, "learning_rate": 0.0002, "epoch": 3.7915742793791574, "step": 1710}, {"loss": 1.5999, "grad_norm": 0.6506749391555786, "learning_rate": 0.0002, "epoch": 3.8137472283813745, "step": 1720}, {"loss": 1.5207, "grad_norm": 0.5478500723838806, "learning_rate": 0.0002, "epoch": 3.835920177383592, "step": 1730}, {"loss": 1.5619, "grad_norm": 0.7313215732574463, "learning_rate": 0.0002, "epoch": 3.858093126385809, "step": 1740}, {"loss": 1.4486, "grad_norm": 0.5453857183456421, "learning_rate": 0.0002, "epoch": 3.8802660753880267, "step": 1750}, {"loss": 1.4857, "grad_norm": 0.5983547568321228, "learning_rate": 0.0002, "epoch": 3.902439024390244, "step": 1760}, {"loss": 1.651, "grad_norm": 0.6471580266952515, "learning_rate": 0.0002, "epoch": 3.9246119733924614, "step": 1770}, {"loss": 1.461, "grad_norm": 0.5833685398101807, "learning_rate": 0.0002, "epoch": 3.9467849223946785, "step": 1780}, {"loss": 1.5014, "grad_norm": 0.5509327054023743, "learning_rate": 0.0002, "epoch": 3.9689578713968956, "step": 1790}, {"loss": 1.6225, "grad_norm": 0.6021352410316467, "learning_rate": 0.0002, "epoch": 3.991130820399113, "step": 1800}, {"eval_loss": 1.901047945022583, "eval_runtime": 82.2708, "eval_samples_per_second": 6.26, "eval_steps_per_second": 0.79, "epoch": 4.0, "step": 1804}, {"loss": 1.422, "grad_norm": 0.6232016682624817, "learning_rate": 0.0002, "epoch": 4.013303769401331, "step": 1810}, {"loss": 1.3769, "grad_norm": 0.7521207928657532, "learning_rate": 0.0002, "epoch": 4.035476718403547, "step": 1820}, {"loss": 1.4481, "grad_norm": 0.7839062213897705, "learning_rate": 0.0002, "epoch": 4.057649667405765, "step": 1830}, {"loss": 1.4147, "grad_norm": 0.8654165863990784, "learning_rate": 0.0002, "epoch": 4.0798226164079825, "step": 1840}, {"loss": 1.2983, "grad_norm": 0.6872738599777222, "learning_rate": 0.0002, "epoch": 4.101995565410199, "step": 1850}, {"loss": 1.3115, "grad_norm": 0.7529677748680115, "learning_rate": 0.0002, "epoch": 4.124168514412417, "step": 1860}, {"loss": 1.3869, "grad_norm": 0.835027277469635, "learning_rate": 0.0002, "epoch": 4.146341463414634, "step": 1870}, {"loss": 1.3273, "grad_norm": 0.7457721829414368, "learning_rate": 0.0002, "epoch": 4.168514412416852, "step": 1880}, {"loss": 1.2893, "grad_norm": 0.7366040349006653, "learning_rate": 0.0002, "epoch": 4.1906873614190685, "step": 1890}, {"loss": 1.3615, "grad_norm": 0.7802833914756775, "learning_rate": 0.0002, "epoch": 4.212860310421286, "step": 1900}, {"loss": 1.3607, "grad_norm": 0.7526614665985107, "learning_rate": 0.0002, "epoch": 4.235033259423504, "step": 1910}, {"loss": 1.4384, "grad_norm": 0.7531310319900513, "learning_rate": 0.0002, "epoch": 4.25720620842572, "step": 1920}, {"loss": 1.4074, "grad_norm": 0.8899626135826111, "learning_rate": 0.0002, "epoch": 4.279379157427938, "step": 1930}, {"loss": 1.328, "grad_norm": 0.7591356635093689, "learning_rate": 0.0002, "epoch": 4.301552106430155, "step": 1940}, {"loss": 1.4114, "grad_norm": 0.7126884460449219, "learning_rate": 0.0002, "epoch": 4.323725055432373, "step": 1950}, {"loss": 1.4259, "grad_norm": 0.7907777428627014, "learning_rate": 0.0002, "epoch": 4.34589800443459, "step": 1960}, {"loss": 1.3982, "grad_norm": 0.7854869961738586, "learning_rate": 0.0002, "epoch": 4.368070953436807, "step": 1970}, {"loss": 1.4126, "grad_norm": 0.6982123851776123, "learning_rate": 0.0002, "epoch": 4.390243902439025, "step": 1980}, {"loss": 1.3683, "grad_norm": 0.7551925182342529, "learning_rate": 0.0002, "epoch": 4.412416851441241, "step": 1990}, {"loss": 1.4551, "grad_norm": 0.864078164100647, "learning_rate": 0.0002, "epoch": 4.434589800443459, "step": 2000}, {"loss": 1.3982, "grad_norm": 0.8406776189804077, "learning_rate": 0.0002, "epoch": 4.4567627494456765, "step": 2010}, {"loss": 1.3543, "grad_norm": 0.7706766724586487, "learning_rate": 0.0002, "epoch": 4.478935698447893, "step": 2020}, {"loss": 1.386, "grad_norm": 0.7703949213027954, "learning_rate": 0.0002, "epoch": 4.501108647450111, "step": 2030}, {"loss": 1.4059, "grad_norm": 0.8654166460037231, "learning_rate": 0.0002, "epoch": 4.523281596452328, "step": 2040}, {"loss": 1.4067, "grad_norm": 0.7800114750862122, "learning_rate": 0.0002, "epoch": 4.545454545454545, "step": 2050}, {"loss": 1.3578, "grad_norm": 0.7553898692131042, "learning_rate": 0.0002, "epoch": 4.5676274944567625, "step": 2060}, {"loss": 1.3845, "grad_norm": 0.8689188957214355, "learning_rate": 0.0002, "epoch": 4.58980044345898, "step": 2070}, {"loss": 1.3851, "grad_norm": 0.7244092226028442, "learning_rate": 0.0002, "epoch": 4.611973392461198, "step": 2080}, {"loss": 1.3627, "grad_norm": 0.9829743504524231, "learning_rate": 0.0002, "epoch": 4.634146341463414, "step": 2090}, {"loss": 1.4059, "grad_norm": 0.8026102185249329, "learning_rate": 0.0002, "epoch": 4.656319290465632, "step": 2100}, {"loss": 1.3676, "grad_norm": 0.6725143194198608, "learning_rate": 0.0002, "epoch": 4.678492239467849, "step": 2110}, {"loss": 1.4669, "grad_norm": 0.8055245876312256, "learning_rate": 0.0002, "epoch": 4.700665188470067, "step": 2120}, {"loss": 1.4455, "grad_norm": 0.7507025003433228, "learning_rate": 0.0002, "epoch": 4.722838137472284, "step": 2130}, {"loss": 1.3974, "grad_norm": 0.7166216969490051, "learning_rate": 0.0002, "epoch": 4.745011086474501, "step": 2140}, {"loss": 1.33, "grad_norm": 0.6826853156089783, "learning_rate": 0.0002, "epoch": 4.767184035476719, "step": 2150}, {"loss": 1.3907, "grad_norm": 1.1347891092300415, "learning_rate": 0.0002, "epoch": 4.789356984478935, "step": 2160}, {"loss": 1.3737, "grad_norm": 0.8205971121788025, "learning_rate": 0.0002, "epoch": 4.811529933481153, "step": 2170}, {"loss": 1.3886, "grad_norm": 0.7861950397491455, "learning_rate": 0.0002, "epoch": 4.8337028824833705, "step": 2180}, {"loss": 1.4293, "grad_norm": 0.839460551738739, "learning_rate": 0.0002, "epoch": 4.855875831485587, "step": 2190}, {"loss": 1.3881, "grad_norm": 0.746583878993988, "learning_rate": 0.0002, "epoch": 4.878048780487805, "step": 2200}, {"loss": 1.4519, "grad_norm": 0.7805684804916382, "learning_rate": 0.0002, "epoch": 4.900221729490022, "step": 2210}, {"loss": 1.4053, "grad_norm": 0.8079700469970703, "learning_rate": 0.0002, "epoch": 4.922394678492239, "step": 2220}, {"loss": 1.353, "grad_norm": 0.7609502673149109, "learning_rate": 0.0002, "epoch": 4.9445676274944566, "step": 2230}, {"loss": 1.3816, "grad_norm": 0.7862996459007263, "learning_rate": 0.0002, "epoch": 4.966740576496674, "step": 2240}, {"loss": 1.4249, "grad_norm": 0.778677225112915, "learning_rate": 0.0002, "epoch": 4.988913525498892, "step": 2250}, {"eval_loss": 1.9658271074295044, "eval_runtime": 108.3717, "eval_samples_per_second": 4.752, "eval_steps_per_second": 0.6, "epoch": 5.0, "step": 2255}, {"loss": 1.3395, "grad_norm": 0.7520418167114258, "learning_rate": 0.0002, "epoch": 5.011086474501108, "step": 2260}, {"loss": 1.1909, "grad_norm": 1.1831114292144775, "learning_rate": 0.0002, "epoch": 5.033259423503326, "step": 2270}, {"loss": 1.1784, "grad_norm": 0.8718661069869995, "learning_rate": 0.0002, "epoch": 5.0554323725055434, "step": 2280}, {"loss": 1.2208, "grad_norm": 1.0186705589294434, "learning_rate": 0.0002, "epoch": 5.07760532150776, "step": 2290}, {"loss": 1.2259, "grad_norm": 1.0370045900344849, "learning_rate": 0.0002, "epoch": 5.099778270509978, "step": 2300}, {"loss": 1.1485, "grad_norm": 0.9448253512382507, "learning_rate": 0.0002, "epoch": 5.121951219512195, "step": 2310}, {"loss": 1.1764, "grad_norm": 0.988973081111908, "learning_rate": 0.0002, "epoch": 5.144124168514413, "step": 2320}, {"loss": 1.1544, "grad_norm": 0.9368142485618591, "learning_rate": 0.0002, "epoch": 5.1662971175166295, "step": 2330}, {"loss": 1.2416, "grad_norm": 1.0289298295974731, "learning_rate": 0.0002, "epoch": 5.188470066518847, "step": 2340}, {"loss": 1.1982, "grad_norm": 0.9611420035362244, "learning_rate": 0.0002, "epoch": 5.210643015521065, "step": 2350}, {"loss": 1.2046, "grad_norm": 0.8490312099456787, "learning_rate": 0.0002, "epoch": 5.232815964523281, "step": 2360}, {"loss": 1.2504, "grad_norm": 1.0165891647338867, "learning_rate": 0.0002, "epoch": 5.254988913525499, "step": 2370}, {"loss": 1.2749, "grad_norm": 0.9902606010437012, "learning_rate": 0.0002, "epoch": 5.277161862527716, "step": 2380}, {"loss": 1.2186, "grad_norm": 0.987205445766449, "learning_rate": 0.0002, "epoch": 5.299334811529933, "step": 2390}, {"loss": 1.1962, "grad_norm": 0.7931132316589355, "learning_rate": 0.0002, "epoch": 5.321507760532151, "step": 2400}, {"loss": 1.1661, "grad_norm": 1.143110990524292, "learning_rate": 0.0002, "epoch": 5.343680709534368, "step": 2410}, {"loss": 1.191, "grad_norm": 0.9869807362556458, "learning_rate": 0.0002, "epoch": 5.365853658536586, "step": 2420}, {"loss": 1.208, "grad_norm": 0.9835564494132996, "learning_rate": 0.0002, "epoch": 5.388026607538802, "step": 2430}, {"loss": 1.2734, "grad_norm": 0.8321971893310547, "learning_rate": 0.0002, "epoch": 5.41019955654102, "step": 2440}, {"loss": 1.2308, "grad_norm": 0.8379601240158081, "learning_rate": 0.0002, "epoch": 5.4323725055432375, "step": 2450}, {"loss": 1.2464, "grad_norm": 0.9872745871543884, "learning_rate": 0.0002, "epoch": 5.454545454545454, "step": 2460}, {"loss": 1.283, "grad_norm": 0.9455783367156982, "learning_rate": 0.0002, "epoch": 5.476718403547672, "step": 2470}, {"loss": 1.2585, "grad_norm": 0.9594705700874329, "learning_rate": 0.0002, "epoch": 5.498891352549889, "step": 2480}, {"loss": 1.2776, "grad_norm": 1.036603331565857, "learning_rate": 0.0002, "epoch": 5.521064301552107, "step": 2490}, {"loss": 1.2346, "grad_norm": 1.0329008102416992, "learning_rate": 0.0002, "epoch": 5.5432372505543235, "step": 2500}, {"loss": 1.2202, "grad_norm": 0.90513014793396, "learning_rate": 0.0002, "epoch": 5.565410199556541, "step": 2510}, {"loss": 1.2977, "grad_norm": 1.107680320739746, "learning_rate": 0.0002, "epoch": 5.587583148558759, "step": 2520}, {"loss": 1.2117, "grad_norm": 0.8842377662658691, "learning_rate": 0.0002, "epoch": 5.609756097560975, "step": 2530}, {"loss": 1.2448, "grad_norm": 0.9856716990470886, "learning_rate": 0.0002, "epoch": 5.631929046563193, "step": 2540}, {"loss": 1.2579, "grad_norm": 1.0363198518753052, "learning_rate": 0.0002, "epoch": 5.65410199556541, "step": 2550}, {"loss": 1.236, "grad_norm": 0.9366242289543152, "learning_rate": 0.0002, "epoch": 5.676274944567627, "step": 2560}, {"loss": 1.2652, "grad_norm": 0.9180609583854675, "learning_rate": 0.0002, "epoch": 5.698447893569845, "step": 2570}, {"loss": 1.2153, "grad_norm": 0.96494460105896, "learning_rate": 0.0002, "epoch": 5.720620842572062, "step": 2580}, {"loss": 1.2596, "grad_norm": 1.066856861114502, "learning_rate": 0.0002, "epoch": 5.74279379157428, "step": 2590}, {"loss": 1.2437, "grad_norm": 1.0576446056365967, "learning_rate": 0.0002, "epoch": 5.764966740576496, "step": 2600}, {"loss": 1.2449, "grad_norm": 1.0688375234603882, "learning_rate": 0.0002, "epoch": 5.787139689578714, "step": 2610}, {"loss": 1.2094, "grad_norm": 0.9294432401657104, "learning_rate": 0.0002, "epoch": 5.8093126385809315, "step": 2620}, {"loss": 1.3705, "grad_norm": 0.9467836618423462, "learning_rate": 0.0002, "epoch": 5.831485587583149, "step": 2630}, {"loss": 1.334, "grad_norm": 1.1947448253631592, "learning_rate": 0.0002, "epoch": 5.853658536585366, "step": 2640}, {"loss": 1.1952, "grad_norm": 0.9225861430168152, "learning_rate": 0.0002, "epoch": 5.875831485587583, "step": 2650}, {"loss": 1.3356, "grad_norm": 0.9499539136886597, "learning_rate": 0.0002, "epoch": 5.898004434589801, "step": 2660}, {"loss": 1.2898, "grad_norm": 0.9666298031806946, "learning_rate": 0.0002, "epoch": 5.9201773835920175, "step": 2670}, {"loss": 1.1846, "grad_norm": 1.0549718141555786, "learning_rate": 0.0002, "epoch": 5.942350332594235, "step": 2680}, {"loss": 1.2132, "grad_norm": 1.1662505865097046, "learning_rate": 0.0002, "epoch": 5.964523281596453, "step": 2690}, {"loss": 1.2717, "grad_norm": 0.9200838208198547, "learning_rate": 0.0002, "epoch": 5.986696230598669, "step": 2700}]} +{"epoch": 7.0, "step": 3157, "epoch_duration": 1759.271271944046, "total_accumulated_duration": 9497.729947328568, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.684, "grad_norm": 0.6454975008964539, "learning_rate": 0.0002, "epoch": 0.022172949002217297, "step": 10}, {"loss": 2.2694, "grad_norm": 0.5452715158462524, "learning_rate": 0.0002, "epoch": 0.04434589800443459, "step": 20}, {"loss": 2.0796, "grad_norm": 0.5502195358276367, "learning_rate": 0.0002, "epoch": 0.06651884700665188, "step": 30}, {"loss": 1.9132, "grad_norm": 0.48551198840141296, "learning_rate": 0.0002, "epoch": 0.08869179600886919, "step": 40}, {"loss": 2.016, "grad_norm": 0.47822514176368713, "learning_rate": 0.0002, "epoch": 0.11086474501108648, "step": 50}, {"loss": 1.9455, "grad_norm": 0.5125395655632019, "learning_rate": 0.0002, "epoch": 0.13303769401330376, "step": 60}, {"loss": 1.9017, "grad_norm": 0.4600693881511688, "learning_rate": 0.0002, "epoch": 0.15521064301552107, "step": 70}, {"loss": 2.0057, "grad_norm": 0.7180814743041992, "learning_rate": 0.0002, "epoch": 0.17738359201773837, "step": 80}, {"loss": 1.8925, "grad_norm": 0.4712974429130554, "learning_rate": 0.0002, "epoch": 0.19955654101995565, "step": 90}, {"loss": 1.8386, "grad_norm": 0.4673261344432831, "learning_rate": 0.0002, "epoch": 0.22172949002217296, "step": 100}, {"loss": 1.9346, "grad_norm": 0.4129070043563843, "learning_rate": 0.0002, "epoch": 0.24390243902439024, "step": 110}, {"loss": 1.9018, "grad_norm": 0.3859104812145233, "learning_rate": 0.0002, "epoch": 0.2660753880266075, "step": 120}, {"loss": 1.8922, "grad_norm": 0.40966713428497314, "learning_rate": 0.0002, "epoch": 0.28824833702882485, "step": 130}, {"loss": 1.846, "grad_norm": 0.3685867488384247, "learning_rate": 0.0002, "epoch": 0.31042128603104213, "step": 140}, {"loss": 1.9017, "grad_norm": 0.39279988408088684, "learning_rate": 0.0002, "epoch": 0.3325942350332594, "step": 150}, {"loss": 1.8556, "grad_norm": 0.4195398986339569, "learning_rate": 0.0002, "epoch": 0.35476718403547675, "step": 160}, {"loss": 1.7883, "grad_norm": 0.469802588224411, "learning_rate": 0.0002, "epoch": 0.376940133037694, "step": 170}, {"loss": 1.8135, "grad_norm": 0.4069509208202362, "learning_rate": 0.0002, "epoch": 0.3991130820399113, "step": 180}, {"loss": 1.8429, "grad_norm": 0.47832027077674866, "learning_rate": 0.0002, "epoch": 0.4212860310421286, "step": 190}, {"loss": 1.781, "grad_norm": 0.3376411199569702, "learning_rate": 0.0002, "epoch": 0.4434589800443459, "step": 200}, {"loss": 1.8562, "grad_norm": 0.3787185847759247, "learning_rate": 0.0002, "epoch": 0.4656319290465632, "step": 210}, {"loss": 1.8352, "grad_norm": 0.40322697162628174, "learning_rate": 0.0002, "epoch": 0.4878048780487805, "step": 220}, {"loss": 1.8007, "grad_norm": 0.3710436522960663, "learning_rate": 0.0002, "epoch": 0.5099778270509978, "step": 230}, {"loss": 1.8528, "grad_norm": 0.3723200261592865, "learning_rate": 0.0002, "epoch": 0.532150776053215, "step": 240}, {"loss": 1.852, "grad_norm": 0.3457179069519043, "learning_rate": 0.0002, "epoch": 0.5543237250554324, "step": 250}, {"loss": 1.8175, "grad_norm": 0.35369473695755005, "learning_rate": 0.0002, "epoch": 0.5764966740576497, "step": 260}, {"loss": 1.7742, "grad_norm": 0.3667483329772949, "learning_rate": 0.0002, "epoch": 0.5986696230598669, "step": 270}, {"loss": 1.8152, "grad_norm": 0.4023273289203644, "learning_rate": 0.0002, "epoch": 0.6208425720620843, "step": 280}, {"loss": 1.8382, "grad_norm": 0.3601929843425751, "learning_rate": 0.0002, "epoch": 0.6430155210643016, "step": 290}, {"loss": 1.8152, "grad_norm": 0.32610392570495605, "learning_rate": 0.0002, "epoch": 0.6651884700665188, "step": 300}, {"loss": 1.8412, "grad_norm": 0.40528756380081177, "learning_rate": 0.0002, "epoch": 0.6873614190687362, "step": 310}, {"loss": 1.8528, "grad_norm": 0.34639739990234375, "learning_rate": 0.0002, "epoch": 0.7095343680709535, "step": 320}, {"loss": 1.807, "grad_norm": 0.3794991374015808, "learning_rate": 0.0002, "epoch": 0.7317073170731707, "step": 330}, {"loss": 1.8678, "grad_norm": 0.34203875064849854, "learning_rate": 0.0002, "epoch": 0.753880266075388, "step": 340}, {"loss": 1.7605, "grad_norm": 0.36692821979522705, "learning_rate": 0.0002, "epoch": 0.7760532150776053, "step": 350}, {"loss": 1.7535, "grad_norm": 0.3701125979423523, "learning_rate": 0.0002, "epoch": 0.7982261640798226, "step": 360}, {"loss": 1.7574, "grad_norm": 0.3971416652202606, "learning_rate": 0.0002, "epoch": 0.8203991130820399, "step": 370}, {"loss": 1.8476, "grad_norm": 0.3751989006996155, "learning_rate": 0.0002, "epoch": 0.8425720620842572, "step": 380}, {"loss": 1.8281, "grad_norm": 0.35116496682167053, "learning_rate": 0.0002, "epoch": 0.8647450110864745, "step": 390}, {"loss": 1.7459, "grad_norm": 0.3672674894332886, "learning_rate": 0.0002, "epoch": 0.8869179600886918, "step": 400}, {"loss": 1.8293, "grad_norm": 0.34648260474205017, "learning_rate": 0.0002, "epoch": 0.9090909090909091, "step": 410}, {"loss": 1.7601, "grad_norm": 0.4497389793395996, "learning_rate": 0.0002, "epoch": 0.9312638580931264, "step": 420}, {"loss": 1.8405, "grad_norm": 0.33595147728919983, "learning_rate": 0.0002, "epoch": 0.9534368070953437, "step": 430}, {"loss": 1.8137, "grad_norm": 0.3130456805229187, "learning_rate": 0.0002, "epoch": 0.975609756097561, "step": 440}, {"loss": 1.8226, "grad_norm": 0.36480239033699036, "learning_rate": 0.0002, "epoch": 0.9977827050997783, "step": 450}, {"eval_loss": 1.8323718309402466, "eval_runtime": 79.9603, "eval_samples_per_second": 6.441, "eval_steps_per_second": 0.813, "epoch": 1.0, "step": 451}, {"loss": 1.7921, "grad_norm": 0.3840029835700989, "learning_rate": 0.0002, "epoch": 1.0199556541019956, "step": 460}, {"loss": 1.7057, "grad_norm": 0.33457425236701965, "learning_rate": 0.0002, "epoch": 1.042128603104213, "step": 470}, {"loss": 1.7028, "grad_norm": 0.35766592621803284, "learning_rate": 0.0002, "epoch": 1.06430155210643, "step": 480}, {"loss": 1.7868, "grad_norm": 0.38070937991142273, "learning_rate": 0.0002, "epoch": 1.0864745011086474, "step": 490}, {"loss": 1.72, "grad_norm": 0.38546547293663025, "learning_rate": 0.0002, "epoch": 1.1086474501108647, "step": 500}, {"loss": 1.7088, "grad_norm": 0.384104460477829, "learning_rate": 0.0002, "epoch": 1.130820399113082, "step": 510}, {"loss": 1.7779, "grad_norm": 0.3556116819381714, "learning_rate": 0.0002, "epoch": 1.1529933481152994, "step": 520}, {"loss": 1.7538, "grad_norm": 0.4110541343688965, "learning_rate": 0.0002, "epoch": 1.1751662971175167, "step": 530}, {"loss": 1.7608, "grad_norm": 0.46503177285194397, "learning_rate": 0.0002, "epoch": 1.1973392461197339, "step": 540}, {"loss": 1.729, "grad_norm": 0.4366816580295563, "learning_rate": 0.0002, "epoch": 1.2195121951219512, "step": 550}, {"loss": 1.7954, "grad_norm": 0.379986047744751, "learning_rate": 0.0002, "epoch": 1.2416851441241685, "step": 560}, {"loss": 1.6852, "grad_norm": 0.3920869529247284, "learning_rate": 0.0002, "epoch": 1.2638580931263859, "step": 570}, {"loss": 1.8265, "grad_norm": 0.4013986587524414, "learning_rate": 0.0002, "epoch": 1.2860310421286032, "step": 580}, {"loss": 1.7294, "grad_norm": 0.39104390144348145, "learning_rate": 0.0002, "epoch": 1.3082039911308203, "step": 590}, {"loss": 1.7822, "grad_norm": 0.40515613555908203, "learning_rate": 0.0002, "epoch": 1.3303769401330376, "step": 600}, {"loss": 1.7614, "grad_norm": 0.4212331473827362, "learning_rate": 0.0002, "epoch": 1.352549889135255, "step": 610}, {"loss": 1.7883, "grad_norm": 0.36040815711021423, "learning_rate": 0.0002, "epoch": 1.3747228381374723, "step": 620}, {"loss": 1.7467, "grad_norm": 0.3950865864753723, "learning_rate": 0.0002, "epoch": 1.3968957871396896, "step": 630}, {"loss": 1.7242, "grad_norm": 0.3934709131717682, "learning_rate": 0.0002, "epoch": 1.4190687361419068, "step": 640}, {"loss": 1.7783, "grad_norm": 0.3905350863933563, "learning_rate": 0.0002, "epoch": 1.441241685144124, "step": 650}, {"loss": 1.7612, "grad_norm": 0.4322686493396759, "learning_rate": 0.0002, "epoch": 1.4634146341463414, "step": 660}, {"loss": 1.7357, "grad_norm": 0.35697034001350403, "learning_rate": 0.0002, "epoch": 1.4855875831485588, "step": 670}, {"loss": 1.6764, "grad_norm": 0.38570451736450195, "learning_rate": 0.0002, "epoch": 1.507760532150776, "step": 680}, {"loss": 1.7054, "grad_norm": 0.3804517090320587, "learning_rate": 0.0002, "epoch": 1.5299334811529932, "step": 690}, {"loss": 1.7725, "grad_norm": 0.4938165247440338, "learning_rate": 0.0002, "epoch": 1.5521064301552108, "step": 700}, {"loss": 1.7982, "grad_norm": 0.43075236678123474, "learning_rate": 0.0002, "epoch": 1.5742793791574279, "step": 710}, {"loss": 1.7741, "grad_norm": 0.40434643626213074, "learning_rate": 0.0002, "epoch": 1.5964523281596452, "step": 720}, {"loss": 1.7122, "grad_norm": 0.3874157667160034, "learning_rate": 0.0002, "epoch": 1.6186252771618626, "step": 730}, {"loss": 1.8141, "grad_norm": 0.3645969331264496, "learning_rate": 0.0002, "epoch": 1.6407982261640797, "step": 740}, {"loss": 1.7868, "grad_norm": 0.38588255643844604, "learning_rate": 0.0002, "epoch": 1.6629711751662972, "step": 750}, {"loss": 1.7847, "grad_norm": 0.39252519607543945, "learning_rate": 0.0002, "epoch": 1.6851441241685143, "step": 760}, {"loss": 1.7424, "grad_norm": 0.36165162920951843, "learning_rate": 0.0002, "epoch": 1.7073170731707317, "step": 770}, {"loss": 1.7172, "grad_norm": 0.36677947640419006, "learning_rate": 0.0002, "epoch": 1.729490022172949, "step": 780}, {"loss": 1.7737, "grad_norm": 0.374881774187088, "learning_rate": 0.0002, "epoch": 1.7516629711751663, "step": 790}, {"loss": 1.7331, "grad_norm": 0.4530802369117737, "learning_rate": 0.0002, "epoch": 1.7738359201773837, "step": 800}, {"loss": 1.7799, "grad_norm": 0.3879568576812744, "learning_rate": 0.0002, "epoch": 1.7960088691796008, "step": 810}, {"loss": 1.7562, "grad_norm": 0.3710079789161682, "learning_rate": 0.0002, "epoch": 1.8181818181818183, "step": 820}, {"loss": 1.7728, "grad_norm": 0.3831799030303955, "learning_rate": 0.0002, "epoch": 1.8403547671840355, "step": 830}, {"loss": 1.7605, "grad_norm": 0.3958432376384735, "learning_rate": 0.0002, "epoch": 1.8625277161862528, "step": 840}, {"loss": 1.8187, "grad_norm": 0.4129294157028198, "learning_rate": 0.0002, "epoch": 1.8847006651884701, "step": 850}, {"loss": 1.6506, "grad_norm": 0.3714745044708252, "learning_rate": 0.0002, "epoch": 1.9068736141906872, "step": 860}, {"loss": 1.707, "grad_norm": 0.40176868438720703, "learning_rate": 0.0002, "epoch": 1.9290465631929048, "step": 870}, {"loss": 1.7557, "grad_norm": 0.36937767267227173, "learning_rate": 0.0002, "epoch": 1.951219512195122, "step": 880}, {"loss": 1.7353, "grad_norm": 0.40242597460746765, "learning_rate": 0.0002, "epoch": 1.9733924611973392, "step": 890}, {"loss": 1.7402, "grad_norm": 0.3515510559082031, "learning_rate": 0.0002, "epoch": 1.9955654101995566, "step": 900}, {"eval_loss": 1.8243104219436646, "eval_runtime": 107.8856, "eval_samples_per_second": 4.774, "eval_steps_per_second": 0.602, "epoch": 2.0, "step": 902}, {"loss": 1.6631, "grad_norm": 0.4145216643810272, "learning_rate": 0.0002, "epoch": 2.0177383592017737, "step": 910}, {"loss": 1.7227, "grad_norm": 0.42423519492149353, "learning_rate": 0.0002, "epoch": 2.0399113082039912, "step": 920}, {"loss": 1.6357, "grad_norm": 0.4773229956626892, "learning_rate": 0.0002, "epoch": 2.0620842572062084, "step": 930}, {"loss": 1.671, "grad_norm": 0.4144791066646576, "learning_rate": 0.0002, "epoch": 2.084257206208426, "step": 940}, {"loss": 1.6433, "grad_norm": 0.42704132199287415, "learning_rate": 0.0002, "epoch": 2.106430155210643, "step": 950}, {"loss": 1.6767, "grad_norm": 0.4479042589664459, "learning_rate": 0.0002, "epoch": 2.12860310421286, "step": 960}, {"loss": 1.6122, "grad_norm": 0.4810638129711151, "learning_rate": 0.0002, "epoch": 2.1507760532150777, "step": 970}, {"loss": 1.6613, "grad_norm": 0.48669910430908203, "learning_rate": 0.0002, "epoch": 2.172949002217295, "step": 980}, {"loss": 1.6274, "grad_norm": 0.4252761900424957, "learning_rate": 0.0002, "epoch": 2.1951219512195124, "step": 990}, {"loss": 1.6514, "grad_norm": 0.42342790961265564, "learning_rate": 0.0002, "epoch": 2.2172949002217295, "step": 1000}, {"loss": 1.637, "grad_norm": 0.43432456254959106, "learning_rate": 0.0002, "epoch": 2.2394678492239466, "step": 1010}, {"loss": 1.6856, "grad_norm": 0.45556965470314026, "learning_rate": 0.0002, "epoch": 2.261640798226164, "step": 1020}, {"loss": 1.6554, "grad_norm": 0.48035719990730286, "learning_rate": 0.0002, "epoch": 2.2838137472283813, "step": 1030}, {"loss": 1.6947, "grad_norm": 0.4233241081237793, "learning_rate": 0.0002, "epoch": 2.305986696230599, "step": 1040}, {"loss": 1.6501, "grad_norm": 0.3918434679508209, "learning_rate": 0.0002, "epoch": 2.328159645232816, "step": 1050}, {"loss": 1.6903, "grad_norm": 0.44049757719039917, "learning_rate": 0.0002, "epoch": 2.3503325942350335, "step": 1060}, {"loss": 1.6337, "grad_norm": 0.4730056822299957, "learning_rate": 0.0002, "epoch": 2.3725055432372506, "step": 1070}, {"loss": 1.6104, "grad_norm": 0.4354589581489563, "learning_rate": 0.0002, "epoch": 2.3946784922394677, "step": 1080}, {"loss": 1.6496, "grad_norm": 0.4837590456008911, "learning_rate": 0.0002, "epoch": 2.4168514412416853, "step": 1090}, {"loss": 1.6672, "grad_norm": 0.4842571020126343, "learning_rate": 0.0002, "epoch": 2.4390243902439024, "step": 1100}, {"loss": 1.6091, "grad_norm": 0.46398279070854187, "learning_rate": 0.0002, "epoch": 2.4611973392461195, "step": 1110}, {"loss": 1.6393, "grad_norm": 0.4587327539920807, "learning_rate": 0.0002, "epoch": 2.483370288248337, "step": 1120}, {"loss": 1.621, "grad_norm": 0.4336528480052948, "learning_rate": 0.0002, "epoch": 2.505543237250554, "step": 1130}, {"loss": 1.6199, "grad_norm": 0.6162153482437134, "learning_rate": 0.0002, "epoch": 2.5277161862527717, "step": 1140}, {"loss": 1.7244, "grad_norm": 0.48175573348999023, "learning_rate": 0.0002, "epoch": 2.549889135254989, "step": 1150}, {"loss": 1.6098, "grad_norm": 0.448272705078125, "learning_rate": 0.0002, "epoch": 2.5720620842572064, "step": 1160}, {"loss": 1.6987, "grad_norm": 0.5189200639724731, "learning_rate": 0.0002, "epoch": 2.5942350332594235, "step": 1170}, {"loss": 1.6503, "grad_norm": 0.45032963156700134, "learning_rate": 0.0002, "epoch": 2.6164079822616406, "step": 1180}, {"loss": 1.6508, "grad_norm": 0.4417729377746582, "learning_rate": 0.0002, "epoch": 2.638580931263858, "step": 1190}, {"loss": 1.6084, "grad_norm": 0.5219636559486389, "learning_rate": 0.0002, "epoch": 2.6607538802660753, "step": 1200}, {"loss": 1.6121, "grad_norm": 0.47702011466026306, "learning_rate": 0.0002, "epoch": 2.682926829268293, "step": 1210}, {"loss": 1.6942, "grad_norm": 0.4328458607196808, "learning_rate": 0.0002, "epoch": 2.70509977827051, "step": 1220}, {"loss": 1.6099, "grad_norm": 0.46762076020240784, "learning_rate": 0.0002, "epoch": 2.7272727272727275, "step": 1230}, {"loss": 1.7287, "grad_norm": 0.4592697322368622, "learning_rate": 0.0002, "epoch": 2.7494456762749446, "step": 1240}, {"loss": 1.617, "grad_norm": 0.5519265532493591, "learning_rate": 0.0002, "epoch": 2.7716186252771617, "step": 1250}, {"loss": 1.6868, "grad_norm": 0.47169506549835205, "learning_rate": 0.0002, "epoch": 2.7937915742793793, "step": 1260}, {"loss": 1.658, "grad_norm": 0.47231653332710266, "learning_rate": 0.0002, "epoch": 2.8159645232815964, "step": 1270}, {"loss": 1.6738, "grad_norm": 0.49081969261169434, "learning_rate": 0.0002, "epoch": 2.8381374722838135, "step": 1280}, {"loss": 1.7248, "grad_norm": 0.4483231008052826, "learning_rate": 0.0002, "epoch": 2.860310421286031, "step": 1290}, {"loss": 1.6428, "grad_norm": 0.5310035943984985, "learning_rate": 0.0002, "epoch": 2.882483370288248, "step": 1300}, {"loss": 1.6515, "grad_norm": 0.4419795572757721, "learning_rate": 0.0002, "epoch": 2.9046563192904657, "step": 1310}, {"loss": 1.6956, "grad_norm": 0.44630762934684753, "learning_rate": 0.0002, "epoch": 2.926829268292683, "step": 1320}, {"loss": 1.6464, "grad_norm": 0.39774850010871887, "learning_rate": 0.0002, "epoch": 2.9490022172949004, "step": 1330}, {"loss": 1.7007, "grad_norm": 0.441727876663208, "learning_rate": 0.0002, "epoch": 2.9711751662971175, "step": 1340}, {"loss": 1.5968, "grad_norm": 0.43773892521858215, "learning_rate": 0.0002, "epoch": 2.9933481152993346, "step": 1350}, {"eval_loss": 1.8514760732650757, "eval_runtime": 131.9812, "eval_samples_per_second": 3.902, "eval_steps_per_second": 0.492, "epoch": 3.0, "step": 1353}, {"loss": 1.553, "grad_norm": 0.5274150371551514, "learning_rate": 0.0002, "epoch": 3.015521064301552, "step": 1360}, {"loss": 1.4784, "grad_norm": 0.5724489092826843, "learning_rate": 0.0002, "epoch": 3.0376940133037693, "step": 1370}, {"loss": 1.5365, "grad_norm": 0.6182316541671753, "learning_rate": 0.0002, "epoch": 3.059866962305987, "step": 1380}, {"loss": 1.4824, "grad_norm": 0.5709688067436218, "learning_rate": 0.0002, "epoch": 3.082039911308204, "step": 1390}, {"loss": 1.534, "grad_norm": 0.6368464231491089, "learning_rate": 0.0002, "epoch": 3.104212860310421, "step": 1400}, {"loss": 1.5191, "grad_norm": 0.5680432319641113, "learning_rate": 0.0002, "epoch": 3.1263858093126387, "step": 1410}, {"loss": 1.5258, "grad_norm": 0.5805315375328064, "learning_rate": 0.0002, "epoch": 3.1485587583148558, "step": 1420}, {"loss": 1.612, "grad_norm": 0.5782836675643921, "learning_rate": 0.0002, "epoch": 3.1707317073170733, "step": 1430}, {"loss": 1.4852, "grad_norm": 0.627159595489502, "learning_rate": 0.0002, "epoch": 3.1929046563192904, "step": 1440}, {"loss": 1.5398, "grad_norm": 0.6136751174926758, "learning_rate": 0.0002, "epoch": 3.2150776053215075, "step": 1450}, {"loss": 1.5254, "grad_norm": 0.6319093108177185, "learning_rate": 0.0002, "epoch": 3.237250554323725, "step": 1460}, {"loss": 1.5789, "grad_norm": 0.7641780972480774, "learning_rate": 0.0002, "epoch": 3.259423503325942, "step": 1470}, {"loss": 1.5514, "grad_norm": 0.6116001605987549, "learning_rate": 0.0002, "epoch": 3.2815964523281598, "step": 1480}, {"loss": 1.4647, "grad_norm": 0.6024722456932068, "learning_rate": 0.0002, "epoch": 3.303769401330377, "step": 1490}, {"loss": 1.5561, "grad_norm": 0.5941570997238159, "learning_rate": 0.0002, "epoch": 3.3259423503325944, "step": 1500}, {"loss": 1.5104, "grad_norm": 0.608369767665863, "learning_rate": 0.0002, "epoch": 3.3481152993348116, "step": 1510}, {"loss": 1.5494, "grad_norm": 0.5942065715789795, "learning_rate": 0.0002, "epoch": 3.3702882483370287, "step": 1520}, {"loss": 1.5426, "grad_norm": 0.6382330656051636, "learning_rate": 0.0002, "epoch": 3.3924611973392462, "step": 1530}, {"loss": 1.5479, "grad_norm": 0.5839648842811584, "learning_rate": 0.0002, "epoch": 3.4146341463414633, "step": 1540}, {"loss": 1.5241, "grad_norm": 0.5627358555793762, "learning_rate": 0.0002, "epoch": 3.436807095343681, "step": 1550}, {"loss": 1.5679, "grad_norm": 0.6342151761054993, "learning_rate": 0.0002, "epoch": 3.458980044345898, "step": 1560}, {"loss": 1.5005, "grad_norm": 0.6370542645454407, "learning_rate": 0.0002, "epoch": 3.481152993348115, "step": 1570}, {"loss": 1.541, "grad_norm": 0.5974680185317993, "learning_rate": 0.0002, "epoch": 3.5033259423503327, "step": 1580}, {"loss": 1.553, "grad_norm": 0.6197021007537842, "learning_rate": 0.0002, "epoch": 3.52549889135255, "step": 1590}, {"loss": 1.5287, "grad_norm": 0.6413024067878723, "learning_rate": 0.0002, "epoch": 3.5476718403547673, "step": 1600}, {"loss": 1.5301, "grad_norm": 0.5878410339355469, "learning_rate": 0.0002, "epoch": 3.5698447893569845, "step": 1610}, {"loss": 1.4625, "grad_norm": 0.6485083103179932, "learning_rate": 0.0002, "epoch": 3.5920177383592016, "step": 1620}, {"loss": 1.5373, "grad_norm": 0.5826634764671326, "learning_rate": 0.0002, "epoch": 3.614190687361419, "step": 1630}, {"loss": 1.4952, "grad_norm": 0.8906663656234741, "learning_rate": 0.0002, "epoch": 3.6363636363636362, "step": 1640}, {"loss": 1.5208, "grad_norm": 0.6288479566574097, "learning_rate": 0.0002, "epoch": 3.658536585365854, "step": 1650}, {"loss": 1.6086, "grad_norm": 0.6191049218177795, "learning_rate": 0.0002, "epoch": 3.680709534368071, "step": 1660}, {"loss": 1.5043, "grad_norm": 0.5997978448867798, "learning_rate": 0.0002, "epoch": 3.7028824833702885, "step": 1670}, {"loss": 1.5654, "grad_norm": 0.6003038287162781, "learning_rate": 0.0002, "epoch": 3.7250554323725056, "step": 1680}, {"loss": 1.4941, "grad_norm": 0.5417194962501526, "learning_rate": 0.0002, "epoch": 3.7472283813747227, "step": 1690}, {"loss": 1.5541, "grad_norm": 0.6367442607879639, "learning_rate": 0.0002, "epoch": 3.7694013303769403, "step": 1700}, {"loss": 1.5483, "grad_norm": 0.6613120436668396, "learning_rate": 0.0002, "epoch": 3.7915742793791574, "step": 1710}, {"loss": 1.5999, "grad_norm": 0.6506749391555786, "learning_rate": 0.0002, "epoch": 3.8137472283813745, "step": 1720}, {"loss": 1.5207, "grad_norm": 0.5478500723838806, "learning_rate": 0.0002, "epoch": 3.835920177383592, "step": 1730}, {"loss": 1.5619, "grad_norm": 0.7313215732574463, "learning_rate": 0.0002, "epoch": 3.858093126385809, "step": 1740}, {"loss": 1.4486, "grad_norm": 0.5453857183456421, "learning_rate": 0.0002, "epoch": 3.8802660753880267, "step": 1750}, {"loss": 1.4857, "grad_norm": 0.5983547568321228, "learning_rate": 0.0002, "epoch": 3.902439024390244, "step": 1760}, {"loss": 1.651, "grad_norm": 0.6471580266952515, "learning_rate": 0.0002, "epoch": 3.9246119733924614, "step": 1770}, {"loss": 1.461, "grad_norm": 0.5833685398101807, "learning_rate": 0.0002, "epoch": 3.9467849223946785, "step": 1780}, {"loss": 1.5014, "grad_norm": 0.5509327054023743, "learning_rate": 0.0002, "epoch": 3.9689578713968956, "step": 1790}, {"loss": 1.6225, "grad_norm": 0.6021352410316467, "learning_rate": 0.0002, "epoch": 3.991130820399113, "step": 1800}, {"eval_loss": 1.901047945022583, "eval_runtime": 82.2708, "eval_samples_per_second": 6.26, "eval_steps_per_second": 0.79, "epoch": 4.0, "step": 1804}, {"loss": 1.422, "grad_norm": 0.6232016682624817, "learning_rate": 0.0002, "epoch": 4.013303769401331, "step": 1810}, {"loss": 1.3769, "grad_norm": 0.7521207928657532, "learning_rate": 0.0002, "epoch": 4.035476718403547, "step": 1820}, {"loss": 1.4481, "grad_norm": 0.7839062213897705, "learning_rate": 0.0002, "epoch": 4.057649667405765, "step": 1830}, {"loss": 1.4147, "grad_norm": 0.8654165863990784, "learning_rate": 0.0002, "epoch": 4.0798226164079825, "step": 1840}, {"loss": 1.2983, "grad_norm": 0.6872738599777222, "learning_rate": 0.0002, "epoch": 4.101995565410199, "step": 1850}, {"loss": 1.3115, "grad_norm": 0.7529677748680115, "learning_rate": 0.0002, "epoch": 4.124168514412417, "step": 1860}, {"loss": 1.3869, "grad_norm": 0.835027277469635, "learning_rate": 0.0002, "epoch": 4.146341463414634, "step": 1870}, {"loss": 1.3273, "grad_norm": 0.7457721829414368, "learning_rate": 0.0002, "epoch": 4.168514412416852, "step": 1880}, {"loss": 1.2893, "grad_norm": 0.7366040349006653, "learning_rate": 0.0002, "epoch": 4.1906873614190685, "step": 1890}, {"loss": 1.3615, "grad_norm": 0.7802833914756775, "learning_rate": 0.0002, "epoch": 4.212860310421286, "step": 1900}, {"loss": 1.3607, "grad_norm": 0.7526614665985107, "learning_rate": 0.0002, "epoch": 4.235033259423504, "step": 1910}, {"loss": 1.4384, "grad_norm": 0.7531310319900513, "learning_rate": 0.0002, "epoch": 4.25720620842572, "step": 1920}, {"loss": 1.4074, "grad_norm": 0.8899626135826111, "learning_rate": 0.0002, "epoch": 4.279379157427938, "step": 1930}, {"loss": 1.328, "grad_norm": 0.7591356635093689, "learning_rate": 0.0002, "epoch": 4.301552106430155, "step": 1940}, {"loss": 1.4114, "grad_norm": 0.7126884460449219, "learning_rate": 0.0002, "epoch": 4.323725055432373, "step": 1950}, {"loss": 1.4259, "grad_norm": 0.7907777428627014, "learning_rate": 0.0002, "epoch": 4.34589800443459, "step": 1960}, {"loss": 1.3982, "grad_norm": 0.7854869961738586, "learning_rate": 0.0002, "epoch": 4.368070953436807, "step": 1970}, {"loss": 1.4126, "grad_norm": 0.6982123851776123, "learning_rate": 0.0002, "epoch": 4.390243902439025, "step": 1980}, {"loss": 1.3683, "grad_norm": 0.7551925182342529, "learning_rate": 0.0002, "epoch": 4.412416851441241, "step": 1990}, {"loss": 1.4551, "grad_norm": 0.864078164100647, "learning_rate": 0.0002, "epoch": 4.434589800443459, "step": 2000}, {"loss": 1.3982, "grad_norm": 0.8406776189804077, "learning_rate": 0.0002, "epoch": 4.4567627494456765, "step": 2010}, {"loss": 1.3543, "grad_norm": 0.7706766724586487, "learning_rate": 0.0002, "epoch": 4.478935698447893, "step": 2020}, {"loss": 1.386, "grad_norm": 0.7703949213027954, "learning_rate": 0.0002, "epoch": 4.501108647450111, "step": 2030}, {"loss": 1.4059, "grad_norm": 0.8654166460037231, "learning_rate": 0.0002, "epoch": 4.523281596452328, "step": 2040}, {"loss": 1.4067, "grad_norm": 0.7800114750862122, "learning_rate": 0.0002, "epoch": 4.545454545454545, "step": 2050}, {"loss": 1.3578, "grad_norm": 0.7553898692131042, "learning_rate": 0.0002, "epoch": 4.5676274944567625, "step": 2060}, {"loss": 1.3845, "grad_norm": 0.8689188957214355, "learning_rate": 0.0002, "epoch": 4.58980044345898, "step": 2070}, {"loss": 1.3851, "grad_norm": 0.7244092226028442, "learning_rate": 0.0002, "epoch": 4.611973392461198, "step": 2080}, {"loss": 1.3627, "grad_norm": 0.9829743504524231, "learning_rate": 0.0002, "epoch": 4.634146341463414, "step": 2090}, {"loss": 1.4059, "grad_norm": 0.8026102185249329, "learning_rate": 0.0002, "epoch": 4.656319290465632, "step": 2100}, {"loss": 1.3676, "grad_norm": 0.6725143194198608, "learning_rate": 0.0002, "epoch": 4.678492239467849, "step": 2110}, {"loss": 1.4669, "grad_norm": 0.8055245876312256, "learning_rate": 0.0002, "epoch": 4.700665188470067, "step": 2120}, {"loss": 1.4455, "grad_norm": 0.7507025003433228, "learning_rate": 0.0002, "epoch": 4.722838137472284, "step": 2130}, {"loss": 1.3974, "grad_norm": 0.7166216969490051, "learning_rate": 0.0002, "epoch": 4.745011086474501, "step": 2140}, {"loss": 1.33, "grad_norm": 0.6826853156089783, "learning_rate": 0.0002, "epoch": 4.767184035476719, "step": 2150}, {"loss": 1.3907, "grad_norm": 1.1347891092300415, "learning_rate": 0.0002, "epoch": 4.789356984478935, "step": 2160}, {"loss": 1.3737, "grad_norm": 0.8205971121788025, "learning_rate": 0.0002, "epoch": 4.811529933481153, "step": 2170}, {"loss": 1.3886, "grad_norm": 0.7861950397491455, "learning_rate": 0.0002, "epoch": 4.8337028824833705, "step": 2180}, {"loss": 1.4293, "grad_norm": 0.839460551738739, "learning_rate": 0.0002, "epoch": 4.855875831485587, "step": 2190}, {"loss": 1.3881, "grad_norm": 0.746583878993988, "learning_rate": 0.0002, "epoch": 4.878048780487805, "step": 2200}, {"loss": 1.4519, "grad_norm": 0.7805684804916382, "learning_rate": 0.0002, "epoch": 4.900221729490022, "step": 2210}, {"loss": 1.4053, "grad_norm": 0.8079700469970703, "learning_rate": 0.0002, "epoch": 4.922394678492239, "step": 2220}, {"loss": 1.353, "grad_norm": 0.7609502673149109, "learning_rate": 0.0002, "epoch": 4.9445676274944566, "step": 2230}, {"loss": 1.3816, "grad_norm": 0.7862996459007263, "learning_rate": 0.0002, "epoch": 4.966740576496674, "step": 2240}, {"loss": 1.4249, "grad_norm": 0.778677225112915, "learning_rate": 0.0002, "epoch": 4.988913525498892, "step": 2250}, {"eval_loss": 1.9658271074295044, "eval_runtime": 108.3717, "eval_samples_per_second": 4.752, "eval_steps_per_second": 0.6, "epoch": 5.0, "step": 2255}, {"loss": 1.3395, "grad_norm": 0.7520418167114258, "learning_rate": 0.0002, "epoch": 5.011086474501108, "step": 2260}, {"loss": 1.1909, "grad_norm": 1.1831114292144775, "learning_rate": 0.0002, "epoch": 5.033259423503326, "step": 2270}, {"loss": 1.1784, "grad_norm": 0.8718661069869995, "learning_rate": 0.0002, "epoch": 5.0554323725055434, "step": 2280}, {"loss": 1.2208, "grad_norm": 1.0186705589294434, "learning_rate": 0.0002, "epoch": 5.07760532150776, "step": 2290}, {"loss": 1.2259, "grad_norm": 1.0370045900344849, "learning_rate": 0.0002, "epoch": 5.099778270509978, "step": 2300}, {"loss": 1.1485, "grad_norm": 0.9448253512382507, "learning_rate": 0.0002, "epoch": 5.121951219512195, "step": 2310}, {"loss": 1.1764, "grad_norm": 0.988973081111908, "learning_rate": 0.0002, "epoch": 5.144124168514413, "step": 2320}, {"loss": 1.1544, "grad_norm": 0.9368142485618591, "learning_rate": 0.0002, "epoch": 5.1662971175166295, "step": 2330}, {"loss": 1.2416, "grad_norm": 1.0289298295974731, "learning_rate": 0.0002, "epoch": 5.188470066518847, "step": 2340}, {"loss": 1.1982, "grad_norm": 0.9611420035362244, "learning_rate": 0.0002, "epoch": 5.210643015521065, "step": 2350}, {"loss": 1.2046, "grad_norm": 0.8490312099456787, "learning_rate": 0.0002, "epoch": 5.232815964523281, "step": 2360}, {"loss": 1.2504, "grad_norm": 1.0165891647338867, "learning_rate": 0.0002, "epoch": 5.254988913525499, "step": 2370}, {"loss": 1.2749, "grad_norm": 0.9902606010437012, "learning_rate": 0.0002, "epoch": 5.277161862527716, "step": 2380}, {"loss": 1.2186, "grad_norm": 0.987205445766449, "learning_rate": 0.0002, "epoch": 5.299334811529933, "step": 2390}, {"loss": 1.1962, "grad_norm": 0.7931132316589355, "learning_rate": 0.0002, "epoch": 5.321507760532151, "step": 2400}, {"loss": 1.1661, "grad_norm": 1.143110990524292, "learning_rate": 0.0002, "epoch": 5.343680709534368, "step": 2410}, {"loss": 1.191, "grad_norm": 0.9869807362556458, "learning_rate": 0.0002, "epoch": 5.365853658536586, "step": 2420}, {"loss": 1.208, "grad_norm": 0.9835564494132996, "learning_rate": 0.0002, "epoch": 5.388026607538802, "step": 2430}, {"loss": 1.2734, "grad_norm": 0.8321971893310547, "learning_rate": 0.0002, "epoch": 5.41019955654102, "step": 2440}, {"loss": 1.2308, "grad_norm": 0.8379601240158081, "learning_rate": 0.0002, "epoch": 5.4323725055432375, "step": 2450}, {"loss": 1.2464, "grad_norm": 0.9872745871543884, "learning_rate": 0.0002, "epoch": 5.454545454545454, "step": 2460}, {"loss": 1.283, "grad_norm": 0.9455783367156982, "learning_rate": 0.0002, "epoch": 5.476718403547672, "step": 2470}, {"loss": 1.2585, "grad_norm": 0.9594705700874329, "learning_rate": 0.0002, "epoch": 5.498891352549889, "step": 2480}, {"loss": 1.2776, "grad_norm": 1.036603331565857, "learning_rate": 0.0002, "epoch": 5.521064301552107, "step": 2490}, {"loss": 1.2346, "grad_norm": 1.0329008102416992, "learning_rate": 0.0002, "epoch": 5.5432372505543235, "step": 2500}, {"loss": 1.2202, "grad_norm": 0.90513014793396, "learning_rate": 0.0002, "epoch": 5.565410199556541, "step": 2510}, {"loss": 1.2977, "grad_norm": 1.107680320739746, "learning_rate": 0.0002, "epoch": 5.587583148558759, "step": 2520}, {"loss": 1.2117, "grad_norm": 0.8842377662658691, "learning_rate": 0.0002, "epoch": 5.609756097560975, "step": 2530}, {"loss": 1.2448, "grad_norm": 0.9856716990470886, "learning_rate": 0.0002, "epoch": 5.631929046563193, "step": 2540}, {"loss": 1.2579, "grad_norm": 1.0363198518753052, "learning_rate": 0.0002, "epoch": 5.65410199556541, "step": 2550}, {"loss": 1.236, "grad_norm": 0.9366242289543152, "learning_rate": 0.0002, "epoch": 5.676274944567627, "step": 2560}, {"loss": 1.2652, "grad_norm": 0.9180609583854675, "learning_rate": 0.0002, "epoch": 5.698447893569845, "step": 2570}, {"loss": 1.2153, "grad_norm": 0.96494460105896, "learning_rate": 0.0002, "epoch": 5.720620842572062, "step": 2580}, {"loss": 1.2596, "grad_norm": 1.066856861114502, "learning_rate": 0.0002, "epoch": 5.74279379157428, "step": 2590}, {"loss": 1.2437, "grad_norm": 1.0576446056365967, "learning_rate": 0.0002, "epoch": 5.764966740576496, "step": 2600}, {"loss": 1.2449, "grad_norm": 1.0688375234603882, "learning_rate": 0.0002, "epoch": 5.787139689578714, "step": 2610}, {"loss": 1.2094, "grad_norm": 0.9294432401657104, "learning_rate": 0.0002, "epoch": 5.8093126385809315, "step": 2620}, {"loss": 1.3705, "grad_norm": 0.9467836618423462, "learning_rate": 0.0002, "epoch": 5.831485587583149, "step": 2630}, {"loss": 1.334, "grad_norm": 1.1947448253631592, "learning_rate": 0.0002, "epoch": 5.853658536585366, "step": 2640}, {"loss": 1.1952, "grad_norm": 0.9225861430168152, "learning_rate": 0.0002, "epoch": 5.875831485587583, "step": 2650}, {"loss": 1.3356, "grad_norm": 0.9499539136886597, "learning_rate": 0.0002, "epoch": 5.898004434589801, "step": 2660}, {"loss": 1.2898, "grad_norm": 0.9666298031806946, "learning_rate": 0.0002, "epoch": 5.9201773835920175, "step": 2670}, {"loss": 1.1846, "grad_norm": 1.0549718141555786, "learning_rate": 0.0002, "epoch": 5.942350332594235, "step": 2680}, {"loss": 1.2132, "grad_norm": 1.1662505865097046, "learning_rate": 0.0002, "epoch": 5.964523281596453, "step": 2690}, {"loss": 1.2717, "grad_norm": 0.9200838208198547, "learning_rate": 0.0002, "epoch": 5.986696230598669, "step": 2700}, {"eval_loss": 2.089076280593872, "eval_runtime": 95.2405, "eval_samples_per_second": 5.407, "eval_steps_per_second": 0.682, "epoch": 6.0, "step": 2706}, {"loss": 1.2085, "grad_norm": 1.0047595500946045, "learning_rate": 0.0002, "epoch": 6.008869179600887, "step": 2710}, {"loss": 1.075, "grad_norm": 1.5315641164779663, "learning_rate": 0.0002, "epoch": 6.031042128603104, "step": 2720}, {"loss": 1.0955, "grad_norm": 1.2092695236206055, "learning_rate": 0.0002, "epoch": 6.053215077605321, "step": 2730}, {"loss": 1.108, "grad_norm": 1.1834157705307007, "learning_rate": 0.0002, "epoch": 6.075388026607539, "step": 2740}, {"loss": 1.0148, "grad_norm": 1.2534542083740234, "learning_rate": 0.0002, "epoch": 6.097560975609756, "step": 2750}, {"loss": 1.0422, "grad_norm": 1.2898602485656738, "learning_rate": 0.0002, "epoch": 6.119733924611974, "step": 2760}, {"loss": 1.0363, "grad_norm": 1.3397172689437866, "learning_rate": 0.0002, "epoch": 6.14190687361419, "step": 2770}, {"loss": 1.0651, "grad_norm": 1.18838632106781, "learning_rate": 0.0002, "epoch": 6.164079822616408, "step": 2780}, {"loss": 1.048, "grad_norm": 1.2524046897888184, "learning_rate": 0.0002, "epoch": 6.1862527716186255, "step": 2790}, {"loss": 1.0799, "grad_norm": 1.3325964212417603, "learning_rate": 0.0002, "epoch": 6.208425720620842, "step": 2800}, {"loss": 1.0768, "grad_norm": 1.3972342014312744, "learning_rate": 0.0002, "epoch": 6.23059866962306, "step": 2810}, {"loss": 1.0822, "grad_norm": 1.192122220993042, "learning_rate": 0.0002, "epoch": 6.252771618625277, "step": 2820}, {"loss": 1.0274, "grad_norm": 1.2018429040908813, "learning_rate": 0.0002, "epoch": 6.274944567627495, "step": 2830}, {"loss": 1.045, "grad_norm": 1.2017251253128052, "learning_rate": 0.0002, "epoch": 6.2971175166297115, "step": 2840}, {"loss": 1.0522, "grad_norm": 1.070663332939148, "learning_rate": 0.0002, "epoch": 6.319290465631929, "step": 2850}, {"loss": 1.1084, "grad_norm": 1.2376646995544434, "learning_rate": 0.0002, "epoch": 6.341463414634147, "step": 2860}, {"loss": 1.0885, "grad_norm": 1.4164553880691528, "learning_rate": 0.0002, "epoch": 6.363636363636363, "step": 2870}, {"loss": 1.0519, "grad_norm": 0.9863289594650269, "learning_rate": 0.0002, "epoch": 6.385809312638581, "step": 2880}, {"loss": 0.9746, "grad_norm": 1.1530284881591797, "learning_rate": 0.0002, "epoch": 6.407982261640798, "step": 2890}, {"loss": 1.0414, "grad_norm": 1.3614071607589722, "learning_rate": 0.0002, "epoch": 6.430155210643015, "step": 2900}, {"loss": 1.1097, "grad_norm": 1.4213203191757202, "learning_rate": 0.0002, "epoch": 6.452328159645233, "step": 2910}, {"loss": 1.0551, "grad_norm": 1.3584799766540527, "learning_rate": 0.0002, "epoch": 6.47450110864745, "step": 2920}, {"loss": 1.0888, "grad_norm": 1.1774920225143433, "learning_rate": 0.0002, "epoch": 6.496674057649668, "step": 2930}, {"loss": 1.0806, "grad_norm": 1.5063673257827759, "learning_rate": 0.0002, "epoch": 6.518847006651884, "step": 2940}, {"loss": 1.1157, "grad_norm": 1.3073967695236206, "learning_rate": 0.0002, "epoch": 6.541019955654102, "step": 2950}, {"loss": 1.0853, "grad_norm": 1.2877048254013062, "learning_rate": 0.0002, "epoch": 6.5631929046563195, "step": 2960}, {"loss": 1.0518, "grad_norm": 1.4681131839752197, "learning_rate": 0.0002, "epoch": 6.585365853658536, "step": 2970}, {"loss": 1.1336, "grad_norm": 1.364174246788025, "learning_rate": 0.0002, "epoch": 6.607538802660754, "step": 2980}, {"loss": 1.045, "grad_norm": 1.3069559335708618, "learning_rate": 0.0002, "epoch": 6.629711751662971, "step": 2990}, {"loss": 1.059, "grad_norm": 1.152112364768982, "learning_rate": 0.0002, "epoch": 6.651884700665189, "step": 3000}, {"loss": 1.1065, "grad_norm": 1.3854167461395264, "learning_rate": 0.0002, "epoch": 6.674057649667406, "step": 3010}, {"loss": 1.0792, "grad_norm": 1.3519569635391235, "learning_rate": 0.0002, "epoch": 6.696230598669623, "step": 3020}, {"loss": 1.0858, "grad_norm": 1.253912091255188, "learning_rate": 0.0002, "epoch": 6.718403547671841, "step": 3030}, {"loss": 1.0902, "grad_norm": 1.3960589170455933, "learning_rate": 0.0002, "epoch": 6.740576496674057, "step": 3040}, {"loss": 1.1028, "grad_norm": 1.3538455963134766, "learning_rate": 0.0002, "epoch": 6.762749445676275, "step": 3050}, {"loss": 1.1072, "grad_norm": 1.1728484630584717, "learning_rate": 0.0002, "epoch": 6.7849223946784925, "step": 3060}, {"loss": 1.138, "grad_norm": 1.2287765741348267, "learning_rate": 0.0002, "epoch": 6.807095343680709, "step": 3070}, {"loss": 1.0952, "grad_norm": 1.2122321128845215, "learning_rate": 0.0002, "epoch": 6.829268292682927, "step": 3080}, {"loss": 1.1051, "grad_norm": 1.3517614603042603, "learning_rate": 0.0002, "epoch": 6.851441241685144, "step": 3090}, {"loss": 1.1167, "grad_norm": 1.186508059501648, "learning_rate": 0.0002, "epoch": 6.873614190687362, "step": 3100}, {"loss": 1.1307, "grad_norm": 1.2658056020736694, "learning_rate": 0.0002, "epoch": 6.8957871396895785, "step": 3110}, {"loss": 1.0814, "grad_norm": 1.0459643602371216, "learning_rate": 0.0002, "epoch": 6.917960088691796, "step": 3120}, {"loss": 1.0667, "grad_norm": 1.1218708753585815, "learning_rate": 0.0002, "epoch": 6.940133037694014, "step": 3130}, {"loss": 1.0851, "grad_norm": 1.1161539554595947, "learning_rate": 0.0002, "epoch": 6.96230598669623, "step": 3140}, {"loss": 1.1627, "grad_norm": 1.312601923942566, "learning_rate": 0.0002, "epoch": 6.984478935698448, "step": 3150}]} +{"epoch": 8.0, "step": 3608, "epoch_duration": 1142.2819890975952, "total_accumulated_duration": 10640.011936426163, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-3/checkpoint-902", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.684, "grad_norm": 0.6454975008964539, "learning_rate": 0.0002, "epoch": 0.022172949002217297, "step": 10}, {"loss": 2.2694, "grad_norm": 0.5452715158462524, "learning_rate": 0.0002, "epoch": 0.04434589800443459, "step": 20}, {"loss": 2.0796, "grad_norm": 0.5502195358276367, "learning_rate": 0.0002, "epoch": 0.06651884700665188, "step": 30}, {"loss": 1.9132, "grad_norm": 0.48551198840141296, "learning_rate": 0.0002, "epoch": 0.08869179600886919, "step": 40}, {"loss": 2.016, "grad_norm": 0.47822514176368713, "learning_rate": 0.0002, "epoch": 0.11086474501108648, "step": 50}, {"loss": 1.9455, "grad_norm": 0.5125395655632019, "learning_rate": 0.0002, "epoch": 0.13303769401330376, "step": 60}, {"loss": 1.9017, "grad_norm": 0.4600693881511688, "learning_rate": 0.0002, "epoch": 0.15521064301552107, "step": 70}, {"loss": 2.0057, "grad_norm": 0.7180814743041992, "learning_rate": 0.0002, "epoch": 0.17738359201773837, "step": 80}, {"loss": 1.8925, "grad_norm": 0.4712974429130554, "learning_rate": 0.0002, "epoch": 0.19955654101995565, "step": 90}, {"loss": 1.8386, "grad_norm": 0.4673261344432831, "learning_rate": 0.0002, "epoch": 0.22172949002217296, "step": 100}, {"loss": 1.9346, "grad_norm": 0.4129070043563843, "learning_rate": 0.0002, "epoch": 0.24390243902439024, "step": 110}, {"loss": 1.9018, "grad_norm": 0.3859104812145233, "learning_rate": 0.0002, "epoch": 0.2660753880266075, "step": 120}, {"loss": 1.8922, "grad_norm": 0.40966713428497314, "learning_rate": 0.0002, "epoch": 0.28824833702882485, "step": 130}, {"loss": 1.846, "grad_norm": 0.3685867488384247, "learning_rate": 0.0002, "epoch": 0.31042128603104213, "step": 140}, {"loss": 1.9017, "grad_norm": 0.39279988408088684, "learning_rate": 0.0002, "epoch": 0.3325942350332594, "step": 150}, {"loss": 1.8556, "grad_norm": 0.4195398986339569, "learning_rate": 0.0002, "epoch": 0.35476718403547675, "step": 160}, {"loss": 1.7883, "grad_norm": 0.469802588224411, "learning_rate": 0.0002, "epoch": 0.376940133037694, "step": 170}, {"loss": 1.8135, "grad_norm": 0.4069509208202362, "learning_rate": 0.0002, "epoch": 0.3991130820399113, "step": 180}, {"loss": 1.8429, "grad_norm": 0.47832027077674866, "learning_rate": 0.0002, "epoch": 0.4212860310421286, "step": 190}, {"loss": 1.781, "grad_norm": 0.3376411199569702, "learning_rate": 0.0002, "epoch": 0.4434589800443459, "step": 200}, {"loss": 1.8562, "grad_norm": 0.3787185847759247, "learning_rate": 0.0002, "epoch": 0.4656319290465632, "step": 210}, {"loss": 1.8352, "grad_norm": 0.40322697162628174, "learning_rate": 0.0002, "epoch": 0.4878048780487805, "step": 220}, {"loss": 1.8007, "grad_norm": 0.3710436522960663, "learning_rate": 0.0002, "epoch": 0.5099778270509978, "step": 230}, {"loss": 1.8528, "grad_norm": 0.3723200261592865, "learning_rate": 0.0002, "epoch": 0.532150776053215, "step": 240}, {"loss": 1.852, "grad_norm": 0.3457179069519043, "learning_rate": 0.0002, "epoch": 0.5543237250554324, "step": 250}, {"loss": 1.8175, "grad_norm": 0.35369473695755005, "learning_rate": 0.0002, "epoch": 0.5764966740576497, "step": 260}, {"loss": 1.7742, "grad_norm": 0.3667483329772949, "learning_rate": 0.0002, "epoch": 0.5986696230598669, "step": 270}, {"loss": 1.8152, "grad_norm": 0.4023273289203644, "learning_rate": 0.0002, "epoch": 0.6208425720620843, "step": 280}, {"loss": 1.8382, "grad_norm": 0.3601929843425751, "learning_rate": 0.0002, "epoch": 0.6430155210643016, "step": 290}, {"loss": 1.8152, "grad_norm": 0.32610392570495605, "learning_rate": 0.0002, "epoch": 0.6651884700665188, "step": 300}, {"loss": 1.8412, "grad_norm": 0.40528756380081177, "learning_rate": 0.0002, "epoch": 0.6873614190687362, "step": 310}, {"loss": 1.8528, "grad_norm": 0.34639739990234375, "learning_rate": 0.0002, "epoch": 0.7095343680709535, "step": 320}, {"loss": 1.807, "grad_norm": 0.3794991374015808, "learning_rate": 0.0002, "epoch": 0.7317073170731707, "step": 330}, {"loss": 1.8678, "grad_norm": 0.34203875064849854, "learning_rate": 0.0002, "epoch": 0.753880266075388, "step": 340}, {"loss": 1.7605, "grad_norm": 0.36692821979522705, "learning_rate": 0.0002, "epoch": 0.7760532150776053, "step": 350}, {"loss": 1.7535, "grad_norm": 0.3701125979423523, "learning_rate": 0.0002, "epoch": 0.7982261640798226, "step": 360}, {"loss": 1.7574, "grad_norm": 0.3971416652202606, "learning_rate": 0.0002, "epoch": 0.8203991130820399, "step": 370}, {"loss": 1.8476, "grad_norm": 0.3751989006996155, "learning_rate": 0.0002, "epoch": 0.8425720620842572, "step": 380}, {"loss": 1.8281, "grad_norm": 0.35116496682167053, "learning_rate": 0.0002, "epoch": 0.8647450110864745, "step": 390}, {"loss": 1.7459, "grad_norm": 0.3672674894332886, "learning_rate": 0.0002, "epoch": 0.8869179600886918, "step": 400}, {"loss": 1.8293, "grad_norm": 0.34648260474205017, "learning_rate": 0.0002, "epoch": 0.9090909090909091, "step": 410}, {"loss": 1.7601, "grad_norm": 0.4497389793395996, "learning_rate": 0.0002, "epoch": 0.9312638580931264, "step": 420}, {"loss": 1.8405, "grad_norm": 0.33595147728919983, "learning_rate": 0.0002, "epoch": 0.9534368070953437, "step": 430}, {"loss": 1.8137, "grad_norm": 0.3130456805229187, "learning_rate": 0.0002, "epoch": 0.975609756097561, "step": 440}, {"loss": 1.8226, "grad_norm": 0.36480239033699036, "learning_rate": 0.0002, "epoch": 0.9977827050997783, "step": 450}, {"eval_loss": 1.8323718309402466, "eval_runtime": 79.9603, "eval_samples_per_second": 6.441, "eval_steps_per_second": 0.813, "epoch": 1.0, "step": 451}, {"loss": 1.7921, "grad_norm": 0.3840029835700989, "learning_rate": 0.0002, "epoch": 1.0199556541019956, "step": 460}, {"loss": 1.7057, "grad_norm": 0.33457425236701965, "learning_rate": 0.0002, "epoch": 1.042128603104213, "step": 470}, {"loss": 1.7028, "grad_norm": 0.35766592621803284, "learning_rate": 0.0002, "epoch": 1.06430155210643, "step": 480}, {"loss": 1.7868, "grad_norm": 0.38070937991142273, "learning_rate": 0.0002, "epoch": 1.0864745011086474, "step": 490}, {"loss": 1.72, "grad_norm": 0.38546547293663025, "learning_rate": 0.0002, "epoch": 1.1086474501108647, "step": 500}, {"loss": 1.7088, "grad_norm": 0.384104460477829, "learning_rate": 0.0002, "epoch": 1.130820399113082, "step": 510}, {"loss": 1.7779, "grad_norm": 0.3556116819381714, "learning_rate": 0.0002, "epoch": 1.1529933481152994, "step": 520}, {"loss": 1.7538, "grad_norm": 0.4110541343688965, "learning_rate": 0.0002, "epoch": 1.1751662971175167, "step": 530}, {"loss": 1.7608, "grad_norm": 0.46503177285194397, "learning_rate": 0.0002, "epoch": 1.1973392461197339, "step": 540}, {"loss": 1.729, "grad_norm": 0.4366816580295563, "learning_rate": 0.0002, "epoch": 1.2195121951219512, "step": 550}, {"loss": 1.7954, "grad_norm": 0.379986047744751, "learning_rate": 0.0002, "epoch": 1.2416851441241685, "step": 560}, {"loss": 1.6852, "grad_norm": 0.3920869529247284, "learning_rate": 0.0002, "epoch": 1.2638580931263859, "step": 570}, {"loss": 1.8265, "grad_norm": 0.4013986587524414, "learning_rate": 0.0002, "epoch": 1.2860310421286032, "step": 580}, {"loss": 1.7294, "grad_norm": 0.39104390144348145, "learning_rate": 0.0002, "epoch": 1.3082039911308203, "step": 590}, {"loss": 1.7822, "grad_norm": 0.40515613555908203, "learning_rate": 0.0002, "epoch": 1.3303769401330376, "step": 600}, {"loss": 1.7614, "grad_norm": 0.4212331473827362, "learning_rate": 0.0002, "epoch": 1.352549889135255, "step": 610}, {"loss": 1.7883, "grad_norm": 0.36040815711021423, "learning_rate": 0.0002, "epoch": 1.3747228381374723, "step": 620}, {"loss": 1.7467, "grad_norm": 0.3950865864753723, "learning_rate": 0.0002, "epoch": 1.3968957871396896, "step": 630}, {"loss": 1.7242, "grad_norm": 0.3934709131717682, "learning_rate": 0.0002, "epoch": 1.4190687361419068, "step": 640}, {"loss": 1.7783, "grad_norm": 0.3905350863933563, "learning_rate": 0.0002, "epoch": 1.441241685144124, "step": 650}, {"loss": 1.7612, "grad_norm": 0.4322686493396759, "learning_rate": 0.0002, "epoch": 1.4634146341463414, "step": 660}, {"loss": 1.7357, "grad_norm": 0.35697034001350403, "learning_rate": 0.0002, "epoch": 1.4855875831485588, "step": 670}, {"loss": 1.6764, "grad_norm": 0.38570451736450195, "learning_rate": 0.0002, "epoch": 1.507760532150776, "step": 680}, {"loss": 1.7054, "grad_norm": 0.3804517090320587, "learning_rate": 0.0002, "epoch": 1.5299334811529932, "step": 690}, {"loss": 1.7725, "grad_norm": 0.4938165247440338, "learning_rate": 0.0002, "epoch": 1.5521064301552108, "step": 700}, {"loss": 1.7982, "grad_norm": 0.43075236678123474, "learning_rate": 0.0002, "epoch": 1.5742793791574279, "step": 710}, {"loss": 1.7741, "grad_norm": 0.40434643626213074, "learning_rate": 0.0002, "epoch": 1.5964523281596452, "step": 720}, {"loss": 1.7122, "grad_norm": 0.3874157667160034, "learning_rate": 0.0002, "epoch": 1.6186252771618626, "step": 730}, {"loss": 1.8141, "grad_norm": 0.3645969331264496, "learning_rate": 0.0002, "epoch": 1.6407982261640797, "step": 740}, {"loss": 1.7868, "grad_norm": 0.38588255643844604, "learning_rate": 0.0002, "epoch": 1.6629711751662972, "step": 750}, {"loss": 1.7847, "grad_norm": 0.39252519607543945, "learning_rate": 0.0002, "epoch": 1.6851441241685143, "step": 760}, {"loss": 1.7424, "grad_norm": 0.36165162920951843, "learning_rate": 0.0002, "epoch": 1.7073170731707317, "step": 770}, {"loss": 1.7172, "grad_norm": 0.36677947640419006, "learning_rate": 0.0002, "epoch": 1.729490022172949, "step": 780}, {"loss": 1.7737, "grad_norm": 0.374881774187088, "learning_rate": 0.0002, "epoch": 1.7516629711751663, "step": 790}, {"loss": 1.7331, "grad_norm": 0.4530802369117737, "learning_rate": 0.0002, "epoch": 1.7738359201773837, "step": 800}, {"loss": 1.7799, "grad_norm": 0.3879568576812744, "learning_rate": 0.0002, "epoch": 1.7960088691796008, "step": 810}, {"loss": 1.7562, "grad_norm": 0.3710079789161682, "learning_rate": 0.0002, "epoch": 1.8181818181818183, "step": 820}, {"loss": 1.7728, "grad_norm": 0.3831799030303955, "learning_rate": 0.0002, "epoch": 1.8403547671840355, "step": 830}, {"loss": 1.7605, "grad_norm": 0.3958432376384735, "learning_rate": 0.0002, "epoch": 1.8625277161862528, "step": 840}, {"loss": 1.8187, "grad_norm": 0.4129294157028198, "learning_rate": 0.0002, "epoch": 1.8847006651884701, "step": 850}, {"loss": 1.6506, "grad_norm": 0.3714745044708252, "learning_rate": 0.0002, "epoch": 1.9068736141906872, "step": 860}, {"loss": 1.707, "grad_norm": 0.40176868438720703, "learning_rate": 0.0002, "epoch": 1.9290465631929048, "step": 870}, {"loss": 1.7557, "grad_norm": 0.36937767267227173, "learning_rate": 0.0002, "epoch": 1.951219512195122, "step": 880}, {"loss": 1.7353, "grad_norm": 0.40242597460746765, "learning_rate": 0.0002, "epoch": 1.9733924611973392, "step": 890}, {"loss": 1.7402, "grad_norm": 0.3515510559082031, "learning_rate": 0.0002, "epoch": 1.9955654101995566, "step": 900}, {"eval_loss": 1.8243104219436646, "eval_runtime": 107.8856, "eval_samples_per_second": 4.774, "eval_steps_per_second": 0.602, "epoch": 2.0, "step": 902}, {"loss": 1.6631, "grad_norm": 0.4145216643810272, "learning_rate": 0.0002, "epoch": 2.0177383592017737, "step": 910}, {"loss": 1.7227, "grad_norm": 0.42423519492149353, "learning_rate": 0.0002, "epoch": 2.0399113082039912, "step": 920}, {"loss": 1.6357, "grad_norm": 0.4773229956626892, "learning_rate": 0.0002, "epoch": 2.0620842572062084, "step": 930}, {"loss": 1.671, "grad_norm": 0.4144791066646576, "learning_rate": 0.0002, "epoch": 2.084257206208426, "step": 940}, {"loss": 1.6433, "grad_norm": 0.42704132199287415, "learning_rate": 0.0002, "epoch": 2.106430155210643, "step": 950}, {"loss": 1.6767, "grad_norm": 0.4479042589664459, "learning_rate": 0.0002, "epoch": 2.12860310421286, "step": 960}, {"loss": 1.6122, "grad_norm": 0.4810638129711151, "learning_rate": 0.0002, "epoch": 2.1507760532150777, "step": 970}, {"loss": 1.6613, "grad_norm": 0.48669910430908203, "learning_rate": 0.0002, "epoch": 2.172949002217295, "step": 980}, {"loss": 1.6274, "grad_norm": 0.4252761900424957, "learning_rate": 0.0002, "epoch": 2.1951219512195124, "step": 990}, {"loss": 1.6514, "grad_norm": 0.42342790961265564, "learning_rate": 0.0002, "epoch": 2.2172949002217295, "step": 1000}, {"loss": 1.637, "grad_norm": 0.43432456254959106, "learning_rate": 0.0002, "epoch": 2.2394678492239466, "step": 1010}, {"loss": 1.6856, "grad_norm": 0.45556965470314026, "learning_rate": 0.0002, "epoch": 2.261640798226164, "step": 1020}, {"loss": 1.6554, "grad_norm": 0.48035719990730286, "learning_rate": 0.0002, "epoch": 2.2838137472283813, "step": 1030}, {"loss": 1.6947, "grad_norm": 0.4233241081237793, "learning_rate": 0.0002, "epoch": 2.305986696230599, "step": 1040}, {"loss": 1.6501, "grad_norm": 0.3918434679508209, "learning_rate": 0.0002, "epoch": 2.328159645232816, "step": 1050}, {"loss": 1.6903, "grad_norm": 0.44049757719039917, "learning_rate": 0.0002, "epoch": 2.3503325942350335, "step": 1060}, {"loss": 1.6337, "grad_norm": 0.4730056822299957, "learning_rate": 0.0002, "epoch": 2.3725055432372506, "step": 1070}, {"loss": 1.6104, "grad_norm": 0.4354589581489563, "learning_rate": 0.0002, "epoch": 2.3946784922394677, "step": 1080}, {"loss": 1.6496, "grad_norm": 0.4837590456008911, "learning_rate": 0.0002, "epoch": 2.4168514412416853, "step": 1090}, {"loss": 1.6672, "grad_norm": 0.4842571020126343, "learning_rate": 0.0002, "epoch": 2.4390243902439024, "step": 1100}, {"loss": 1.6091, "grad_norm": 0.46398279070854187, "learning_rate": 0.0002, "epoch": 2.4611973392461195, "step": 1110}, {"loss": 1.6393, "grad_norm": 0.4587327539920807, "learning_rate": 0.0002, "epoch": 2.483370288248337, "step": 1120}, {"loss": 1.621, "grad_norm": 0.4336528480052948, "learning_rate": 0.0002, "epoch": 2.505543237250554, "step": 1130}, {"loss": 1.6199, "grad_norm": 0.6162153482437134, "learning_rate": 0.0002, "epoch": 2.5277161862527717, "step": 1140}, {"loss": 1.7244, "grad_norm": 0.48175573348999023, "learning_rate": 0.0002, "epoch": 2.549889135254989, "step": 1150}, {"loss": 1.6098, "grad_norm": 0.448272705078125, "learning_rate": 0.0002, "epoch": 2.5720620842572064, "step": 1160}, {"loss": 1.6987, "grad_norm": 0.5189200639724731, "learning_rate": 0.0002, "epoch": 2.5942350332594235, "step": 1170}, {"loss": 1.6503, "grad_norm": 0.45032963156700134, "learning_rate": 0.0002, "epoch": 2.6164079822616406, "step": 1180}, {"loss": 1.6508, "grad_norm": 0.4417729377746582, "learning_rate": 0.0002, "epoch": 2.638580931263858, "step": 1190}, {"loss": 1.6084, "grad_norm": 0.5219636559486389, "learning_rate": 0.0002, "epoch": 2.6607538802660753, "step": 1200}, {"loss": 1.6121, "grad_norm": 0.47702011466026306, "learning_rate": 0.0002, "epoch": 2.682926829268293, "step": 1210}, {"loss": 1.6942, "grad_norm": 0.4328458607196808, "learning_rate": 0.0002, "epoch": 2.70509977827051, "step": 1220}, {"loss": 1.6099, "grad_norm": 0.46762076020240784, "learning_rate": 0.0002, "epoch": 2.7272727272727275, "step": 1230}, {"loss": 1.7287, "grad_norm": 0.4592697322368622, "learning_rate": 0.0002, "epoch": 2.7494456762749446, "step": 1240}, {"loss": 1.617, "grad_norm": 0.5519265532493591, "learning_rate": 0.0002, "epoch": 2.7716186252771617, "step": 1250}, {"loss": 1.6868, "grad_norm": 0.47169506549835205, "learning_rate": 0.0002, "epoch": 2.7937915742793793, "step": 1260}, {"loss": 1.658, "grad_norm": 0.47231653332710266, "learning_rate": 0.0002, "epoch": 2.8159645232815964, "step": 1270}, {"loss": 1.6738, "grad_norm": 0.49081969261169434, "learning_rate": 0.0002, "epoch": 2.8381374722838135, "step": 1280}, {"loss": 1.7248, "grad_norm": 0.4483231008052826, "learning_rate": 0.0002, "epoch": 2.860310421286031, "step": 1290}, {"loss": 1.6428, "grad_norm": 0.5310035943984985, "learning_rate": 0.0002, "epoch": 2.882483370288248, "step": 1300}, {"loss": 1.6515, "grad_norm": 0.4419795572757721, "learning_rate": 0.0002, "epoch": 2.9046563192904657, "step": 1310}, {"loss": 1.6956, "grad_norm": 0.44630762934684753, "learning_rate": 0.0002, "epoch": 2.926829268292683, "step": 1320}, {"loss": 1.6464, "grad_norm": 0.39774850010871887, "learning_rate": 0.0002, "epoch": 2.9490022172949004, "step": 1330}, {"loss": 1.7007, "grad_norm": 0.441727876663208, "learning_rate": 0.0002, "epoch": 2.9711751662971175, "step": 1340}, {"loss": 1.5968, "grad_norm": 0.43773892521858215, "learning_rate": 0.0002, "epoch": 2.9933481152993346, "step": 1350}, {"eval_loss": 1.8514760732650757, "eval_runtime": 131.9812, "eval_samples_per_second": 3.902, "eval_steps_per_second": 0.492, "epoch": 3.0, "step": 1353}, {"loss": 1.553, "grad_norm": 0.5274150371551514, "learning_rate": 0.0002, "epoch": 3.015521064301552, "step": 1360}, {"loss": 1.4784, "grad_norm": 0.5724489092826843, "learning_rate": 0.0002, "epoch": 3.0376940133037693, "step": 1370}, {"loss": 1.5365, "grad_norm": 0.6182316541671753, "learning_rate": 0.0002, "epoch": 3.059866962305987, "step": 1380}, {"loss": 1.4824, "grad_norm": 0.5709688067436218, "learning_rate": 0.0002, "epoch": 3.082039911308204, "step": 1390}, {"loss": 1.534, "grad_norm": 0.6368464231491089, "learning_rate": 0.0002, "epoch": 3.104212860310421, "step": 1400}, {"loss": 1.5191, "grad_norm": 0.5680432319641113, "learning_rate": 0.0002, "epoch": 3.1263858093126387, "step": 1410}, {"loss": 1.5258, "grad_norm": 0.5805315375328064, "learning_rate": 0.0002, "epoch": 3.1485587583148558, "step": 1420}, {"loss": 1.612, "grad_norm": 0.5782836675643921, "learning_rate": 0.0002, "epoch": 3.1707317073170733, "step": 1430}, {"loss": 1.4852, "grad_norm": 0.627159595489502, "learning_rate": 0.0002, "epoch": 3.1929046563192904, "step": 1440}, {"loss": 1.5398, "grad_norm": 0.6136751174926758, "learning_rate": 0.0002, "epoch": 3.2150776053215075, "step": 1450}, {"loss": 1.5254, "grad_norm": 0.6319093108177185, "learning_rate": 0.0002, "epoch": 3.237250554323725, "step": 1460}, {"loss": 1.5789, "grad_norm": 0.7641780972480774, "learning_rate": 0.0002, "epoch": 3.259423503325942, "step": 1470}, {"loss": 1.5514, "grad_norm": 0.6116001605987549, "learning_rate": 0.0002, "epoch": 3.2815964523281598, "step": 1480}, {"loss": 1.4647, "grad_norm": 0.6024722456932068, "learning_rate": 0.0002, "epoch": 3.303769401330377, "step": 1490}, {"loss": 1.5561, "grad_norm": 0.5941570997238159, "learning_rate": 0.0002, "epoch": 3.3259423503325944, "step": 1500}, {"loss": 1.5104, "grad_norm": 0.608369767665863, "learning_rate": 0.0002, "epoch": 3.3481152993348116, "step": 1510}, {"loss": 1.5494, "grad_norm": 0.5942065715789795, "learning_rate": 0.0002, "epoch": 3.3702882483370287, "step": 1520}, {"loss": 1.5426, "grad_norm": 0.6382330656051636, "learning_rate": 0.0002, "epoch": 3.3924611973392462, "step": 1530}, {"loss": 1.5479, "grad_norm": 0.5839648842811584, "learning_rate": 0.0002, "epoch": 3.4146341463414633, "step": 1540}, {"loss": 1.5241, "grad_norm": 0.5627358555793762, "learning_rate": 0.0002, "epoch": 3.436807095343681, "step": 1550}, {"loss": 1.5679, "grad_norm": 0.6342151761054993, "learning_rate": 0.0002, "epoch": 3.458980044345898, "step": 1560}, {"loss": 1.5005, "grad_norm": 0.6370542645454407, "learning_rate": 0.0002, "epoch": 3.481152993348115, "step": 1570}, {"loss": 1.541, "grad_norm": 0.5974680185317993, "learning_rate": 0.0002, "epoch": 3.5033259423503327, "step": 1580}, {"loss": 1.553, "grad_norm": 0.6197021007537842, "learning_rate": 0.0002, "epoch": 3.52549889135255, "step": 1590}, {"loss": 1.5287, "grad_norm": 0.6413024067878723, "learning_rate": 0.0002, "epoch": 3.5476718403547673, "step": 1600}, {"loss": 1.5301, "grad_norm": 0.5878410339355469, "learning_rate": 0.0002, "epoch": 3.5698447893569845, "step": 1610}, {"loss": 1.4625, "grad_norm": 0.6485083103179932, "learning_rate": 0.0002, "epoch": 3.5920177383592016, "step": 1620}, {"loss": 1.5373, "grad_norm": 0.5826634764671326, "learning_rate": 0.0002, "epoch": 3.614190687361419, "step": 1630}, {"loss": 1.4952, "grad_norm": 0.8906663656234741, "learning_rate": 0.0002, "epoch": 3.6363636363636362, "step": 1640}, {"loss": 1.5208, "grad_norm": 0.6288479566574097, "learning_rate": 0.0002, "epoch": 3.658536585365854, "step": 1650}, {"loss": 1.6086, "grad_norm": 0.6191049218177795, "learning_rate": 0.0002, "epoch": 3.680709534368071, "step": 1660}, {"loss": 1.5043, "grad_norm": 0.5997978448867798, "learning_rate": 0.0002, "epoch": 3.7028824833702885, "step": 1670}, {"loss": 1.5654, "grad_norm": 0.6003038287162781, "learning_rate": 0.0002, "epoch": 3.7250554323725056, "step": 1680}, {"loss": 1.4941, "grad_norm": 0.5417194962501526, "learning_rate": 0.0002, "epoch": 3.7472283813747227, "step": 1690}, {"loss": 1.5541, "grad_norm": 0.6367442607879639, "learning_rate": 0.0002, "epoch": 3.7694013303769403, "step": 1700}, {"loss": 1.5483, "grad_norm": 0.6613120436668396, "learning_rate": 0.0002, "epoch": 3.7915742793791574, "step": 1710}, {"loss": 1.5999, "grad_norm": 0.6506749391555786, "learning_rate": 0.0002, "epoch": 3.8137472283813745, "step": 1720}, {"loss": 1.5207, "grad_norm": 0.5478500723838806, "learning_rate": 0.0002, "epoch": 3.835920177383592, "step": 1730}, {"loss": 1.5619, "grad_norm": 0.7313215732574463, "learning_rate": 0.0002, "epoch": 3.858093126385809, "step": 1740}, {"loss": 1.4486, "grad_norm": 0.5453857183456421, "learning_rate": 0.0002, "epoch": 3.8802660753880267, "step": 1750}, {"loss": 1.4857, "grad_norm": 0.5983547568321228, "learning_rate": 0.0002, "epoch": 3.902439024390244, "step": 1760}, {"loss": 1.651, "grad_norm": 0.6471580266952515, "learning_rate": 0.0002, "epoch": 3.9246119733924614, "step": 1770}, {"loss": 1.461, "grad_norm": 0.5833685398101807, "learning_rate": 0.0002, "epoch": 3.9467849223946785, "step": 1780}, {"loss": 1.5014, "grad_norm": 0.5509327054023743, "learning_rate": 0.0002, "epoch": 3.9689578713968956, "step": 1790}, {"loss": 1.6225, "grad_norm": 0.6021352410316467, "learning_rate": 0.0002, "epoch": 3.991130820399113, "step": 1800}, {"eval_loss": 1.901047945022583, "eval_runtime": 82.2708, "eval_samples_per_second": 6.26, "eval_steps_per_second": 0.79, "epoch": 4.0, "step": 1804}, {"loss": 1.422, "grad_norm": 0.6232016682624817, "learning_rate": 0.0002, "epoch": 4.013303769401331, "step": 1810}, {"loss": 1.3769, "grad_norm": 0.7521207928657532, "learning_rate": 0.0002, "epoch": 4.035476718403547, "step": 1820}, {"loss": 1.4481, "grad_norm": 0.7839062213897705, "learning_rate": 0.0002, "epoch": 4.057649667405765, "step": 1830}, {"loss": 1.4147, "grad_norm": 0.8654165863990784, "learning_rate": 0.0002, "epoch": 4.0798226164079825, "step": 1840}, {"loss": 1.2983, "grad_norm": 0.6872738599777222, "learning_rate": 0.0002, "epoch": 4.101995565410199, "step": 1850}, {"loss": 1.3115, "grad_norm": 0.7529677748680115, "learning_rate": 0.0002, "epoch": 4.124168514412417, "step": 1860}, {"loss": 1.3869, "grad_norm": 0.835027277469635, "learning_rate": 0.0002, "epoch": 4.146341463414634, "step": 1870}, {"loss": 1.3273, "grad_norm": 0.7457721829414368, "learning_rate": 0.0002, "epoch": 4.168514412416852, "step": 1880}, {"loss": 1.2893, "grad_norm": 0.7366040349006653, "learning_rate": 0.0002, "epoch": 4.1906873614190685, "step": 1890}, {"loss": 1.3615, "grad_norm": 0.7802833914756775, "learning_rate": 0.0002, "epoch": 4.212860310421286, "step": 1900}, {"loss": 1.3607, "grad_norm": 0.7526614665985107, "learning_rate": 0.0002, "epoch": 4.235033259423504, "step": 1910}, {"loss": 1.4384, "grad_norm": 0.7531310319900513, "learning_rate": 0.0002, "epoch": 4.25720620842572, "step": 1920}, {"loss": 1.4074, "grad_norm": 0.8899626135826111, "learning_rate": 0.0002, "epoch": 4.279379157427938, "step": 1930}, {"loss": 1.328, "grad_norm": 0.7591356635093689, "learning_rate": 0.0002, "epoch": 4.301552106430155, "step": 1940}, {"loss": 1.4114, "grad_norm": 0.7126884460449219, "learning_rate": 0.0002, "epoch": 4.323725055432373, "step": 1950}, {"loss": 1.4259, "grad_norm": 0.7907777428627014, "learning_rate": 0.0002, "epoch": 4.34589800443459, "step": 1960}, {"loss": 1.3982, "grad_norm": 0.7854869961738586, "learning_rate": 0.0002, "epoch": 4.368070953436807, "step": 1970}, {"loss": 1.4126, "grad_norm": 0.6982123851776123, "learning_rate": 0.0002, "epoch": 4.390243902439025, "step": 1980}, {"loss": 1.3683, "grad_norm": 0.7551925182342529, "learning_rate": 0.0002, "epoch": 4.412416851441241, "step": 1990}, {"loss": 1.4551, "grad_norm": 0.864078164100647, "learning_rate": 0.0002, "epoch": 4.434589800443459, "step": 2000}, {"loss": 1.3982, "grad_norm": 0.8406776189804077, "learning_rate": 0.0002, "epoch": 4.4567627494456765, "step": 2010}, {"loss": 1.3543, "grad_norm": 0.7706766724586487, "learning_rate": 0.0002, "epoch": 4.478935698447893, "step": 2020}, {"loss": 1.386, "grad_norm": 0.7703949213027954, "learning_rate": 0.0002, "epoch": 4.501108647450111, "step": 2030}, {"loss": 1.4059, "grad_norm": 0.8654166460037231, "learning_rate": 0.0002, "epoch": 4.523281596452328, "step": 2040}, {"loss": 1.4067, "grad_norm": 0.7800114750862122, "learning_rate": 0.0002, "epoch": 4.545454545454545, "step": 2050}, {"loss": 1.3578, "grad_norm": 0.7553898692131042, "learning_rate": 0.0002, "epoch": 4.5676274944567625, "step": 2060}, {"loss": 1.3845, "grad_norm": 0.8689188957214355, "learning_rate": 0.0002, "epoch": 4.58980044345898, "step": 2070}, {"loss": 1.3851, "grad_norm": 0.7244092226028442, "learning_rate": 0.0002, "epoch": 4.611973392461198, "step": 2080}, {"loss": 1.3627, "grad_norm": 0.9829743504524231, "learning_rate": 0.0002, "epoch": 4.634146341463414, "step": 2090}, {"loss": 1.4059, "grad_norm": 0.8026102185249329, "learning_rate": 0.0002, "epoch": 4.656319290465632, "step": 2100}, {"loss": 1.3676, "grad_norm": 0.6725143194198608, "learning_rate": 0.0002, "epoch": 4.678492239467849, "step": 2110}, {"loss": 1.4669, "grad_norm": 0.8055245876312256, "learning_rate": 0.0002, "epoch": 4.700665188470067, "step": 2120}, {"loss": 1.4455, "grad_norm": 0.7507025003433228, "learning_rate": 0.0002, "epoch": 4.722838137472284, "step": 2130}, {"loss": 1.3974, "grad_norm": 0.7166216969490051, "learning_rate": 0.0002, "epoch": 4.745011086474501, "step": 2140}, {"loss": 1.33, "grad_norm": 0.6826853156089783, "learning_rate": 0.0002, "epoch": 4.767184035476719, "step": 2150}, {"loss": 1.3907, "grad_norm": 1.1347891092300415, "learning_rate": 0.0002, "epoch": 4.789356984478935, "step": 2160}, {"loss": 1.3737, "grad_norm": 0.8205971121788025, "learning_rate": 0.0002, "epoch": 4.811529933481153, "step": 2170}, {"loss": 1.3886, "grad_norm": 0.7861950397491455, "learning_rate": 0.0002, "epoch": 4.8337028824833705, "step": 2180}, {"loss": 1.4293, "grad_norm": 0.839460551738739, "learning_rate": 0.0002, "epoch": 4.855875831485587, "step": 2190}, {"loss": 1.3881, "grad_norm": 0.746583878993988, "learning_rate": 0.0002, "epoch": 4.878048780487805, "step": 2200}, {"loss": 1.4519, "grad_norm": 0.7805684804916382, "learning_rate": 0.0002, "epoch": 4.900221729490022, "step": 2210}, {"loss": 1.4053, "grad_norm": 0.8079700469970703, "learning_rate": 0.0002, "epoch": 4.922394678492239, "step": 2220}, {"loss": 1.353, "grad_norm": 0.7609502673149109, "learning_rate": 0.0002, "epoch": 4.9445676274944566, "step": 2230}, {"loss": 1.3816, "grad_norm": 0.7862996459007263, "learning_rate": 0.0002, "epoch": 4.966740576496674, "step": 2240}, {"loss": 1.4249, "grad_norm": 0.778677225112915, "learning_rate": 0.0002, "epoch": 4.988913525498892, "step": 2250}, {"eval_loss": 1.9658271074295044, "eval_runtime": 108.3717, "eval_samples_per_second": 4.752, "eval_steps_per_second": 0.6, "epoch": 5.0, "step": 2255}, {"loss": 1.3395, "grad_norm": 0.7520418167114258, "learning_rate": 0.0002, "epoch": 5.011086474501108, "step": 2260}, {"loss": 1.1909, "grad_norm": 1.1831114292144775, "learning_rate": 0.0002, "epoch": 5.033259423503326, "step": 2270}, {"loss": 1.1784, "grad_norm": 0.8718661069869995, "learning_rate": 0.0002, "epoch": 5.0554323725055434, "step": 2280}, {"loss": 1.2208, "grad_norm": 1.0186705589294434, "learning_rate": 0.0002, "epoch": 5.07760532150776, "step": 2290}, {"loss": 1.2259, "grad_norm": 1.0370045900344849, "learning_rate": 0.0002, "epoch": 5.099778270509978, "step": 2300}, {"loss": 1.1485, "grad_norm": 0.9448253512382507, "learning_rate": 0.0002, "epoch": 5.121951219512195, "step": 2310}, {"loss": 1.1764, "grad_norm": 0.988973081111908, "learning_rate": 0.0002, "epoch": 5.144124168514413, "step": 2320}, {"loss": 1.1544, "grad_norm": 0.9368142485618591, "learning_rate": 0.0002, "epoch": 5.1662971175166295, "step": 2330}, {"loss": 1.2416, "grad_norm": 1.0289298295974731, "learning_rate": 0.0002, "epoch": 5.188470066518847, "step": 2340}, {"loss": 1.1982, "grad_norm": 0.9611420035362244, "learning_rate": 0.0002, "epoch": 5.210643015521065, "step": 2350}, {"loss": 1.2046, "grad_norm": 0.8490312099456787, "learning_rate": 0.0002, "epoch": 5.232815964523281, "step": 2360}, {"loss": 1.2504, "grad_norm": 1.0165891647338867, "learning_rate": 0.0002, "epoch": 5.254988913525499, "step": 2370}, {"loss": 1.2749, "grad_norm": 0.9902606010437012, "learning_rate": 0.0002, "epoch": 5.277161862527716, "step": 2380}, {"loss": 1.2186, "grad_norm": 0.987205445766449, "learning_rate": 0.0002, "epoch": 5.299334811529933, "step": 2390}, {"loss": 1.1962, "grad_norm": 0.7931132316589355, "learning_rate": 0.0002, "epoch": 5.321507760532151, "step": 2400}, {"loss": 1.1661, "grad_norm": 1.143110990524292, "learning_rate": 0.0002, "epoch": 5.343680709534368, "step": 2410}, {"loss": 1.191, "grad_norm": 0.9869807362556458, "learning_rate": 0.0002, "epoch": 5.365853658536586, "step": 2420}, {"loss": 1.208, "grad_norm": 0.9835564494132996, "learning_rate": 0.0002, "epoch": 5.388026607538802, "step": 2430}, {"loss": 1.2734, "grad_norm": 0.8321971893310547, "learning_rate": 0.0002, "epoch": 5.41019955654102, "step": 2440}, {"loss": 1.2308, "grad_norm": 0.8379601240158081, "learning_rate": 0.0002, "epoch": 5.4323725055432375, "step": 2450}, {"loss": 1.2464, "grad_norm": 0.9872745871543884, "learning_rate": 0.0002, "epoch": 5.454545454545454, "step": 2460}, {"loss": 1.283, "grad_norm": 0.9455783367156982, "learning_rate": 0.0002, "epoch": 5.476718403547672, "step": 2470}, {"loss": 1.2585, "grad_norm": 0.9594705700874329, "learning_rate": 0.0002, "epoch": 5.498891352549889, "step": 2480}, {"loss": 1.2776, "grad_norm": 1.036603331565857, "learning_rate": 0.0002, "epoch": 5.521064301552107, "step": 2490}, {"loss": 1.2346, "grad_norm": 1.0329008102416992, "learning_rate": 0.0002, "epoch": 5.5432372505543235, "step": 2500}, {"loss": 1.2202, "grad_norm": 0.90513014793396, "learning_rate": 0.0002, "epoch": 5.565410199556541, "step": 2510}, {"loss": 1.2977, "grad_norm": 1.107680320739746, "learning_rate": 0.0002, "epoch": 5.587583148558759, "step": 2520}, {"loss": 1.2117, "grad_norm": 0.8842377662658691, "learning_rate": 0.0002, "epoch": 5.609756097560975, "step": 2530}, {"loss": 1.2448, "grad_norm": 0.9856716990470886, "learning_rate": 0.0002, "epoch": 5.631929046563193, "step": 2540}, {"loss": 1.2579, "grad_norm": 1.0363198518753052, "learning_rate": 0.0002, "epoch": 5.65410199556541, "step": 2550}, {"loss": 1.236, "grad_norm": 0.9366242289543152, "learning_rate": 0.0002, "epoch": 5.676274944567627, "step": 2560}, {"loss": 1.2652, "grad_norm": 0.9180609583854675, "learning_rate": 0.0002, "epoch": 5.698447893569845, "step": 2570}, {"loss": 1.2153, "grad_norm": 0.96494460105896, "learning_rate": 0.0002, "epoch": 5.720620842572062, "step": 2580}, {"loss": 1.2596, "grad_norm": 1.066856861114502, "learning_rate": 0.0002, "epoch": 5.74279379157428, "step": 2590}, {"loss": 1.2437, "grad_norm": 1.0576446056365967, "learning_rate": 0.0002, "epoch": 5.764966740576496, "step": 2600}, {"loss": 1.2449, "grad_norm": 1.0688375234603882, "learning_rate": 0.0002, "epoch": 5.787139689578714, "step": 2610}, {"loss": 1.2094, "grad_norm": 0.9294432401657104, "learning_rate": 0.0002, "epoch": 5.8093126385809315, "step": 2620}, {"loss": 1.3705, "grad_norm": 0.9467836618423462, "learning_rate": 0.0002, "epoch": 5.831485587583149, "step": 2630}, {"loss": 1.334, "grad_norm": 1.1947448253631592, "learning_rate": 0.0002, "epoch": 5.853658536585366, "step": 2640}, {"loss": 1.1952, "grad_norm": 0.9225861430168152, "learning_rate": 0.0002, "epoch": 5.875831485587583, "step": 2650}, {"loss": 1.3356, "grad_norm": 0.9499539136886597, "learning_rate": 0.0002, "epoch": 5.898004434589801, "step": 2660}, {"loss": 1.2898, "grad_norm": 0.9666298031806946, "learning_rate": 0.0002, "epoch": 5.9201773835920175, "step": 2670}, {"loss": 1.1846, "grad_norm": 1.0549718141555786, "learning_rate": 0.0002, "epoch": 5.942350332594235, "step": 2680}, {"loss": 1.2132, "grad_norm": 1.1662505865097046, "learning_rate": 0.0002, "epoch": 5.964523281596453, "step": 2690}, {"loss": 1.2717, "grad_norm": 0.9200838208198547, "learning_rate": 0.0002, "epoch": 5.986696230598669, "step": 2700}, {"eval_loss": 2.089076280593872, "eval_runtime": 95.2405, "eval_samples_per_second": 5.407, "eval_steps_per_second": 0.682, "epoch": 6.0, "step": 2706}, {"loss": 1.2085, "grad_norm": 1.0047595500946045, "learning_rate": 0.0002, "epoch": 6.008869179600887, "step": 2710}, {"loss": 1.075, "grad_norm": 1.5315641164779663, "learning_rate": 0.0002, "epoch": 6.031042128603104, "step": 2720}, {"loss": 1.0955, "grad_norm": 1.2092695236206055, "learning_rate": 0.0002, "epoch": 6.053215077605321, "step": 2730}, {"loss": 1.108, "grad_norm": 1.1834157705307007, "learning_rate": 0.0002, "epoch": 6.075388026607539, "step": 2740}, {"loss": 1.0148, "grad_norm": 1.2534542083740234, "learning_rate": 0.0002, "epoch": 6.097560975609756, "step": 2750}, {"loss": 1.0422, "grad_norm": 1.2898602485656738, "learning_rate": 0.0002, "epoch": 6.119733924611974, "step": 2760}, {"loss": 1.0363, "grad_norm": 1.3397172689437866, "learning_rate": 0.0002, "epoch": 6.14190687361419, "step": 2770}, {"loss": 1.0651, "grad_norm": 1.18838632106781, "learning_rate": 0.0002, "epoch": 6.164079822616408, "step": 2780}, {"loss": 1.048, "grad_norm": 1.2524046897888184, "learning_rate": 0.0002, "epoch": 6.1862527716186255, "step": 2790}, {"loss": 1.0799, "grad_norm": 1.3325964212417603, "learning_rate": 0.0002, "epoch": 6.208425720620842, "step": 2800}, {"loss": 1.0768, "grad_norm": 1.3972342014312744, "learning_rate": 0.0002, "epoch": 6.23059866962306, "step": 2810}, {"loss": 1.0822, "grad_norm": 1.192122220993042, "learning_rate": 0.0002, "epoch": 6.252771618625277, "step": 2820}, {"loss": 1.0274, "grad_norm": 1.2018429040908813, "learning_rate": 0.0002, "epoch": 6.274944567627495, "step": 2830}, {"loss": 1.045, "grad_norm": 1.2017251253128052, "learning_rate": 0.0002, "epoch": 6.2971175166297115, "step": 2840}, {"loss": 1.0522, "grad_norm": 1.070663332939148, "learning_rate": 0.0002, "epoch": 6.319290465631929, "step": 2850}, {"loss": 1.1084, "grad_norm": 1.2376646995544434, "learning_rate": 0.0002, "epoch": 6.341463414634147, "step": 2860}, {"loss": 1.0885, "grad_norm": 1.4164553880691528, "learning_rate": 0.0002, "epoch": 6.363636363636363, "step": 2870}, {"loss": 1.0519, "grad_norm": 0.9863289594650269, "learning_rate": 0.0002, "epoch": 6.385809312638581, "step": 2880}, {"loss": 0.9746, "grad_norm": 1.1530284881591797, "learning_rate": 0.0002, "epoch": 6.407982261640798, "step": 2890}, {"loss": 1.0414, "grad_norm": 1.3614071607589722, "learning_rate": 0.0002, "epoch": 6.430155210643015, "step": 2900}, {"loss": 1.1097, "grad_norm": 1.4213203191757202, "learning_rate": 0.0002, "epoch": 6.452328159645233, "step": 2910}, {"loss": 1.0551, "grad_norm": 1.3584799766540527, "learning_rate": 0.0002, "epoch": 6.47450110864745, "step": 2920}, {"loss": 1.0888, "grad_norm": 1.1774920225143433, "learning_rate": 0.0002, "epoch": 6.496674057649668, "step": 2930}, {"loss": 1.0806, "grad_norm": 1.5063673257827759, "learning_rate": 0.0002, "epoch": 6.518847006651884, "step": 2940}, {"loss": 1.1157, "grad_norm": 1.3073967695236206, "learning_rate": 0.0002, "epoch": 6.541019955654102, "step": 2950}, {"loss": 1.0853, "grad_norm": 1.2877048254013062, "learning_rate": 0.0002, "epoch": 6.5631929046563195, "step": 2960}, {"loss": 1.0518, "grad_norm": 1.4681131839752197, "learning_rate": 0.0002, "epoch": 6.585365853658536, "step": 2970}, {"loss": 1.1336, "grad_norm": 1.364174246788025, "learning_rate": 0.0002, "epoch": 6.607538802660754, "step": 2980}, {"loss": 1.045, "grad_norm": 1.3069559335708618, "learning_rate": 0.0002, "epoch": 6.629711751662971, "step": 2990}, {"loss": 1.059, "grad_norm": 1.152112364768982, "learning_rate": 0.0002, "epoch": 6.651884700665189, "step": 3000}, {"loss": 1.1065, "grad_norm": 1.3854167461395264, "learning_rate": 0.0002, "epoch": 6.674057649667406, "step": 3010}, {"loss": 1.0792, "grad_norm": 1.3519569635391235, "learning_rate": 0.0002, "epoch": 6.696230598669623, "step": 3020}, {"loss": 1.0858, "grad_norm": 1.253912091255188, "learning_rate": 0.0002, "epoch": 6.718403547671841, "step": 3030}, {"loss": 1.0902, "grad_norm": 1.3960589170455933, "learning_rate": 0.0002, "epoch": 6.740576496674057, "step": 3040}, {"loss": 1.1028, "grad_norm": 1.3538455963134766, "learning_rate": 0.0002, "epoch": 6.762749445676275, "step": 3050}, {"loss": 1.1072, "grad_norm": 1.1728484630584717, "learning_rate": 0.0002, "epoch": 6.7849223946784925, "step": 3060}, {"loss": 1.138, "grad_norm": 1.2287765741348267, "learning_rate": 0.0002, "epoch": 6.807095343680709, "step": 3070}, {"loss": 1.0952, "grad_norm": 1.2122321128845215, "learning_rate": 0.0002, "epoch": 6.829268292682927, "step": 3080}, {"loss": 1.1051, "grad_norm": 1.3517614603042603, "learning_rate": 0.0002, "epoch": 6.851441241685144, "step": 3090}, {"loss": 1.1167, "grad_norm": 1.186508059501648, "learning_rate": 0.0002, "epoch": 6.873614190687362, "step": 3100}, {"loss": 1.1307, "grad_norm": 1.2658056020736694, "learning_rate": 0.0002, "epoch": 6.8957871396895785, "step": 3110}, {"loss": 1.0814, "grad_norm": 1.0459643602371216, "learning_rate": 0.0002, "epoch": 6.917960088691796, "step": 3120}, {"loss": 1.0667, "grad_norm": 1.1218708753585815, "learning_rate": 0.0002, "epoch": 6.940133037694014, "step": 3130}, {"loss": 1.0851, "grad_norm": 1.1161539554595947, "learning_rate": 0.0002, "epoch": 6.96230598669623, "step": 3140}, {"loss": 1.1627, "grad_norm": 1.312601923942566, "learning_rate": 0.0002, "epoch": 6.984478935698448, "step": 3150}, {"eval_loss": 2.216700315475464, "eval_runtime": 132.441, "eval_samples_per_second": 3.889, "eval_steps_per_second": 0.491, "epoch": 7.0, "step": 3157}, {"loss": 1.0072, "grad_norm": 1.2042810916900635, "learning_rate": 0.0002, "epoch": 7.006651884700665, "step": 3160}, {"loss": 0.9177, "grad_norm": 1.298388957977295, "learning_rate": 0.0002, "epoch": 7.028824833702883, "step": 3170}, {"loss": 0.8978, "grad_norm": 1.5294439792633057, "learning_rate": 0.0002, "epoch": 7.0509977827051, "step": 3180}, {"loss": 0.8597, "grad_norm": 1.3496054410934448, "learning_rate": 0.0002, "epoch": 7.073170731707317, "step": 3190}, {"loss": 0.8111, "grad_norm": 1.4232285022735596, "learning_rate": 0.0002, "epoch": 7.095343680709535, "step": 3200}, {"loss": 0.9074, "grad_norm": 1.6650644540786743, "learning_rate": 0.0002, "epoch": 7.117516629711751, "step": 3210}, {"loss": 0.9157, "grad_norm": 1.4064364433288574, "learning_rate": 0.0002, "epoch": 7.139689578713969, "step": 3220}, {"loss": 0.9173, "grad_norm": 1.6468620300292969, "learning_rate": 0.0002, "epoch": 7.1618625277161865, "step": 3230}, {"loss": 0.8946, "grad_norm": 1.379271388053894, "learning_rate": 0.0002, "epoch": 7.184035476718403, "step": 3240}, {"loss": 0.8994, "grad_norm": 1.4626420736312866, "learning_rate": 0.0002, "epoch": 7.206208425720621, "step": 3250}, {"loss": 0.9178, "grad_norm": 1.6427521705627441, "learning_rate": 0.0002, "epoch": 7.228381374722838, "step": 3260}, {"loss": 0.9045, "grad_norm": 1.5199066400527954, "learning_rate": 0.0002, "epoch": 7.250554323725056, "step": 3270}, {"loss": 1.0129, "grad_norm": 1.631585717201233, "learning_rate": 0.0002, "epoch": 7.2727272727272725, "step": 3280}, {"loss": 0.95, "grad_norm": 1.5489732027053833, "learning_rate": 0.0002, "epoch": 7.29490022172949, "step": 3290}, {"loss": 0.9094, "grad_norm": 1.2737787961959839, "learning_rate": 0.0002, "epoch": 7.317073170731708, "step": 3300}, {"loss": 0.872, "grad_norm": 1.582791805267334, "learning_rate": 0.0002, "epoch": 7.339246119733924, "step": 3310}, {"loss": 0.9469, "grad_norm": 1.2628211975097656, "learning_rate": 0.0002, "epoch": 7.361419068736142, "step": 3320}, {"loss": 0.9144, "grad_norm": 1.451365351676941, "learning_rate": 0.0002, "epoch": 7.383592017738359, "step": 3330}, {"loss": 0.9293, "grad_norm": 1.5257638692855835, "learning_rate": 0.0002, "epoch": 7.405764966740577, "step": 3340}, {"loss": 0.9539, "grad_norm": 1.2424229383468628, "learning_rate": 0.0002, "epoch": 7.427937915742794, "step": 3350}, {"loss": 0.8657, "grad_norm": 1.503536343574524, "learning_rate": 0.0002, "epoch": 7.450110864745011, "step": 3360}, {"loss": 0.8716, "grad_norm": 1.2467454671859741, "learning_rate": 0.0002, "epoch": 7.472283813747229, "step": 3370}, {"loss": 0.9344, "grad_norm": 1.6118966341018677, "learning_rate": 0.0002, "epoch": 7.494456762749445, "step": 3380}, {"loss": 0.9107, "grad_norm": 1.399969220161438, "learning_rate": 0.0002, "epoch": 7.516629711751663, "step": 3390}, {"loss": 0.9315, "grad_norm": 1.369147777557373, "learning_rate": 0.0002, "epoch": 7.5388026607538805, "step": 3400}, {"loss": 1.0039, "grad_norm": 1.741153359413147, "learning_rate": 0.0002, "epoch": 7.560975609756097, "step": 3410}, {"loss": 0.8504, "grad_norm": 1.436596393585205, "learning_rate": 0.0002, "epoch": 7.583148558758315, "step": 3420}, {"loss": 0.9537, "grad_norm": 1.7102857828140259, "learning_rate": 0.0002, "epoch": 7.605321507760532, "step": 3430}, {"loss": 0.9977, "grad_norm": 1.3728266954421997, "learning_rate": 0.0002, "epoch": 7.627494456762749, "step": 3440}, {"loss": 1.015, "grad_norm": 1.4129058122634888, "learning_rate": 0.0002, "epoch": 7.6496674057649665, "step": 3450}, {"loss": 0.9672, "grad_norm": 1.6068756580352783, "learning_rate": 0.0002, "epoch": 7.671840354767184, "step": 3460}, {"loss": 0.9396, "grad_norm": 1.376522183418274, "learning_rate": 0.0002, "epoch": 7.694013303769402, "step": 3470}, {"loss": 0.9398, "grad_norm": 1.5918605327606201, "learning_rate": 0.0002, "epoch": 7.716186252771618, "step": 3480}, {"loss": 0.885, "grad_norm": 1.3888970613479614, "learning_rate": 0.0002, "epoch": 7.738359201773836, "step": 3490}, {"loss": 0.9283, "grad_norm": 1.3949130773544312, "learning_rate": 0.0002, "epoch": 7.760532150776053, "step": 3500}, {"loss": 0.9194, "grad_norm": 1.6619991064071655, "learning_rate": 0.0002, "epoch": 7.782705099778271, "step": 3510}, {"loss": 0.9956, "grad_norm": 1.6583504676818848, "learning_rate": 0.0002, "epoch": 7.804878048780488, "step": 3520}, {"loss": 0.9093, "grad_norm": 1.5198252201080322, "learning_rate": 0.0002, "epoch": 7.827050997782705, "step": 3530}, {"loss": 0.9916, "grad_norm": 1.5402783155441284, "learning_rate": 0.0002, "epoch": 7.849223946784923, "step": 3540}, {"loss": 0.9848, "grad_norm": 1.358048915863037, "learning_rate": 0.0002, "epoch": 7.871396895787139, "step": 3550}, {"loss": 0.9484, "grad_norm": 1.3957476615905762, "learning_rate": 0.0002, "epoch": 7.893569844789357, "step": 3560}, {"loss": 0.9471, "grad_norm": 1.381712555885315, "learning_rate": 0.0002, "epoch": 7.9157427937915745, "step": 3570}, {"loss": 0.9791, "grad_norm": 1.5783199071884155, "learning_rate": 0.0002, "epoch": 7.937915742793791, "step": 3580}, {"loss": 0.8974, "grad_norm": 1.5801693201065063, "learning_rate": 0.0002, "epoch": 7.960088691796009, "step": 3590}, {"loss": 0.9835, "grad_norm": 1.4844671487808228, "learning_rate": 0.0002, "epoch": 7.982261640798226, "step": 3600}]}