diff --git a/.gitattributes b/.gitattributes index 9b4ea3528cda749bf94d144d011ba57085863564..872ccada48d6c34b059aec052e302070aedb9f49 100644 --- a/.gitattributes +++ b/.gitattributes @@ -623,3 +623,12 @@ Meta-Llama-3-8B-Instruct_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq- Meta-Llama-3-8B-Instruct_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-926-sd-0/checkpoint-752/tokenizer.json filter=lfs diff=lfs merge=lfs -text Meta-Llama-3-8B-Instruct_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-926-sd-0/checkpoint-94/tokenizer.json filter=lfs diff=lfs merge=lfs -text Meta-Llama-3-8B-Instruct_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-926-sd-0/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..89a48e8f3f4598a00630f71918711bed7f28a108 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e013a1c133e2c4b36abf2e4a3ad2e255709b344b3cda721d8cb898dc5d1a8eb8 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2417a50f8178fa4b06c8b31d568b84dbd0ef766c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba6a0072f4cda322bbd33c66728dc7a376035a58f2127c5e8129b23b3655c57e +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ffb419800f6c974c4374943873997266a17c5db --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e636956625cc357a16a4ebbd9c831e638d6007a999d8c9d5fc7066fbb366ff6 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..29632c3448e257d3bf0b24e49280002ac0eb215f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0a94dfc4d4bb76c8e04736d6628984cb5f9a4b51ed6c3fa0f649711038d9aaa +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..57798761841982013a59609024cc9c5c238f2167 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7629012de9a2f741f71ead0fcc6bf0b49375818c4f4b47e0a48313b9a209ae4a +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..33f7a5b2fad8f1b691eb86a56db3f61fcd737195 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/trainer_state.json @@ -0,0 +1,1002 @@ +{ + "best_metric": 1.8250652551651, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900", + "epoch": 3.0, + "eval_steps": 10, + "global_step": 1350, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.022222222222222223, + "grad_norm": 0.5109436511993408, + "learning_rate": 0.0002, + "loss": 2.5874, + "step": 10 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 0.4870035946369171, + "learning_rate": 0.0002, + "loss": 2.2952, + "step": 20 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.535464882850647, + "learning_rate": 0.0002, + "loss": 2.0554, + "step": 30 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 0.49077996611595154, + "learning_rate": 0.0002, + "loss": 2.0067, + "step": 40 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.4671357572078705, + "learning_rate": 0.0002, + "loss": 2.0673, + "step": 50 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.4970313608646393, + "learning_rate": 0.0002, + "loss": 1.8751, + "step": 60 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 0.4438260495662689, + "learning_rate": 0.0002, + "loss": 1.9419, + "step": 70 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 0.5089705586433411, + "learning_rate": 0.0002, + "loss": 1.9856, + "step": 80 + }, + { + "epoch": 0.2, + "grad_norm": 0.4645078182220459, + "learning_rate": 0.0002, + "loss": 1.8805, + "step": 90 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.46095192432403564, + "learning_rate": 0.0002, + "loss": 1.807, + "step": 100 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 0.43338075280189514, + "learning_rate": 0.0002, + "loss": 1.891, + "step": 110 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.4433900713920593, + "learning_rate": 0.0002, + "loss": 1.84, + "step": 120 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 0.7018499970436096, + "learning_rate": 0.0002, + "loss": 1.8895, + "step": 130 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 0.37056994438171387, + "learning_rate": 0.0002, + "loss": 1.8273, + "step": 140 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.40634623169898987, + "learning_rate": 0.0002, + "loss": 1.8059, + "step": 150 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 0.41917353868484497, + "learning_rate": 0.0002, + "loss": 1.8393, + "step": 160 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 0.42392489314079285, + "learning_rate": 0.0002, + "loss": 1.8658, + "step": 170 + }, + { + "epoch": 0.4, + "grad_norm": 0.4281010627746582, + "learning_rate": 0.0002, + "loss": 1.8617, + "step": 180 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 0.38542497158050537, + "learning_rate": 0.0002, + "loss": 1.8163, + "step": 190 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.36003032326698303, + "learning_rate": 0.0002, + "loss": 1.8951, + "step": 200 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.37858229875564575, + "learning_rate": 0.0002, + "loss": 1.8611, + "step": 210 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 0.49986031651496887, + "learning_rate": 0.0002, + "loss": 1.8078, + "step": 220 + }, + { + "epoch": 0.5111111111111111, + "grad_norm": 0.3937094807624817, + "learning_rate": 0.0002, + "loss": 1.7413, + "step": 230 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.4566134512424469, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 240 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.3602476418018341, + "learning_rate": 0.0002, + "loss": 1.8303, + "step": 250 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 0.36321184039115906, + "learning_rate": 0.0002, + "loss": 1.7963, + "step": 260 + }, + { + "epoch": 0.6, + "grad_norm": 0.3808199167251587, + "learning_rate": 0.0002, + "loss": 1.9055, + "step": 270 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 0.38910621404647827, + "learning_rate": 0.0002, + "loss": 1.7916, + "step": 280 + }, + { + "epoch": 0.6444444444444445, + "grad_norm": 0.31913551688194275, + "learning_rate": 0.0002, + "loss": 1.7958, + "step": 290 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.34734025597572327, + "learning_rate": 0.0002, + "loss": 1.7468, + "step": 300 + }, + { + "epoch": 0.6888888888888889, + "grad_norm": 0.3517725467681885, + "learning_rate": 0.0002, + "loss": 1.8092, + "step": 310 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 0.3804526627063751, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 320 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.5592505931854248, + "learning_rate": 0.0002, + "loss": 1.8131, + "step": 330 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 0.36154472827911377, + "learning_rate": 0.0002, + "loss": 1.8827, + "step": 340 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.43970227241516113, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 350 + }, + { + "epoch": 0.8, + "grad_norm": 0.3525223731994629, + "learning_rate": 0.0002, + "loss": 1.8333, + "step": 360 + }, + { + "epoch": 0.8222222222222222, + "grad_norm": 0.3706997036933899, + "learning_rate": 0.0002, + "loss": 1.7802, + "step": 370 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 0.34138166904449463, + "learning_rate": 0.0002, + "loss": 1.7937, + "step": 380 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.4090622365474701, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 390 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.3729974031448364, + "learning_rate": 0.0002, + "loss": 1.7685, + "step": 400 + }, + { + "epoch": 0.9111111111111111, + "grad_norm": 0.3742152452468872, + "learning_rate": 0.0002, + "loss": 1.836, + "step": 410 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.37685129046440125, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 420 + }, + { + "epoch": 0.9555555555555556, + "grad_norm": 0.34339913725852966, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 430 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 0.31139856576919556, + "learning_rate": 0.0002, + "loss": 1.7651, + "step": 440 + }, + { + "epoch": 1.0, + "grad_norm": 0.3577502965927124, + "learning_rate": 0.0002, + "loss": 1.8471, + "step": 450 + }, + { + "epoch": 1.0, + "eval_loss": 1.8310279846191406, + "eval_runtime": 38.8374, + "eval_samples_per_second": 13.26, + "eval_steps_per_second": 1.674, + "step": 450 + }, + { + "epoch": 1.0222222222222221, + "grad_norm": 0.3257788419723511, + "learning_rate": 0.0002, + "loss": 1.7741, + "step": 460 + }, + { + "epoch": 1.0444444444444445, + "grad_norm": 0.3878970146179199, + "learning_rate": 0.0002, + "loss": 1.8031, + "step": 470 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 0.364427387714386, + "learning_rate": 0.0002, + "loss": 1.701, + "step": 480 + }, + { + "epoch": 1.0888888888888888, + "grad_norm": 0.3374682664871216, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 490 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.35822123289108276, + "learning_rate": 0.0002, + "loss": 1.7308, + "step": 500 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 0.3748345673084259, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 510 + }, + { + "epoch": 1.1555555555555554, + "grad_norm": 0.3422437012195587, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 520 + }, + { + "epoch": 1.1777777777777778, + "grad_norm": 0.4289326071739197, + "learning_rate": 0.0002, + "loss": 1.7649, + "step": 530 + }, + { + "epoch": 1.2, + "grad_norm": 0.3706769645214081, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 540 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.4024733603000641, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 550 + }, + { + "epoch": 1.2444444444444445, + "grad_norm": 0.3960128128528595, + "learning_rate": 0.0002, + "loss": 1.7641, + "step": 560 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 0.38222864270210266, + "learning_rate": 0.0002, + "loss": 1.755, + "step": 570 + }, + { + "epoch": 1.2888888888888888, + "grad_norm": 0.4073713421821594, + "learning_rate": 0.0002, + "loss": 1.735, + "step": 580 + }, + { + "epoch": 1.3111111111111111, + "grad_norm": 0.3875499963760376, + "learning_rate": 0.0002, + "loss": 1.7183, + "step": 590 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.39740806818008423, + "learning_rate": 0.0002, + "loss": 1.7492, + "step": 600 + }, + { + "epoch": 1.3555555555555556, + "grad_norm": 0.38432490825653076, + "learning_rate": 0.0002, + "loss": 1.8478, + "step": 610 + }, + { + "epoch": 1.3777777777777778, + "grad_norm": 0.402729868888855, + "learning_rate": 0.0002, + "loss": 1.7327, + "step": 620 + }, + { + "epoch": 1.4, + "grad_norm": 0.36683231592178345, + "learning_rate": 0.0002, + "loss": 1.6634, + "step": 630 + }, + { + "epoch": 1.4222222222222223, + "grad_norm": 0.3883286714553833, + "learning_rate": 0.0002, + "loss": 1.8059, + "step": 640 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.4087409973144531, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 650 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 0.4042017459869385, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 660 + }, + { + "epoch": 1.488888888888889, + "grad_norm": 0.40149256587028503, + "learning_rate": 0.0002, + "loss": 1.7466, + "step": 670 + }, + { + "epoch": 1.511111111111111, + "grad_norm": 0.45146510004997253, + "learning_rate": 0.0002, + "loss": 1.7398, + "step": 680 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.4098089039325714, + "learning_rate": 0.0002, + "loss": 1.6923, + "step": 690 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.4181336760520935, + "learning_rate": 0.0002, + "loss": 1.6847, + "step": 700 + }, + { + "epoch": 1.5777777777777777, + "grad_norm": 1.3722974061965942, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 710 + }, + { + "epoch": 1.6, + "grad_norm": 0.3965230882167816, + "learning_rate": 0.0002, + "loss": 1.7424, + "step": 720 + }, + { + "epoch": 1.6222222222222222, + "grad_norm": 0.3842000663280487, + "learning_rate": 0.0002, + "loss": 1.7782, + "step": 730 + }, + { + "epoch": 1.6444444444444444, + "grad_norm": 0.3603688180446625, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 740 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.39973509311676025, + "learning_rate": 0.0002, + "loss": 1.7476, + "step": 750 + }, + { + "epoch": 1.6888888888888889, + "grad_norm": 0.3687385618686676, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 760 + }, + { + "epoch": 1.7111111111111112, + "grad_norm": 0.4267722964286804, + "learning_rate": 0.0002, + "loss": 1.7987, + "step": 770 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 0.41301295161247253, + "learning_rate": 0.0002, + "loss": 1.8041, + "step": 780 + }, + { + "epoch": 1.7555555555555555, + "grad_norm": 0.3945430517196655, + "learning_rate": 0.0002, + "loss": 1.7284, + "step": 790 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.4037930965423584, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 800 + }, + { + "epoch": 1.8, + "grad_norm": 0.406893253326416, + "learning_rate": 0.0002, + "loss": 1.6874, + "step": 810 + }, + { + "epoch": 1.8222222222222222, + "grad_norm": 0.4600457549095154, + "learning_rate": 0.0002, + "loss": 1.7628, + "step": 820 + }, + { + "epoch": 1.8444444444444446, + "grad_norm": 0.4195384085178375, + "learning_rate": 0.0002, + "loss": 1.8222, + "step": 830 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 0.3854130506515503, + "learning_rate": 0.0002, + "loss": 1.7123, + "step": 840 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.38279038667678833, + "learning_rate": 0.0002, + "loss": 1.7227, + "step": 850 + }, + { + "epoch": 1.911111111111111, + "grad_norm": 0.38249439001083374, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 860 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 0.42977792024612427, + "learning_rate": 0.0002, + "loss": 1.8058, + "step": 870 + }, + { + "epoch": 1.9555555555555557, + "grad_norm": 0.4109351933002472, + "learning_rate": 0.0002, + "loss": 1.713, + "step": 880 + }, + { + "epoch": 1.9777777777777779, + "grad_norm": 0.3734486699104309, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 890 + }, + { + "epoch": 2.0, + "grad_norm": 0.3603087067604065, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 900 + }, + { + "epoch": 2.0, + "eval_loss": 1.8250652551651, + "eval_runtime": 38.8657, + "eval_samples_per_second": 13.251, + "eval_steps_per_second": 1.672, + "step": 900 + }, + { + "epoch": 2.022222222222222, + "grad_norm": 0.4014144241809845, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 910 + }, + { + "epoch": 2.0444444444444443, + "grad_norm": 0.4338063597679138, + "learning_rate": 0.0002, + "loss": 1.6604, + "step": 920 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 0.3693605065345764, + "learning_rate": 0.0002, + "loss": 1.6354, + "step": 930 + }, + { + "epoch": 2.088888888888889, + "grad_norm": 0.4040255844593048, + "learning_rate": 0.0002, + "loss": 1.6186, + "step": 940 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.43481820821762085, + "learning_rate": 0.0002, + "loss": 1.5976, + "step": 950 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 0.41632869839668274, + "learning_rate": 0.0002, + "loss": 1.6081, + "step": 960 + }, + { + "epoch": 2.1555555555555554, + "grad_norm": 0.4633755385875702, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 970 + }, + { + "epoch": 2.1777777777777776, + "grad_norm": 0.43926581740379333, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 980 + }, + { + "epoch": 2.2, + "grad_norm": 0.4757233262062073, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 990 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.5010586977005005, + "learning_rate": 0.0002, + "loss": 1.648, + "step": 1000 + }, + { + "epoch": 2.2444444444444445, + "grad_norm": 0.44900986552238464, + "learning_rate": 0.0002, + "loss": 1.6454, + "step": 1010 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 0.41274750232696533, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 1020 + }, + { + "epoch": 2.2888888888888888, + "grad_norm": 0.44672393798828125, + "learning_rate": 0.0002, + "loss": 1.6592, + "step": 1030 + }, + { + "epoch": 2.311111111111111, + "grad_norm": 0.4826269745826721, + "learning_rate": 0.0002, + "loss": 1.6349, + "step": 1040 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.4650685489177704, + "learning_rate": 0.0002, + "loss": 1.5988, + "step": 1050 + }, + { + "epoch": 2.3555555555555556, + "grad_norm": 0.42507848143577576, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 1060 + }, + { + "epoch": 2.3777777777777778, + "grad_norm": 0.45653030276298523, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 1070 + }, + { + "epoch": 2.4, + "grad_norm": 0.44534122943878174, + "learning_rate": 0.0002, + "loss": 1.6469, + "step": 1080 + }, + { + "epoch": 2.422222222222222, + "grad_norm": 0.4241289794445038, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 1090 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.5004808306694031, + "learning_rate": 0.0002, + "loss": 1.6935, + "step": 1100 + }, + { + "epoch": 2.466666666666667, + "grad_norm": 0.41425490379333496, + "learning_rate": 0.0002, + "loss": 1.6833, + "step": 1110 + }, + { + "epoch": 2.488888888888889, + "grad_norm": 0.44362279772758484, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 1120 + }, + { + "epoch": 2.511111111111111, + "grad_norm": 0.5530985593795776, + "learning_rate": 0.0002, + "loss": 1.6394, + "step": 1130 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 0.4290637969970703, + "learning_rate": 0.0002, + "loss": 1.64, + "step": 1140 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.4957487881183624, + "learning_rate": 0.0002, + "loss": 1.76, + "step": 1150 + }, + { + "epoch": 2.5777777777777775, + "grad_norm": 0.5082747340202332, + "learning_rate": 0.0002, + "loss": 1.613, + "step": 1160 + }, + { + "epoch": 2.6, + "grad_norm": 0.478722482919693, + "learning_rate": 0.0002, + "loss": 1.6702, + "step": 1170 + }, + { + "epoch": 2.6222222222222222, + "grad_norm": 0.436454176902771, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 1180 + }, + { + "epoch": 2.6444444444444444, + "grad_norm": 0.4905032515525818, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 1190 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.4815700054168701, + "learning_rate": 0.0002, + "loss": 1.6216, + "step": 1200 + }, + { + "epoch": 2.688888888888889, + "grad_norm": 0.3965534269809723, + "learning_rate": 0.0002, + "loss": 1.6994, + "step": 1210 + }, + { + "epoch": 2.7111111111111112, + "grad_norm": 0.43282169103622437, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 1220 + }, + { + "epoch": 2.7333333333333334, + "grad_norm": 0.45512479543685913, + "learning_rate": 0.0002, + "loss": 1.6455, + "step": 1230 + }, + { + "epoch": 2.7555555555555555, + "grad_norm": 0.44370076060295105, + "learning_rate": 0.0002, + "loss": 1.6444, + "step": 1240 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.4750686287879944, + "learning_rate": 0.0002, + "loss": 1.7106, + "step": 1250 + }, + { + "epoch": 2.8, + "grad_norm": 0.41953766345977783, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 1260 + }, + { + "epoch": 2.822222222222222, + "grad_norm": 0.4887140095233917, + "learning_rate": 0.0002, + "loss": 1.6203, + "step": 1270 + }, + { + "epoch": 2.8444444444444446, + "grad_norm": 0.46718958020210266, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 1280 + }, + { + "epoch": 2.8666666666666667, + "grad_norm": 0.48510900139808655, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 1290 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.4504084289073944, + "learning_rate": 0.0002, + "loss": 1.6825, + "step": 1300 + }, + { + "epoch": 2.911111111111111, + "grad_norm": 0.42119622230529785, + "learning_rate": 0.0002, + "loss": 1.7453, + "step": 1310 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 0.4763694107532501, + "learning_rate": 0.0002, + "loss": 1.6662, + "step": 1320 + }, + { + "epoch": 2.9555555555555557, + "grad_norm": 0.422810822725296, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 1330 + }, + { + "epoch": 2.977777777777778, + "grad_norm": 0.4768871068954468, + "learning_rate": 0.0002, + "loss": 1.6578, + "step": 1340 + }, + { + "epoch": 3.0, + "grad_norm": 0.48259881138801575, + "learning_rate": 0.0002, + "loss": 1.6167, + "step": 1350 + }, + { + "epoch": 3.0, + "eval_loss": 1.8452560901641846, + "eval_runtime": 38.8621, + "eval_samples_per_second": 13.252, + "eval_steps_per_second": 1.673, + "step": 1350 + } + ], + "logging_steps": 10, + "max_steps": 3600, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.24749772275712e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..08bcb50b9f534803ed2a4a4be696b74ae373982a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1350/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:333a3e3301276a753d48f5fabf4c169b8961c6e6ef7b7a7f931ba463d72983a4 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a4704722fdc93d77785ba57ade422f95805c1939 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6521d52360c5387386406cd2995000343da35c4346a883dae3c7fd82f4b06d3e +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..933ef673bf20a1564b1736a1904fb43399529041 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56fd0aab3c9b2142f6e80786b4f41f11b9ba45440b5d647666f4420ea9a8aa73 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..7c47158a9bb9de49f113026d8c52db015b42e4dc --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c45175c3b36f2bcb978142faddb342458ec190536e3875c82b8553012495a6a4 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f0445f0924bc70525355a65ac2f74d032ee68a9 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7dbbce861109f81365344312e20f507958abb05866e4ce9eda5aaa9a91c48b5 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bb9ca8badd01501523f73b670fb05516ea00e9d3 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/trainer_state.json @@ -0,0 +1,1325 @@ +{ + "best_metric": 1.8250652551651, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 1800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.022222222222222223, + "grad_norm": 0.5109436511993408, + "learning_rate": 0.0002, + "loss": 2.5874, + "step": 10 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 0.4870035946369171, + "learning_rate": 0.0002, + "loss": 2.2952, + "step": 20 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.535464882850647, + "learning_rate": 0.0002, + "loss": 2.0554, + "step": 30 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 0.49077996611595154, + "learning_rate": 0.0002, + "loss": 2.0067, + "step": 40 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.4671357572078705, + "learning_rate": 0.0002, + "loss": 2.0673, + "step": 50 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.4970313608646393, + "learning_rate": 0.0002, + "loss": 1.8751, + "step": 60 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 0.4438260495662689, + "learning_rate": 0.0002, + "loss": 1.9419, + "step": 70 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 0.5089705586433411, + "learning_rate": 0.0002, + "loss": 1.9856, + "step": 80 + }, + { + "epoch": 0.2, + "grad_norm": 0.4645078182220459, + "learning_rate": 0.0002, + "loss": 1.8805, + "step": 90 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.46095192432403564, + "learning_rate": 0.0002, + "loss": 1.807, + "step": 100 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 0.43338075280189514, + "learning_rate": 0.0002, + "loss": 1.891, + "step": 110 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.4433900713920593, + "learning_rate": 0.0002, + "loss": 1.84, + "step": 120 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 0.7018499970436096, + "learning_rate": 0.0002, + "loss": 1.8895, + "step": 130 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 0.37056994438171387, + "learning_rate": 0.0002, + "loss": 1.8273, + "step": 140 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.40634623169898987, + "learning_rate": 0.0002, + "loss": 1.8059, + "step": 150 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 0.41917353868484497, + "learning_rate": 0.0002, + "loss": 1.8393, + "step": 160 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 0.42392489314079285, + "learning_rate": 0.0002, + "loss": 1.8658, + "step": 170 + }, + { + "epoch": 0.4, + "grad_norm": 0.4281010627746582, + "learning_rate": 0.0002, + "loss": 1.8617, + "step": 180 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 0.38542497158050537, + "learning_rate": 0.0002, + "loss": 1.8163, + "step": 190 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.36003032326698303, + "learning_rate": 0.0002, + "loss": 1.8951, + "step": 200 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.37858229875564575, + "learning_rate": 0.0002, + "loss": 1.8611, + "step": 210 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 0.49986031651496887, + "learning_rate": 0.0002, + "loss": 1.8078, + "step": 220 + }, + { + "epoch": 0.5111111111111111, + "grad_norm": 0.3937094807624817, + "learning_rate": 0.0002, + "loss": 1.7413, + "step": 230 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.4566134512424469, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 240 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.3602476418018341, + "learning_rate": 0.0002, + "loss": 1.8303, + "step": 250 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 0.36321184039115906, + "learning_rate": 0.0002, + "loss": 1.7963, + "step": 260 + }, + { + "epoch": 0.6, + "grad_norm": 0.3808199167251587, + "learning_rate": 0.0002, + "loss": 1.9055, + "step": 270 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 0.38910621404647827, + "learning_rate": 0.0002, + "loss": 1.7916, + "step": 280 + }, + { + "epoch": 0.6444444444444445, + "grad_norm": 0.31913551688194275, + "learning_rate": 0.0002, + "loss": 1.7958, + "step": 290 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.34734025597572327, + "learning_rate": 0.0002, + "loss": 1.7468, + "step": 300 + }, + { + "epoch": 0.6888888888888889, + "grad_norm": 0.3517725467681885, + "learning_rate": 0.0002, + "loss": 1.8092, + "step": 310 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 0.3804526627063751, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 320 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.5592505931854248, + "learning_rate": 0.0002, + "loss": 1.8131, + "step": 330 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 0.36154472827911377, + "learning_rate": 0.0002, + "loss": 1.8827, + "step": 340 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.43970227241516113, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 350 + }, + { + "epoch": 0.8, + "grad_norm": 0.3525223731994629, + "learning_rate": 0.0002, + "loss": 1.8333, + "step": 360 + }, + { + "epoch": 0.8222222222222222, + "grad_norm": 0.3706997036933899, + "learning_rate": 0.0002, + "loss": 1.7802, + "step": 370 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 0.34138166904449463, + "learning_rate": 0.0002, + "loss": 1.7937, + "step": 380 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.4090622365474701, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 390 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.3729974031448364, + "learning_rate": 0.0002, + "loss": 1.7685, + "step": 400 + }, + { + "epoch": 0.9111111111111111, + "grad_norm": 0.3742152452468872, + "learning_rate": 0.0002, + "loss": 1.836, + "step": 410 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.37685129046440125, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 420 + }, + { + "epoch": 0.9555555555555556, + "grad_norm": 0.34339913725852966, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 430 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 0.31139856576919556, + "learning_rate": 0.0002, + "loss": 1.7651, + "step": 440 + }, + { + "epoch": 1.0, + "grad_norm": 0.3577502965927124, + "learning_rate": 0.0002, + "loss": 1.8471, + "step": 450 + }, + { + "epoch": 1.0, + "eval_loss": 1.8310279846191406, + "eval_runtime": 38.8374, + "eval_samples_per_second": 13.26, + "eval_steps_per_second": 1.674, + "step": 450 + }, + { + "epoch": 1.0222222222222221, + "grad_norm": 0.3257788419723511, + "learning_rate": 0.0002, + "loss": 1.7741, + "step": 460 + }, + { + "epoch": 1.0444444444444445, + "grad_norm": 0.3878970146179199, + "learning_rate": 0.0002, + "loss": 1.8031, + "step": 470 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 0.364427387714386, + "learning_rate": 0.0002, + "loss": 1.701, + "step": 480 + }, + { + "epoch": 1.0888888888888888, + "grad_norm": 0.3374682664871216, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 490 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.35822123289108276, + "learning_rate": 0.0002, + "loss": 1.7308, + "step": 500 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 0.3748345673084259, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 510 + }, + { + "epoch": 1.1555555555555554, + "grad_norm": 0.3422437012195587, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 520 + }, + { + "epoch": 1.1777777777777778, + "grad_norm": 0.4289326071739197, + "learning_rate": 0.0002, + "loss": 1.7649, + "step": 530 + }, + { + "epoch": 1.2, + "grad_norm": 0.3706769645214081, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 540 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.4024733603000641, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 550 + }, + { + "epoch": 1.2444444444444445, + "grad_norm": 0.3960128128528595, + "learning_rate": 0.0002, + "loss": 1.7641, + "step": 560 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 0.38222864270210266, + "learning_rate": 0.0002, + "loss": 1.755, + "step": 570 + }, + { + "epoch": 1.2888888888888888, + "grad_norm": 0.4073713421821594, + "learning_rate": 0.0002, + "loss": 1.735, + "step": 580 + }, + { + "epoch": 1.3111111111111111, + "grad_norm": 0.3875499963760376, + "learning_rate": 0.0002, + "loss": 1.7183, + "step": 590 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.39740806818008423, + "learning_rate": 0.0002, + "loss": 1.7492, + "step": 600 + }, + { + "epoch": 1.3555555555555556, + "grad_norm": 0.38432490825653076, + "learning_rate": 0.0002, + "loss": 1.8478, + "step": 610 + }, + { + "epoch": 1.3777777777777778, + "grad_norm": 0.402729868888855, + "learning_rate": 0.0002, + "loss": 1.7327, + "step": 620 + }, + { + "epoch": 1.4, + "grad_norm": 0.36683231592178345, + "learning_rate": 0.0002, + "loss": 1.6634, + "step": 630 + }, + { + "epoch": 1.4222222222222223, + "grad_norm": 0.3883286714553833, + "learning_rate": 0.0002, + "loss": 1.8059, + "step": 640 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.4087409973144531, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 650 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 0.4042017459869385, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 660 + }, + { + "epoch": 1.488888888888889, + "grad_norm": 0.40149256587028503, + "learning_rate": 0.0002, + "loss": 1.7466, + "step": 670 + }, + { + "epoch": 1.511111111111111, + "grad_norm": 0.45146510004997253, + "learning_rate": 0.0002, + "loss": 1.7398, + "step": 680 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.4098089039325714, + "learning_rate": 0.0002, + "loss": 1.6923, + "step": 690 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.4181336760520935, + "learning_rate": 0.0002, + "loss": 1.6847, + "step": 700 + }, + { + "epoch": 1.5777777777777777, + "grad_norm": 1.3722974061965942, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 710 + }, + { + "epoch": 1.6, + "grad_norm": 0.3965230882167816, + "learning_rate": 0.0002, + "loss": 1.7424, + "step": 720 + }, + { + "epoch": 1.6222222222222222, + "grad_norm": 0.3842000663280487, + "learning_rate": 0.0002, + "loss": 1.7782, + "step": 730 + }, + { + "epoch": 1.6444444444444444, + "grad_norm": 0.3603688180446625, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 740 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.39973509311676025, + "learning_rate": 0.0002, + "loss": 1.7476, + "step": 750 + }, + { + "epoch": 1.6888888888888889, + "grad_norm": 0.3687385618686676, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 760 + }, + { + "epoch": 1.7111111111111112, + "grad_norm": 0.4267722964286804, + "learning_rate": 0.0002, + "loss": 1.7987, + "step": 770 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 0.41301295161247253, + "learning_rate": 0.0002, + "loss": 1.8041, + "step": 780 + }, + { + "epoch": 1.7555555555555555, + "grad_norm": 0.3945430517196655, + "learning_rate": 0.0002, + "loss": 1.7284, + "step": 790 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.4037930965423584, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 800 + }, + { + "epoch": 1.8, + "grad_norm": 0.406893253326416, + "learning_rate": 0.0002, + "loss": 1.6874, + "step": 810 + }, + { + "epoch": 1.8222222222222222, + "grad_norm": 0.4600457549095154, + "learning_rate": 0.0002, + "loss": 1.7628, + "step": 820 + }, + { + "epoch": 1.8444444444444446, + "grad_norm": 0.4195384085178375, + "learning_rate": 0.0002, + "loss": 1.8222, + "step": 830 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 0.3854130506515503, + "learning_rate": 0.0002, + "loss": 1.7123, + "step": 840 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.38279038667678833, + "learning_rate": 0.0002, + "loss": 1.7227, + "step": 850 + }, + { + "epoch": 1.911111111111111, + "grad_norm": 0.38249439001083374, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 860 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 0.42977792024612427, + "learning_rate": 0.0002, + "loss": 1.8058, + "step": 870 + }, + { + "epoch": 1.9555555555555557, + "grad_norm": 0.4109351933002472, + "learning_rate": 0.0002, + "loss": 1.713, + "step": 880 + }, + { + "epoch": 1.9777777777777779, + "grad_norm": 0.3734486699104309, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 890 + }, + { + "epoch": 2.0, + "grad_norm": 0.3603087067604065, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 900 + }, + { + "epoch": 2.0, + "eval_loss": 1.8250652551651, + "eval_runtime": 38.8657, + "eval_samples_per_second": 13.251, + "eval_steps_per_second": 1.672, + "step": 900 + }, + { + "epoch": 2.022222222222222, + "grad_norm": 0.4014144241809845, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 910 + }, + { + "epoch": 2.0444444444444443, + "grad_norm": 0.4338063597679138, + "learning_rate": 0.0002, + "loss": 1.6604, + "step": 920 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 0.3693605065345764, + "learning_rate": 0.0002, + "loss": 1.6354, + "step": 930 + }, + { + "epoch": 2.088888888888889, + "grad_norm": 0.4040255844593048, + "learning_rate": 0.0002, + "loss": 1.6186, + "step": 940 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.43481820821762085, + "learning_rate": 0.0002, + "loss": 1.5976, + "step": 950 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 0.41632869839668274, + "learning_rate": 0.0002, + "loss": 1.6081, + "step": 960 + }, + { + "epoch": 2.1555555555555554, + "grad_norm": 0.4633755385875702, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 970 + }, + { + "epoch": 2.1777777777777776, + "grad_norm": 0.43926581740379333, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 980 + }, + { + "epoch": 2.2, + "grad_norm": 0.4757233262062073, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 990 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.5010586977005005, + "learning_rate": 0.0002, + "loss": 1.648, + "step": 1000 + }, + { + "epoch": 2.2444444444444445, + "grad_norm": 0.44900986552238464, + "learning_rate": 0.0002, + "loss": 1.6454, + "step": 1010 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 0.41274750232696533, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 1020 + }, + { + "epoch": 2.2888888888888888, + "grad_norm": 0.44672393798828125, + "learning_rate": 0.0002, + "loss": 1.6592, + "step": 1030 + }, + { + "epoch": 2.311111111111111, + "grad_norm": 0.4826269745826721, + "learning_rate": 0.0002, + "loss": 1.6349, + "step": 1040 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.4650685489177704, + "learning_rate": 0.0002, + "loss": 1.5988, + "step": 1050 + }, + { + "epoch": 2.3555555555555556, + "grad_norm": 0.42507848143577576, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 1060 + }, + { + "epoch": 2.3777777777777778, + "grad_norm": 0.45653030276298523, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 1070 + }, + { + "epoch": 2.4, + "grad_norm": 0.44534122943878174, + "learning_rate": 0.0002, + "loss": 1.6469, + "step": 1080 + }, + { + "epoch": 2.422222222222222, + "grad_norm": 0.4241289794445038, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 1090 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.5004808306694031, + "learning_rate": 0.0002, + "loss": 1.6935, + "step": 1100 + }, + { + "epoch": 2.466666666666667, + "grad_norm": 0.41425490379333496, + "learning_rate": 0.0002, + "loss": 1.6833, + "step": 1110 + }, + { + "epoch": 2.488888888888889, + "grad_norm": 0.44362279772758484, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 1120 + }, + { + "epoch": 2.511111111111111, + "grad_norm": 0.5530985593795776, + "learning_rate": 0.0002, + "loss": 1.6394, + "step": 1130 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 0.4290637969970703, + "learning_rate": 0.0002, + "loss": 1.64, + "step": 1140 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.4957487881183624, + "learning_rate": 0.0002, + "loss": 1.76, + "step": 1150 + }, + { + "epoch": 2.5777777777777775, + "grad_norm": 0.5082747340202332, + "learning_rate": 0.0002, + "loss": 1.613, + "step": 1160 + }, + { + "epoch": 2.6, + "grad_norm": 0.478722482919693, + "learning_rate": 0.0002, + "loss": 1.6702, + "step": 1170 + }, + { + "epoch": 2.6222222222222222, + "grad_norm": 0.436454176902771, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 1180 + }, + { + "epoch": 2.6444444444444444, + "grad_norm": 0.4905032515525818, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 1190 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.4815700054168701, + "learning_rate": 0.0002, + "loss": 1.6216, + "step": 1200 + }, + { + "epoch": 2.688888888888889, + "grad_norm": 0.3965534269809723, + "learning_rate": 0.0002, + "loss": 1.6994, + "step": 1210 + }, + { + "epoch": 2.7111111111111112, + "grad_norm": 0.43282169103622437, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 1220 + }, + { + "epoch": 2.7333333333333334, + "grad_norm": 0.45512479543685913, + "learning_rate": 0.0002, + "loss": 1.6455, + "step": 1230 + }, + { + "epoch": 2.7555555555555555, + "grad_norm": 0.44370076060295105, + "learning_rate": 0.0002, + "loss": 1.6444, + "step": 1240 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.4750686287879944, + "learning_rate": 0.0002, + "loss": 1.7106, + "step": 1250 + }, + { + "epoch": 2.8, + "grad_norm": 0.41953766345977783, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 1260 + }, + { + "epoch": 2.822222222222222, + "grad_norm": 0.4887140095233917, + "learning_rate": 0.0002, + "loss": 1.6203, + "step": 1270 + }, + { + "epoch": 2.8444444444444446, + "grad_norm": 0.46718958020210266, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 1280 + }, + { + "epoch": 2.8666666666666667, + "grad_norm": 0.48510900139808655, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 1290 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.4504084289073944, + "learning_rate": 0.0002, + "loss": 1.6825, + "step": 1300 + }, + { + "epoch": 2.911111111111111, + "grad_norm": 0.42119622230529785, + "learning_rate": 0.0002, + "loss": 1.7453, + "step": 1310 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 0.4763694107532501, + "learning_rate": 0.0002, + "loss": 1.6662, + "step": 1320 + }, + { + "epoch": 2.9555555555555557, + "grad_norm": 0.422810822725296, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 1330 + }, + { + "epoch": 2.977777777777778, + "grad_norm": 0.4768871068954468, + "learning_rate": 0.0002, + "loss": 1.6578, + "step": 1340 + }, + { + "epoch": 3.0, + "grad_norm": 0.48259881138801575, + "learning_rate": 0.0002, + "loss": 1.6167, + "step": 1350 + }, + { + "epoch": 3.0, + "eval_loss": 1.8452560901641846, + "eval_runtime": 38.8621, + "eval_samples_per_second": 13.252, + "eval_steps_per_second": 1.673, + "step": 1350 + }, + { + "epoch": 3.022222222222222, + "grad_norm": 0.6933313012123108, + "learning_rate": 0.0002, + "loss": 1.5351, + "step": 1360 + }, + { + "epoch": 3.0444444444444443, + "grad_norm": 0.5870710611343384, + "learning_rate": 0.0002, + "loss": 1.5542, + "step": 1370 + }, + { + "epoch": 3.066666666666667, + "grad_norm": 0.602210283279419, + "learning_rate": 0.0002, + "loss": 1.511, + "step": 1380 + }, + { + "epoch": 3.088888888888889, + "grad_norm": 0.6461787819862366, + "learning_rate": 0.0002, + "loss": 1.5272, + "step": 1390 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 0.5839587450027466, + "learning_rate": 0.0002, + "loss": 1.4813, + "step": 1400 + }, + { + "epoch": 3.1333333333333333, + "grad_norm": 0.5757876038551331, + "learning_rate": 0.0002, + "loss": 1.505, + "step": 1410 + }, + { + "epoch": 3.1555555555555554, + "grad_norm": 0.5862616300582886, + "learning_rate": 0.0002, + "loss": 1.4963, + "step": 1420 + }, + { + "epoch": 3.1777777777777776, + "grad_norm": 0.6103630065917969, + "learning_rate": 0.0002, + "loss": 1.5144, + "step": 1430 + }, + { + "epoch": 3.2, + "grad_norm": 0.9309254884719849, + "learning_rate": 0.0002, + "loss": 1.5406, + "step": 1440 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.5360018014907837, + "learning_rate": 0.0002, + "loss": 1.487, + "step": 1450 + }, + { + "epoch": 3.2444444444444445, + "grad_norm": 0.5448758602142334, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 1460 + }, + { + "epoch": 3.2666666666666666, + "grad_norm": 0.5973812341690063, + "learning_rate": 0.0002, + "loss": 1.5595, + "step": 1470 + }, + { + "epoch": 3.2888888888888888, + "grad_norm": 0.6245622038841248, + "learning_rate": 0.0002, + "loss": 1.5223, + "step": 1480 + }, + { + "epoch": 3.311111111111111, + "grad_norm": 0.6533768773078918, + "learning_rate": 0.0002, + "loss": 1.4795, + "step": 1490 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.5765811204910278, + "learning_rate": 0.0002, + "loss": 1.5562, + "step": 1500 + }, + { + "epoch": 3.3555555555555556, + "grad_norm": 0.591395378112793, + "learning_rate": 0.0002, + "loss": 1.5405, + "step": 1510 + }, + { + "epoch": 3.3777777777777778, + "grad_norm": 0.5842425227165222, + "learning_rate": 0.0002, + "loss": 1.5658, + "step": 1520 + }, + { + "epoch": 3.4, + "grad_norm": 0.5731365084648132, + "learning_rate": 0.0002, + "loss": 1.5065, + "step": 1530 + }, + { + "epoch": 3.422222222222222, + "grad_norm": 0.5841306447982788, + "learning_rate": 0.0002, + "loss": 1.5438, + "step": 1540 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.6503536701202393, + "learning_rate": 0.0002, + "loss": 1.4922, + "step": 1550 + }, + { + "epoch": 3.466666666666667, + "grad_norm": 0.6170967221260071, + "learning_rate": 0.0002, + "loss": 1.5493, + "step": 1560 + }, + { + "epoch": 3.488888888888889, + "grad_norm": 0.5576487183570862, + "learning_rate": 0.0002, + "loss": 1.5098, + "step": 1570 + }, + { + "epoch": 3.511111111111111, + "grad_norm": 0.7082911133766174, + "learning_rate": 0.0002, + "loss": 1.472, + "step": 1580 + }, + { + "epoch": 3.533333333333333, + "grad_norm": 0.6159376502037048, + "learning_rate": 0.0002, + "loss": 1.5594, + "step": 1590 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 0.5972959399223328, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 1600 + }, + { + "epoch": 3.5777777777777775, + "grad_norm": 0.5787310004234314, + "learning_rate": 0.0002, + "loss": 1.4876, + "step": 1610 + }, + { + "epoch": 3.6, + "grad_norm": 0.5846341252326965, + "learning_rate": 0.0002, + "loss": 1.4887, + "step": 1620 + }, + { + "epoch": 3.6222222222222222, + "grad_norm": 0.5906197428703308, + "learning_rate": 0.0002, + "loss": 1.542, + "step": 1630 + }, + { + "epoch": 3.6444444444444444, + "grad_norm": 0.6305760145187378, + "learning_rate": 0.0002, + "loss": 1.4941, + "step": 1640 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.7448979616165161, + "learning_rate": 0.0002, + "loss": 1.4677, + "step": 1650 + }, + { + "epoch": 3.688888888888889, + "grad_norm": 0.5906165242195129, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 1660 + }, + { + "epoch": 3.7111111111111112, + "grad_norm": 0.605032742023468, + "learning_rate": 0.0002, + "loss": 1.4882, + "step": 1670 + }, + { + "epoch": 3.7333333333333334, + "grad_norm": 0.6117229461669922, + "learning_rate": 0.0002, + "loss": 1.5804, + "step": 1680 + }, + { + "epoch": 3.7555555555555555, + "grad_norm": 0.613581120967865, + "learning_rate": 0.0002, + "loss": 1.5131, + "step": 1690 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 0.6244436502456665, + "learning_rate": 0.0002, + "loss": 1.5074, + "step": 1700 + }, + { + "epoch": 3.8, + "grad_norm": 0.6236702799797058, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 1710 + }, + { + "epoch": 3.822222222222222, + "grad_norm": 0.639141857624054, + "learning_rate": 0.0002, + "loss": 1.6542, + "step": 1720 + }, + { + "epoch": 3.8444444444444446, + "grad_norm": 0.5782344937324524, + "learning_rate": 0.0002, + "loss": 1.536, + "step": 1730 + }, + { + "epoch": 3.8666666666666667, + "grad_norm": 0.5952938795089722, + "learning_rate": 0.0002, + "loss": 1.5355, + "step": 1740 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.5573042035102844, + "learning_rate": 0.0002, + "loss": 1.5205, + "step": 1750 + }, + { + "epoch": 3.911111111111111, + "grad_norm": 0.6114351749420166, + "learning_rate": 0.0002, + "loss": 1.5066, + "step": 1760 + }, + { + "epoch": 3.9333333333333336, + "grad_norm": 0.5973817110061646, + "learning_rate": 0.0002, + "loss": 1.5706, + "step": 1770 + }, + { + "epoch": 3.9555555555555557, + "grad_norm": 0.602317750453949, + "learning_rate": 0.0002, + "loss": 1.5003, + "step": 1780 + }, + { + "epoch": 3.977777777777778, + "grad_norm": 0.5965437293052673, + "learning_rate": 0.0002, + "loss": 1.5022, + "step": 1790 + }, + { + "epoch": 4.0, + "grad_norm": 0.5641552209854126, + "learning_rate": 0.0002, + "loss": 1.5031, + "step": 1800 + }, + { + "epoch": 4.0, + "eval_loss": 1.892098069190979, + "eval_runtime": 38.8755, + "eval_samples_per_second": 13.247, + "eval_steps_per_second": 1.672, + "step": 1800 + } + ], + "logging_steps": 10, + "max_steps": 3600, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.32999696367616e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..08bcb50b9f534803ed2a4a4be696b74ae373982a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-1800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:333a3e3301276a753d48f5fabf4c169b8961c6e6ef7b7a7f931ba463d72983a4 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..112ac34fff5b5e2eb4cd44669dbfc2cae5c96966 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62f0e46312ce973ebe85bf4da57e7f5365604c45499392d82d35d73bec9da467 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e190228f38140caed94e82f5e931e5e90fc92b0a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8809a64a39c3fcc2d69f2bd3c5d57b86a77d7feaa5373e84388e030ee66748c1 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1dcc8e434c066706310be3620423de813acd831b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f0e6eb1fae3ace7af28189ce7c74619fdbaaef41229ff767cf571de84a60cbb +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1dce18b7c5e46565177df7a55995d21d8d07035c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec90720690d9bc72c277d37c452ce5a3a646d1879ecf2a784559bc30250d08df +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d56302d1c039e88bd38480b44293b160b925e05b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/trainer_state.json @@ -0,0 +1,1648 @@ +{ + "best_metric": 1.8250652551651, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900", + "epoch": 5.0, + "eval_steps": 10, + "global_step": 2250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.022222222222222223, + "grad_norm": 0.5109436511993408, + "learning_rate": 0.0002, + "loss": 2.5874, + "step": 10 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 0.4870035946369171, + "learning_rate": 0.0002, + "loss": 2.2952, + "step": 20 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.535464882850647, + "learning_rate": 0.0002, + "loss": 2.0554, + "step": 30 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 0.49077996611595154, + "learning_rate": 0.0002, + "loss": 2.0067, + "step": 40 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.4671357572078705, + "learning_rate": 0.0002, + "loss": 2.0673, + "step": 50 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.4970313608646393, + "learning_rate": 0.0002, + "loss": 1.8751, + "step": 60 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 0.4438260495662689, + "learning_rate": 0.0002, + "loss": 1.9419, + "step": 70 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 0.5089705586433411, + "learning_rate": 0.0002, + "loss": 1.9856, + "step": 80 + }, + { + "epoch": 0.2, + "grad_norm": 0.4645078182220459, + "learning_rate": 0.0002, + "loss": 1.8805, + "step": 90 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.46095192432403564, + "learning_rate": 0.0002, + "loss": 1.807, + "step": 100 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 0.43338075280189514, + "learning_rate": 0.0002, + "loss": 1.891, + "step": 110 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.4433900713920593, + "learning_rate": 0.0002, + "loss": 1.84, + "step": 120 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 0.7018499970436096, + "learning_rate": 0.0002, + "loss": 1.8895, + "step": 130 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 0.37056994438171387, + "learning_rate": 0.0002, + "loss": 1.8273, + "step": 140 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.40634623169898987, + "learning_rate": 0.0002, + "loss": 1.8059, + "step": 150 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 0.41917353868484497, + "learning_rate": 0.0002, + "loss": 1.8393, + "step": 160 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 0.42392489314079285, + "learning_rate": 0.0002, + "loss": 1.8658, + "step": 170 + }, + { + "epoch": 0.4, + "grad_norm": 0.4281010627746582, + "learning_rate": 0.0002, + "loss": 1.8617, + "step": 180 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 0.38542497158050537, + "learning_rate": 0.0002, + "loss": 1.8163, + "step": 190 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.36003032326698303, + "learning_rate": 0.0002, + "loss": 1.8951, + "step": 200 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.37858229875564575, + "learning_rate": 0.0002, + "loss": 1.8611, + "step": 210 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 0.49986031651496887, + "learning_rate": 0.0002, + "loss": 1.8078, + "step": 220 + }, + { + "epoch": 0.5111111111111111, + "grad_norm": 0.3937094807624817, + "learning_rate": 0.0002, + "loss": 1.7413, + "step": 230 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.4566134512424469, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 240 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.3602476418018341, + "learning_rate": 0.0002, + "loss": 1.8303, + "step": 250 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 0.36321184039115906, + "learning_rate": 0.0002, + "loss": 1.7963, + "step": 260 + }, + { + "epoch": 0.6, + "grad_norm": 0.3808199167251587, + "learning_rate": 0.0002, + "loss": 1.9055, + "step": 270 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 0.38910621404647827, + "learning_rate": 0.0002, + "loss": 1.7916, + "step": 280 + }, + { + "epoch": 0.6444444444444445, + "grad_norm": 0.31913551688194275, + "learning_rate": 0.0002, + "loss": 1.7958, + "step": 290 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.34734025597572327, + "learning_rate": 0.0002, + "loss": 1.7468, + "step": 300 + }, + { + "epoch": 0.6888888888888889, + "grad_norm": 0.3517725467681885, + "learning_rate": 0.0002, + "loss": 1.8092, + "step": 310 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 0.3804526627063751, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 320 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.5592505931854248, + "learning_rate": 0.0002, + "loss": 1.8131, + "step": 330 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 0.36154472827911377, + "learning_rate": 0.0002, + "loss": 1.8827, + "step": 340 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.43970227241516113, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 350 + }, + { + "epoch": 0.8, + "grad_norm": 0.3525223731994629, + "learning_rate": 0.0002, + "loss": 1.8333, + "step": 360 + }, + { + "epoch": 0.8222222222222222, + "grad_norm": 0.3706997036933899, + "learning_rate": 0.0002, + "loss": 1.7802, + "step": 370 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 0.34138166904449463, + "learning_rate": 0.0002, + "loss": 1.7937, + "step": 380 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.4090622365474701, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 390 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.3729974031448364, + "learning_rate": 0.0002, + "loss": 1.7685, + "step": 400 + }, + { + "epoch": 0.9111111111111111, + "grad_norm": 0.3742152452468872, + "learning_rate": 0.0002, + "loss": 1.836, + "step": 410 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.37685129046440125, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 420 + }, + { + "epoch": 0.9555555555555556, + "grad_norm": 0.34339913725852966, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 430 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 0.31139856576919556, + "learning_rate": 0.0002, + "loss": 1.7651, + "step": 440 + }, + { + "epoch": 1.0, + "grad_norm": 0.3577502965927124, + "learning_rate": 0.0002, + "loss": 1.8471, + "step": 450 + }, + { + "epoch": 1.0, + "eval_loss": 1.8310279846191406, + "eval_runtime": 38.8374, + "eval_samples_per_second": 13.26, + "eval_steps_per_second": 1.674, + "step": 450 + }, + { + "epoch": 1.0222222222222221, + "grad_norm": 0.3257788419723511, + "learning_rate": 0.0002, + "loss": 1.7741, + "step": 460 + }, + { + "epoch": 1.0444444444444445, + "grad_norm": 0.3878970146179199, + "learning_rate": 0.0002, + "loss": 1.8031, + "step": 470 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 0.364427387714386, + "learning_rate": 0.0002, + "loss": 1.701, + "step": 480 + }, + { + "epoch": 1.0888888888888888, + "grad_norm": 0.3374682664871216, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 490 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.35822123289108276, + "learning_rate": 0.0002, + "loss": 1.7308, + "step": 500 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 0.3748345673084259, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 510 + }, + { + "epoch": 1.1555555555555554, + "grad_norm": 0.3422437012195587, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 520 + }, + { + "epoch": 1.1777777777777778, + "grad_norm": 0.4289326071739197, + "learning_rate": 0.0002, + "loss": 1.7649, + "step": 530 + }, + { + "epoch": 1.2, + "grad_norm": 0.3706769645214081, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 540 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.4024733603000641, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 550 + }, + { + "epoch": 1.2444444444444445, + "grad_norm": 0.3960128128528595, + "learning_rate": 0.0002, + "loss": 1.7641, + "step": 560 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 0.38222864270210266, + "learning_rate": 0.0002, + "loss": 1.755, + "step": 570 + }, + { + "epoch": 1.2888888888888888, + "grad_norm": 0.4073713421821594, + "learning_rate": 0.0002, + "loss": 1.735, + "step": 580 + }, + { + "epoch": 1.3111111111111111, + "grad_norm": 0.3875499963760376, + "learning_rate": 0.0002, + "loss": 1.7183, + "step": 590 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.39740806818008423, + "learning_rate": 0.0002, + "loss": 1.7492, + "step": 600 + }, + { + "epoch": 1.3555555555555556, + "grad_norm": 0.38432490825653076, + "learning_rate": 0.0002, + "loss": 1.8478, + "step": 610 + }, + { + "epoch": 1.3777777777777778, + "grad_norm": 0.402729868888855, + "learning_rate": 0.0002, + "loss": 1.7327, + "step": 620 + }, + { + "epoch": 1.4, + "grad_norm": 0.36683231592178345, + "learning_rate": 0.0002, + "loss": 1.6634, + "step": 630 + }, + { + "epoch": 1.4222222222222223, + "grad_norm": 0.3883286714553833, + "learning_rate": 0.0002, + "loss": 1.8059, + "step": 640 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.4087409973144531, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 650 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 0.4042017459869385, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 660 + }, + { + "epoch": 1.488888888888889, + "grad_norm": 0.40149256587028503, + "learning_rate": 0.0002, + "loss": 1.7466, + "step": 670 + }, + { + "epoch": 1.511111111111111, + "grad_norm": 0.45146510004997253, + "learning_rate": 0.0002, + "loss": 1.7398, + "step": 680 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.4098089039325714, + "learning_rate": 0.0002, + "loss": 1.6923, + "step": 690 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.4181336760520935, + "learning_rate": 0.0002, + "loss": 1.6847, + "step": 700 + }, + { + "epoch": 1.5777777777777777, + "grad_norm": 1.3722974061965942, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 710 + }, + { + "epoch": 1.6, + "grad_norm": 0.3965230882167816, + "learning_rate": 0.0002, + "loss": 1.7424, + "step": 720 + }, + { + "epoch": 1.6222222222222222, + "grad_norm": 0.3842000663280487, + "learning_rate": 0.0002, + "loss": 1.7782, + "step": 730 + }, + { + "epoch": 1.6444444444444444, + "grad_norm": 0.3603688180446625, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 740 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.39973509311676025, + "learning_rate": 0.0002, + "loss": 1.7476, + "step": 750 + }, + { + "epoch": 1.6888888888888889, + "grad_norm": 0.3687385618686676, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 760 + }, + { + "epoch": 1.7111111111111112, + "grad_norm": 0.4267722964286804, + "learning_rate": 0.0002, + "loss": 1.7987, + "step": 770 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 0.41301295161247253, + "learning_rate": 0.0002, + "loss": 1.8041, + "step": 780 + }, + { + "epoch": 1.7555555555555555, + "grad_norm": 0.3945430517196655, + "learning_rate": 0.0002, + "loss": 1.7284, + "step": 790 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.4037930965423584, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 800 + }, + { + "epoch": 1.8, + "grad_norm": 0.406893253326416, + "learning_rate": 0.0002, + "loss": 1.6874, + "step": 810 + }, + { + "epoch": 1.8222222222222222, + "grad_norm": 0.4600457549095154, + "learning_rate": 0.0002, + "loss": 1.7628, + "step": 820 + }, + { + "epoch": 1.8444444444444446, + "grad_norm": 0.4195384085178375, + "learning_rate": 0.0002, + "loss": 1.8222, + "step": 830 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 0.3854130506515503, + "learning_rate": 0.0002, + "loss": 1.7123, + "step": 840 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.38279038667678833, + "learning_rate": 0.0002, + "loss": 1.7227, + "step": 850 + }, + { + "epoch": 1.911111111111111, + "grad_norm": 0.38249439001083374, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 860 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 0.42977792024612427, + "learning_rate": 0.0002, + "loss": 1.8058, + "step": 870 + }, + { + "epoch": 1.9555555555555557, + "grad_norm": 0.4109351933002472, + "learning_rate": 0.0002, + "loss": 1.713, + "step": 880 + }, + { + "epoch": 1.9777777777777779, + "grad_norm": 0.3734486699104309, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 890 + }, + { + "epoch": 2.0, + "grad_norm": 0.3603087067604065, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 900 + }, + { + "epoch": 2.0, + "eval_loss": 1.8250652551651, + "eval_runtime": 38.8657, + "eval_samples_per_second": 13.251, + "eval_steps_per_second": 1.672, + "step": 900 + }, + { + "epoch": 2.022222222222222, + "grad_norm": 0.4014144241809845, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 910 + }, + { + "epoch": 2.0444444444444443, + "grad_norm": 0.4338063597679138, + "learning_rate": 0.0002, + "loss": 1.6604, + "step": 920 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 0.3693605065345764, + "learning_rate": 0.0002, + "loss": 1.6354, + "step": 930 + }, + { + "epoch": 2.088888888888889, + "grad_norm": 0.4040255844593048, + "learning_rate": 0.0002, + "loss": 1.6186, + "step": 940 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.43481820821762085, + "learning_rate": 0.0002, + "loss": 1.5976, + "step": 950 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 0.41632869839668274, + "learning_rate": 0.0002, + "loss": 1.6081, + "step": 960 + }, + { + "epoch": 2.1555555555555554, + "grad_norm": 0.4633755385875702, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 970 + }, + { + "epoch": 2.1777777777777776, + "grad_norm": 0.43926581740379333, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 980 + }, + { + "epoch": 2.2, + "grad_norm": 0.4757233262062073, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 990 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.5010586977005005, + "learning_rate": 0.0002, + "loss": 1.648, + "step": 1000 + }, + { + "epoch": 2.2444444444444445, + "grad_norm": 0.44900986552238464, + "learning_rate": 0.0002, + "loss": 1.6454, + "step": 1010 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 0.41274750232696533, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 1020 + }, + { + "epoch": 2.2888888888888888, + "grad_norm": 0.44672393798828125, + "learning_rate": 0.0002, + "loss": 1.6592, + "step": 1030 + }, + { + "epoch": 2.311111111111111, + "grad_norm": 0.4826269745826721, + "learning_rate": 0.0002, + "loss": 1.6349, + "step": 1040 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.4650685489177704, + "learning_rate": 0.0002, + "loss": 1.5988, + "step": 1050 + }, + { + "epoch": 2.3555555555555556, + "grad_norm": 0.42507848143577576, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 1060 + }, + { + "epoch": 2.3777777777777778, + "grad_norm": 0.45653030276298523, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 1070 + }, + { + "epoch": 2.4, + "grad_norm": 0.44534122943878174, + "learning_rate": 0.0002, + "loss": 1.6469, + "step": 1080 + }, + { + "epoch": 2.422222222222222, + "grad_norm": 0.4241289794445038, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 1090 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.5004808306694031, + "learning_rate": 0.0002, + "loss": 1.6935, + "step": 1100 + }, + { + "epoch": 2.466666666666667, + "grad_norm": 0.41425490379333496, + "learning_rate": 0.0002, + "loss": 1.6833, + "step": 1110 + }, + { + "epoch": 2.488888888888889, + "grad_norm": 0.44362279772758484, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 1120 + }, + { + "epoch": 2.511111111111111, + "grad_norm": 0.5530985593795776, + "learning_rate": 0.0002, + "loss": 1.6394, + "step": 1130 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 0.4290637969970703, + "learning_rate": 0.0002, + "loss": 1.64, + "step": 1140 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.4957487881183624, + "learning_rate": 0.0002, + "loss": 1.76, + "step": 1150 + }, + { + "epoch": 2.5777777777777775, + "grad_norm": 0.5082747340202332, + "learning_rate": 0.0002, + "loss": 1.613, + "step": 1160 + }, + { + "epoch": 2.6, + "grad_norm": 0.478722482919693, + "learning_rate": 0.0002, + "loss": 1.6702, + "step": 1170 + }, + { + "epoch": 2.6222222222222222, + "grad_norm": 0.436454176902771, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 1180 + }, + { + "epoch": 2.6444444444444444, + "grad_norm": 0.4905032515525818, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 1190 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.4815700054168701, + "learning_rate": 0.0002, + "loss": 1.6216, + "step": 1200 + }, + { + "epoch": 2.688888888888889, + "grad_norm": 0.3965534269809723, + "learning_rate": 0.0002, + "loss": 1.6994, + "step": 1210 + }, + { + "epoch": 2.7111111111111112, + "grad_norm": 0.43282169103622437, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 1220 + }, + { + "epoch": 2.7333333333333334, + "grad_norm": 0.45512479543685913, + "learning_rate": 0.0002, + "loss": 1.6455, + "step": 1230 + }, + { + "epoch": 2.7555555555555555, + "grad_norm": 0.44370076060295105, + "learning_rate": 0.0002, + "loss": 1.6444, + "step": 1240 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.4750686287879944, + "learning_rate": 0.0002, + "loss": 1.7106, + "step": 1250 + }, + { + "epoch": 2.8, + "grad_norm": 0.41953766345977783, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 1260 + }, + { + "epoch": 2.822222222222222, + "grad_norm": 0.4887140095233917, + "learning_rate": 0.0002, + "loss": 1.6203, + "step": 1270 + }, + { + "epoch": 2.8444444444444446, + "grad_norm": 0.46718958020210266, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 1280 + }, + { + "epoch": 2.8666666666666667, + "grad_norm": 0.48510900139808655, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 1290 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.4504084289073944, + "learning_rate": 0.0002, + "loss": 1.6825, + "step": 1300 + }, + { + "epoch": 2.911111111111111, + "grad_norm": 0.42119622230529785, + "learning_rate": 0.0002, + "loss": 1.7453, + "step": 1310 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 0.4763694107532501, + "learning_rate": 0.0002, + "loss": 1.6662, + "step": 1320 + }, + { + "epoch": 2.9555555555555557, + "grad_norm": 0.422810822725296, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 1330 + }, + { + "epoch": 2.977777777777778, + "grad_norm": 0.4768871068954468, + "learning_rate": 0.0002, + "loss": 1.6578, + "step": 1340 + }, + { + "epoch": 3.0, + "grad_norm": 0.48259881138801575, + "learning_rate": 0.0002, + "loss": 1.6167, + "step": 1350 + }, + { + "epoch": 3.0, + "eval_loss": 1.8452560901641846, + "eval_runtime": 38.8621, + "eval_samples_per_second": 13.252, + "eval_steps_per_second": 1.673, + "step": 1350 + }, + { + "epoch": 3.022222222222222, + "grad_norm": 0.6933313012123108, + "learning_rate": 0.0002, + "loss": 1.5351, + "step": 1360 + }, + { + "epoch": 3.0444444444444443, + "grad_norm": 0.5870710611343384, + "learning_rate": 0.0002, + "loss": 1.5542, + "step": 1370 + }, + { + "epoch": 3.066666666666667, + "grad_norm": 0.602210283279419, + "learning_rate": 0.0002, + "loss": 1.511, + "step": 1380 + }, + { + "epoch": 3.088888888888889, + "grad_norm": 0.6461787819862366, + "learning_rate": 0.0002, + "loss": 1.5272, + "step": 1390 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 0.5839587450027466, + "learning_rate": 0.0002, + "loss": 1.4813, + "step": 1400 + }, + { + "epoch": 3.1333333333333333, + "grad_norm": 0.5757876038551331, + "learning_rate": 0.0002, + "loss": 1.505, + "step": 1410 + }, + { + "epoch": 3.1555555555555554, + "grad_norm": 0.5862616300582886, + "learning_rate": 0.0002, + "loss": 1.4963, + "step": 1420 + }, + { + "epoch": 3.1777777777777776, + "grad_norm": 0.6103630065917969, + "learning_rate": 0.0002, + "loss": 1.5144, + "step": 1430 + }, + { + "epoch": 3.2, + "grad_norm": 0.9309254884719849, + "learning_rate": 0.0002, + "loss": 1.5406, + "step": 1440 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.5360018014907837, + "learning_rate": 0.0002, + "loss": 1.487, + "step": 1450 + }, + { + "epoch": 3.2444444444444445, + "grad_norm": 0.5448758602142334, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 1460 + }, + { + "epoch": 3.2666666666666666, + "grad_norm": 0.5973812341690063, + "learning_rate": 0.0002, + "loss": 1.5595, + "step": 1470 + }, + { + "epoch": 3.2888888888888888, + "grad_norm": 0.6245622038841248, + "learning_rate": 0.0002, + "loss": 1.5223, + "step": 1480 + }, + { + "epoch": 3.311111111111111, + "grad_norm": 0.6533768773078918, + "learning_rate": 0.0002, + "loss": 1.4795, + "step": 1490 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.5765811204910278, + "learning_rate": 0.0002, + "loss": 1.5562, + "step": 1500 + }, + { + "epoch": 3.3555555555555556, + "grad_norm": 0.591395378112793, + "learning_rate": 0.0002, + "loss": 1.5405, + "step": 1510 + }, + { + "epoch": 3.3777777777777778, + "grad_norm": 0.5842425227165222, + "learning_rate": 0.0002, + "loss": 1.5658, + "step": 1520 + }, + { + "epoch": 3.4, + "grad_norm": 0.5731365084648132, + "learning_rate": 0.0002, + "loss": 1.5065, + "step": 1530 + }, + { + "epoch": 3.422222222222222, + "grad_norm": 0.5841306447982788, + "learning_rate": 0.0002, + "loss": 1.5438, + "step": 1540 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.6503536701202393, + "learning_rate": 0.0002, + "loss": 1.4922, + "step": 1550 + }, + { + "epoch": 3.466666666666667, + "grad_norm": 0.6170967221260071, + "learning_rate": 0.0002, + "loss": 1.5493, + "step": 1560 + }, + { + "epoch": 3.488888888888889, + "grad_norm": 0.5576487183570862, + "learning_rate": 0.0002, + "loss": 1.5098, + "step": 1570 + }, + { + "epoch": 3.511111111111111, + "grad_norm": 0.7082911133766174, + "learning_rate": 0.0002, + "loss": 1.472, + "step": 1580 + }, + { + "epoch": 3.533333333333333, + "grad_norm": 0.6159376502037048, + "learning_rate": 0.0002, + "loss": 1.5594, + "step": 1590 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 0.5972959399223328, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 1600 + }, + { + "epoch": 3.5777777777777775, + "grad_norm": 0.5787310004234314, + "learning_rate": 0.0002, + "loss": 1.4876, + "step": 1610 + }, + { + "epoch": 3.6, + "grad_norm": 0.5846341252326965, + "learning_rate": 0.0002, + "loss": 1.4887, + "step": 1620 + }, + { + "epoch": 3.6222222222222222, + "grad_norm": 0.5906197428703308, + "learning_rate": 0.0002, + "loss": 1.542, + "step": 1630 + }, + { + "epoch": 3.6444444444444444, + "grad_norm": 0.6305760145187378, + "learning_rate": 0.0002, + "loss": 1.4941, + "step": 1640 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.7448979616165161, + "learning_rate": 0.0002, + "loss": 1.4677, + "step": 1650 + }, + { + "epoch": 3.688888888888889, + "grad_norm": 0.5906165242195129, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 1660 + }, + { + "epoch": 3.7111111111111112, + "grad_norm": 0.605032742023468, + "learning_rate": 0.0002, + "loss": 1.4882, + "step": 1670 + }, + { + "epoch": 3.7333333333333334, + "grad_norm": 0.6117229461669922, + "learning_rate": 0.0002, + "loss": 1.5804, + "step": 1680 + }, + { + "epoch": 3.7555555555555555, + "grad_norm": 0.613581120967865, + "learning_rate": 0.0002, + "loss": 1.5131, + "step": 1690 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 0.6244436502456665, + "learning_rate": 0.0002, + "loss": 1.5074, + "step": 1700 + }, + { + "epoch": 3.8, + "grad_norm": 0.6236702799797058, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 1710 + }, + { + "epoch": 3.822222222222222, + "grad_norm": 0.639141857624054, + "learning_rate": 0.0002, + "loss": 1.6542, + "step": 1720 + }, + { + "epoch": 3.8444444444444446, + "grad_norm": 0.5782344937324524, + "learning_rate": 0.0002, + "loss": 1.536, + "step": 1730 + }, + { + "epoch": 3.8666666666666667, + "grad_norm": 0.5952938795089722, + "learning_rate": 0.0002, + "loss": 1.5355, + "step": 1740 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.5573042035102844, + "learning_rate": 0.0002, + "loss": 1.5205, + "step": 1750 + }, + { + "epoch": 3.911111111111111, + "grad_norm": 0.6114351749420166, + "learning_rate": 0.0002, + "loss": 1.5066, + "step": 1760 + }, + { + "epoch": 3.9333333333333336, + "grad_norm": 0.5973817110061646, + "learning_rate": 0.0002, + "loss": 1.5706, + "step": 1770 + }, + { + "epoch": 3.9555555555555557, + "grad_norm": 0.602317750453949, + "learning_rate": 0.0002, + "loss": 1.5003, + "step": 1780 + }, + { + "epoch": 3.977777777777778, + "grad_norm": 0.5965437293052673, + "learning_rate": 0.0002, + "loss": 1.5022, + "step": 1790 + }, + { + "epoch": 4.0, + "grad_norm": 0.5641552209854126, + "learning_rate": 0.0002, + "loss": 1.5031, + "step": 1800 + }, + { + "epoch": 4.0, + "eval_loss": 1.892098069190979, + "eval_runtime": 38.8755, + "eval_samples_per_second": 13.247, + "eval_steps_per_second": 1.672, + "step": 1800 + }, + { + "epoch": 4.022222222222222, + "grad_norm": 0.8302594423294067, + "learning_rate": 0.0002, + "loss": 1.3894, + "step": 1810 + }, + { + "epoch": 4.044444444444444, + "grad_norm": 0.6695230603218079, + "learning_rate": 0.0002, + "loss": 1.3727, + "step": 1820 + }, + { + "epoch": 4.066666666666666, + "grad_norm": 0.7911471128463745, + "learning_rate": 0.0002, + "loss": 1.3064, + "step": 1830 + }, + { + "epoch": 4.088888888888889, + "grad_norm": 0.7044888138771057, + "learning_rate": 0.0002, + "loss": 1.4574, + "step": 1840 + }, + { + "epoch": 4.111111111111111, + "grad_norm": 0.7057249546051025, + "learning_rate": 0.0002, + "loss": 1.3941, + "step": 1850 + }, + { + "epoch": 4.133333333333334, + "grad_norm": 0.8762815594673157, + "learning_rate": 0.0002, + "loss": 1.4052, + "step": 1860 + }, + { + "epoch": 4.155555555555556, + "grad_norm": 0.7619158029556274, + "learning_rate": 0.0002, + "loss": 1.3784, + "step": 1870 + }, + { + "epoch": 4.177777777777778, + "grad_norm": 0.7711658477783203, + "learning_rate": 0.0002, + "loss": 1.3581, + "step": 1880 + }, + { + "epoch": 4.2, + "grad_norm": 0.9732598662376404, + "learning_rate": 0.0002, + "loss": 1.3995, + "step": 1890 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 0.9070265889167786, + "learning_rate": 0.0002, + "loss": 1.3353, + "step": 1900 + }, + { + "epoch": 4.2444444444444445, + "grad_norm": 0.8274767994880676, + "learning_rate": 0.0002, + "loss": 1.3947, + "step": 1910 + }, + { + "epoch": 4.266666666666667, + "grad_norm": 0.8514227271080017, + "learning_rate": 0.0002, + "loss": 1.3392, + "step": 1920 + }, + { + "epoch": 4.288888888888889, + "grad_norm": 0.7356534600257874, + "learning_rate": 0.0002, + "loss": 1.3492, + "step": 1930 + }, + { + "epoch": 4.311111111111111, + "grad_norm": 0.8226608037948608, + "learning_rate": 0.0002, + "loss": 1.3708, + "step": 1940 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 0.8347907066345215, + "learning_rate": 0.0002, + "loss": 1.3652, + "step": 1950 + }, + { + "epoch": 4.355555555555555, + "grad_norm": 0.8509323000907898, + "learning_rate": 0.0002, + "loss": 1.3415, + "step": 1960 + }, + { + "epoch": 4.377777777777778, + "grad_norm": 0.8776063323020935, + "learning_rate": 0.0002, + "loss": 1.3796, + "step": 1970 + }, + { + "epoch": 4.4, + "grad_norm": 0.8022271990776062, + "learning_rate": 0.0002, + "loss": 1.438, + "step": 1980 + }, + { + "epoch": 4.4222222222222225, + "grad_norm": 0.7984752058982849, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 1990 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.7349720001220703, + "learning_rate": 0.0002, + "loss": 1.4214, + "step": 2000 + }, + { + "epoch": 4.466666666666667, + "grad_norm": 0.7778817415237427, + "learning_rate": 0.0002, + "loss": 1.4174, + "step": 2010 + }, + { + "epoch": 4.488888888888889, + "grad_norm": 0.9361467361450195, + "learning_rate": 0.0002, + "loss": 1.3365, + "step": 2020 + }, + { + "epoch": 4.511111111111111, + "grad_norm": 0.7839348912239075, + "learning_rate": 0.0002, + "loss": 1.4129, + "step": 2030 + }, + { + "epoch": 4.533333333333333, + "grad_norm": 0.8361981511116028, + "learning_rate": 0.0002, + "loss": 1.3761, + "step": 2040 + }, + { + "epoch": 4.555555555555555, + "grad_norm": 1.9877147674560547, + "learning_rate": 0.0002, + "loss": 1.4085, + "step": 2050 + }, + { + "epoch": 4.5777777777777775, + "grad_norm": 0.7506140470504761, + "learning_rate": 0.0002, + "loss": 1.329, + "step": 2060 + }, + { + "epoch": 4.6, + "grad_norm": 0.9493570327758789, + "learning_rate": 0.0002, + "loss": 1.3557, + "step": 2070 + }, + { + "epoch": 4.622222222222222, + "grad_norm": 0.7198925018310547, + "learning_rate": 0.0002, + "loss": 1.438, + "step": 2080 + }, + { + "epoch": 4.644444444444445, + "grad_norm": 0.7521472573280334, + "learning_rate": 0.0002, + "loss": 1.3892, + "step": 2090 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.766718327999115, + "learning_rate": 0.0002, + "loss": 1.3833, + "step": 2100 + }, + { + "epoch": 4.688888888888889, + "grad_norm": 0.9162390232086182, + "learning_rate": 0.0002, + "loss": 1.3541, + "step": 2110 + }, + { + "epoch": 4.711111111111111, + "grad_norm": 0.8980328440666199, + "learning_rate": 0.0002, + "loss": 1.4603, + "step": 2120 + }, + { + "epoch": 4.733333333333333, + "grad_norm": 0.8109711408615112, + "learning_rate": 0.0002, + "loss": 1.4043, + "step": 2130 + }, + { + "epoch": 4.7555555555555555, + "grad_norm": 0.7372606992721558, + "learning_rate": 0.0002, + "loss": 1.373, + "step": 2140 + }, + { + "epoch": 4.777777777777778, + "grad_norm": 0.7527457475662231, + "learning_rate": 0.0002, + "loss": 1.4439, + "step": 2150 + }, + { + "epoch": 4.8, + "grad_norm": 1.0380001068115234, + "learning_rate": 0.0002, + "loss": 1.2999, + "step": 2160 + }, + { + "epoch": 4.822222222222222, + "grad_norm": 0.7166368365287781, + "learning_rate": 0.0002, + "loss": 1.3562, + "step": 2170 + }, + { + "epoch": 4.844444444444444, + "grad_norm": 0.784548282623291, + "learning_rate": 0.0002, + "loss": 1.3917, + "step": 2180 + }, + { + "epoch": 4.866666666666667, + "grad_norm": 0.7771317958831787, + "learning_rate": 0.0002, + "loss": 1.3376, + "step": 2190 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.7710300087928772, + "learning_rate": 0.0002, + "loss": 1.3315, + "step": 2200 + }, + { + "epoch": 4.911111111111111, + "grad_norm": 0.7715084552764893, + "learning_rate": 0.0002, + "loss": 1.3676, + "step": 2210 + }, + { + "epoch": 4.933333333333334, + "grad_norm": 0.7888006567955017, + "learning_rate": 0.0002, + "loss": 1.5352, + "step": 2220 + }, + { + "epoch": 4.955555555555556, + "grad_norm": 0.800684928894043, + "learning_rate": 0.0002, + "loss": 1.4139, + "step": 2230 + }, + { + "epoch": 4.977777777777778, + "grad_norm": 0.7710039019584656, + "learning_rate": 0.0002, + "loss": 1.4343, + "step": 2240 + }, + { + "epoch": 5.0, + "grad_norm": 0.8617033958435059, + "learning_rate": 0.0002, + "loss": 1.3501, + "step": 2250 + }, + { + "epoch": 5.0, + "eval_loss": 1.9718151092529297, + "eval_runtime": 38.8999, + "eval_samples_per_second": 13.239, + "eval_steps_per_second": 1.671, + "step": 2250 + } + ], + "logging_steps": 10, + "max_steps": 3600, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.04124962045952e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..08bcb50b9f534803ed2a4a4be696b74ae373982a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2250/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:333a3e3301276a753d48f5fabf4c169b8961c6e6ef7b7a7f931ba463d72983a4 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..98dfee3b493ee2595f135c545a03f7e65c3fbb24 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:479f85e7ebbf7c1d6b53b9f132e2452fe9dcf6e249840e4ba0ab1ab425d5c6a2 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..13281c1a24c79107a5084ac9e65a64530776f758 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:106ecba5ddbdd7422d0d3be12ede848119781f26c67fac5f9fcaa072bdc6a7f0 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..0da854ec6a76fd0443d1aa7d0fdad661036731ae --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83f46de925b4b29d962874ab75b54128a2350d129a7af7f8ae9d342fedf66678 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6864060e60a198946db2ac64be407d85a2f46cd --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed09341560ed41402e61e19e86133af3658c0bca877a54ba1c2f4a038b86d994 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..54b127820fd9b3302e1e29b183137bc99bb3a53f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/trainer_state.json @@ -0,0 +1,1971 @@ +{ + "best_metric": 1.8250652551651, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 2700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.022222222222222223, + "grad_norm": 0.5109436511993408, + "learning_rate": 0.0002, + "loss": 2.5874, + "step": 10 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 0.4870035946369171, + "learning_rate": 0.0002, + "loss": 2.2952, + "step": 20 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.535464882850647, + "learning_rate": 0.0002, + "loss": 2.0554, + "step": 30 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 0.49077996611595154, + "learning_rate": 0.0002, + "loss": 2.0067, + "step": 40 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.4671357572078705, + "learning_rate": 0.0002, + "loss": 2.0673, + "step": 50 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.4970313608646393, + "learning_rate": 0.0002, + "loss": 1.8751, + "step": 60 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 0.4438260495662689, + "learning_rate": 0.0002, + "loss": 1.9419, + "step": 70 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 0.5089705586433411, + "learning_rate": 0.0002, + "loss": 1.9856, + "step": 80 + }, + { + "epoch": 0.2, + "grad_norm": 0.4645078182220459, + "learning_rate": 0.0002, + "loss": 1.8805, + "step": 90 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.46095192432403564, + "learning_rate": 0.0002, + "loss": 1.807, + "step": 100 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 0.43338075280189514, + "learning_rate": 0.0002, + "loss": 1.891, + "step": 110 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.4433900713920593, + "learning_rate": 0.0002, + "loss": 1.84, + "step": 120 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 0.7018499970436096, + "learning_rate": 0.0002, + "loss": 1.8895, + "step": 130 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 0.37056994438171387, + "learning_rate": 0.0002, + "loss": 1.8273, + "step": 140 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.40634623169898987, + "learning_rate": 0.0002, + "loss": 1.8059, + "step": 150 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 0.41917353868484497, + "learning_rate": 0.0002, + "loss": 1.8393, + "step": 160 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 0.42392489314079285, + "learning_rate": 0.0002, + "loss": 1.8658, + "step": 170 + }, + { + "epoch": 0.4, + "grad_norm": 0.4281010627746582, + "learning_rate": 0.0002, + "loss": 1.8617, + "step": 180 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 0.38542497158050537, + "learning_rate": 0.0002, + "loss": 1.8163, + "step": 190 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.36003032326698303, + "learning_rate": 0.0002, + "loss": 1.8951, + "step": 200 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.37858229875564575, + "learning_rate": 0.0002, + "loss": 1.8611, + "step": 210 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 0.49986031651496887, + "learning_rate": 0.0002, + "loss": 1.8078, + "step": 220 + }, + { + "epoch": 0.5111111111111111, + "grad_norm": 0.3937094807624817, + "learning_rate": 0.0002, + "loss": 1.7413, + "step": 230 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.4566134512424469, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 240 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.3602476418018341, + "learning_rate": 0.0002, + "loss": 1.8303, + "step": 250 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 0.36321184039115906, + "learning_rate": 0.0002, + "loss": 1.7963, + "step": 260 + }, + { + "epoch": 0.6, + "grad_norm": 0.3808199167251587, + "learning_rate": 0.0002, + "loss": 1.9055, + "step": 270 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 0.38910621404647827, + "learning_rate": 0.0002, + "loss": 1.7916, + "step": 280 + }, + { + "epoch": 0.6444444444444445, + "grad_norm": 0.31913551688194275, + "learning_rate": 0.0002, + "loss": 1.7958, + "step": 290 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.34734025597572327, + "learning_rate": 0.0002, + "loss": 1.7468, + "step": 300 + }, + { + "epoch": 0.6888888888888889, + "grad_norm": 0.3517725467681885, + "learning_rate": 0.0002, + "loss": 1.8092, + "step": 310 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 0.3804526627063751, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 320 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.5592505931854248, + "learning_rate": 0.0002, + "loss": 1.8131, + "step": 330 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 0.36154472827911377, + "learning_rate": 0.0002, + "loss": 1.8827, + "step": 340 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.43970227241516113, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 350 + }, + { + "epoch": 0.8, + "grad_norm": 0.3525223731994629, + "learning_rate": 0.0002, + "loss": 1.8333, + "step": 360 + }, + { + "epoch": 0.8222222222222222, + "grad_norm": 0.3706997036933899, + "learning_rate": 0.0002, + "loss": 1.7802, + "step": 370 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 0.34138166904449463, + "learning_rate": 0.0002, + "loss": 1.7937, + "step": 380 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.4090622365474701, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 390 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.3729974031448364, + "learning_rate": 0.0002, + "loss": 1.7685, + "step": 400 + }, + { + "epoch": 0.9111111111111111, + "grad_norm": 0.3742152452468872, + "learning_rate": 0.0002, + "loss": 1.836, + "step": 410 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.37685129046440125, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 420 + }, + { + "epoch": 0.9555555555555556, + "grad_norm": 0.34339913725852966, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 430 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 0.31139856576919556, + "learning_rate": 0.0002, + "loss": 1.7651, + "step": 440 + }, + { + "epoch": 1.0, + "grad_norm": 0.3577502965927124, + "learning_rate": 0.0002, + "loss": 1.8471, + "step": 450 + }, + { + "epoch": 1.0, + "eval_loss": 1.8310279846191406, + "eval_runtime": 38.8374, + "eval_samples_per_second": 13.26, + "eval_steps_per_second": 1.674, + "step": 450 + }, + { + "epoch": 1.0222222222222221, + "grad_norm": 0.3257788419723511, + "learning_rate": 0.0002, + "loss": 1.7741, + "step": 460 + }, + { + "epoch": 1.0444444444444445, + "grad_norm": 0.3878970146179199, + "learning_rate": 0.0002, + "loss": 1.8031, + "step": 470 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 0.364427387714386, + "learning_rate": 0.0002, + "loss": 1.701, + "step": 480 + }, + { + "epoch": 1.0888888888888888, + "grad_norm": 0.3374682664871216, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 490 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.35822123289108276, + "learning_rate": 0.0002, + "loss": 1.7308, + "step": 500 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 0.3748345673084259, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 510 + }, + { + "epoch": 1.1555555555555554, + "grad_norm": 0.3422437012195587, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 520 + }, + { + "epoch": 1.1777777777777778, + "grad_norm": 0.4289326071739197, + "learning_rate": 0.0002, + "loss": 1.7649, + "step": 530 + }, + { + "epoch": 1.2, + "grad_norm": 0.3706769645214081, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 540 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.4024733603000641, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 550 + }, + { + "epoch": 1.2444444444444445, + "grad_norm": 0.3960128128528595, + "learning_rate": 0.0002, + "loss": 1.7641, + "step": 560 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 0.38222864270210266, + "learning_rate": 0.0002, + "loss": 1.755, + "step": 570 + }, + { + "epoch": 1.2888888888888888, + "grad_norm": 0.4073713421821594, + "learning_rate": 0.0002, + "loss": 1.735, + "step": 580 + }, + { + "epoch": 1.3111111111111111, + "grad_norm": 0.3875499963760376, + "learning_rate": 0.0002, + "loss": 1.7183, + "step": 590 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.39740806818008423, + "learning_rate": 0.0002, + "loss": 1.7492, + "step": 600 + }, + { + "epoch": 1.3555555555555556, + "grad_norm": 0.38432490825653076, + "learning_rate": 0.0002, + "loss": 1.8478, + "step": 610 + }, + { + "epoch": 1.3777777777777778, + "grad_norm": 0.402729868888855, + "learning_rate": 0.0002, + "loss": 1.7327, + "step": 620 + }, + { + "epoch": 1.4, + "grad_norm": 0.36683231592178345, + "learning_rate": 0.0002, + "loss": 1.6634, + "step": 630 + }, + { + "epoch": 1.4222222222222223, + "grad_norm": 0.3883286714553833, + "learning_rate": 0.0002, + "loss": 1.8059, + "step": 640 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.4087409973144531, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 650 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 0.4042017459869385, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 660 + }, + { + "epoch": 1.488888888888889, + "grad_norm": 0.40149256587028503, + "learning_rate": 0.0002, + "loss": 1.7466, + "step": 670 + }, + { + "epoch": 1.511111111111111, + "grad_norm": 0.45146510004997253, + "learning_rate": 0.0002, + "loss": 1.7398, + "step": 680 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.4098089039325714, + "learning_rate": 0.0002, + "loss": 1.6923, + "step": 690 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.4181336760520935, + "learning_rate": 0.0002, + "loss": 1.6847, + "step": 700 + }, + { + "epoch": 1.5777777777777777, + "grad_norm": 1.3722974061965942, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 710 + }, + { + "epoch": 1.6, + "grad_norm": 0.3965230882167816, + "learning_rate": 0.0002, + "loss": 1.7424, + "step": 720 + }, + { + "epoch": 1.6222222222222222, + "grad_norm": 0.3842000663280487, + "learning_rate": 0.0002, + "loss": 1.7782, + "step": 730 + }, + { + "epoch": 1.6444444444444444, + "grad_norm": 0.3603688180446625, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 740 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.39973509311676025, + "learning_rate": 0.0002, + "loss": 1.7476, + "step": 750 + }, + { + "epoch": 1.6888888888888889, + "grad_norm": 0.3687385618686676, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 760 + }, + { + "epoch": 1.7111111111111112, + "grad_norm": 0.4267722964286804, + "learning_rate": 0.0002, + "loss": 1.7987, + "step": 770 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 0.41301295161247253, + "learning_rate": 0.0002, + "loss": 1.8041, + "step": 780 + }, + { + "epoch": 1.7555555555555555, + "grad_norm": 0.3945430517196655, + "learning_rate": 0.0002, + "loss": 1.7284, + "step": 790 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.4037930965423584, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 800 + }, + { + "epoch": 1.8, + "grad_norm": 0.406893253326416, + "learning_rate": 0.0002, + "loss": 1.6874, + "step": 810 + }, + { + "epoch": 1.8222222222222222, + "grad_norm": 0.4600457549095154, + "learning_rate": 0.0002, + "loss": 1.7628, + "step": 820 + }, + { + "epoch": 1.8444444444444446, + "grad_norm": 0.4195384085178375, + "learning_rate": 0.0002, + "loss": 1.8222, + "step": 830 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 0.3854130506515503, + "learning_rate": 0.0002, + "loss": 1.7123, + "step": 840 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.38279038667678833, + "learning_rate": 0.0002, + "loss": 1.7227, + "step": 850 + }, + { + "epoch": 1.911111111111111, + "grad_norm": 0.38249439001083374, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 860 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 0.42977792024612427, + "learning_rate": 0.0002, + "loss": 1.8058, + "step": 870 + }, + { + "epoch": 1.9555555555555557, + "grad_norm": 0.4109351933002472, + "learning_rate": 0.0002, + "loss": 1.713, + "step": 880 + }, + { + "epoch": 1.9777777777777779, + "grad_norm": 0.3734486699104309, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 890 + }, + { + "epoch": 2.0, + "grad_norm": 0.3603087067604065, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 900 + }, + { + "epoch": 2.0, + "eval_loss": 1.8250652551651, + "eval_runtime": 38.8657, + "eval_samples_per_second": 13.251, + "eval_steps_per_second": 1.672, + "step": 900 + }, + { + "epoch": 2.022222222222222, + "grad_norm": 0.4014144241809845, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 910 + }, + { + "epoch": 2.0444444444444443, + "grad_norm": 0.4338063597679138, + "learning_rate": 0.0002, + "loss": 1.6604, + "step": 920 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 0.3693605065345764, + "learning_rate": 0.0002, + "loss": 1.6354, + "step": 930 + }, + { + "epoch": 2.088888888888889, + "grad_norm": 0.4040255844593048, + "learning_rate": 0.0002, + "loss": 1.6186, + "step": 940 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.43481820821762085, + "learning_rate": 0.0002, + "loss": 1.5976, + "step": 950 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 0.41632869839668274, + "learning_rate": 0.0002, + "loss": 1.6081, + "step": 960 + }, + { + "epoch": 2.1555555555555554, + "grad_norm": 0.4633755385875702, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 970 + }, + { + "epoch": 2.1777777777777776, + "grad_norm": 0.43926581740379333, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 980 + }, + { + "epoch": 2.2, + "grad_norm": 0.4757233262062073, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 990 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.5010586977005005, + "learning_rate": 0.0002, + "loss": 1.648, + "step": 1000 + }, + { + "epoch": 2.2444444444444445, + "grad_norm": 0.44900986552238464, + "learning_rate": 0.0002, + "loss": 1.6454, + "step": 1010 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 0.41274750232696533, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 1020 + }, + { + "epoch": 2.2888888888888888, + "grad_norm": 0.44672393798828125, + "learning_rate": 0.0002, + "loss": 1.6592, + "step": 1030 + }, + { + "epoch": 2.311111111111111, + "grad_norm": 0.4826269745826721, + "learning_rate": 0.0002, + "loss": 1.6349, + "step": 1040 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.4650685489177704, + "learning_rate": 0.0002, + "loss": 1.5988, + "step": 1050 + }, + { + "epoch": 2.3555555555555556, + "grad_norm": 0.42507848143577576, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 1060 + }, + { + "epoch": 2.3777777777777778, + "grad_norm": 0.45653030276298523, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 1070 + }, + { + "epoch": 2.4, + "grad_norm": 0.44534122943878174, + "learning_rate": 0.0002, + "loss": 1.6469, + "step": 1080 + }, + { + "epoch": 2.422222222222222, + "grad_norm": 0.4241289794445038, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 1090 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.5004808306694031, + "learning_rate": 0.0002, + "loss": 1.6935, + "step": 1100 + }, + { + "epoch": 2.466666666666667, + "grad_norm": 0.41425490379333496, + "learning_rate": 0.0002, + "loss": 1.6833, + "step": 1110 + }, + { + "epoch": 2.488888888888889, + "grad_norm": 0.44362279772758484, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 1120 + }, + { + "epoch": 2.511111111111111, + "grad_norm": 0.5530985593795776, + "learning_rate": 0.0002, + "loss": 1.6394, + "step": 1130 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 0.4290637969970703, + "learning_rate": 0.0002, + "loss": 1.64, + "step": 1140 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.4957487881183624, + "learning_rate": 0.0002, + "loss": 1.76, + "step": 1150 + }, + { + "epoch": 2.5777777777777775, + "grad_norm": 0.5082747340202332, + "learning_rate": 0.0002, + "loss": 1.613, + "step": 1160 + }, + { + "epoch": 2.6, + "grad_norm": 0.478722482919693, + "learning_rate": 0.0002, + "loss": 1.6702, + "step": 1170 + }, + { + "epoch": 2.6222222222222222, + "grad_norm": 0.436454176902771, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 1180 + }, + { + "epoch": 2.6444444444444444, + "grad_norm": 0.4905032515525818, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 1190 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.4815700054168701, + "learning_rate": 0.0002, + "loss": 1.6216, + "step": 1200 + }, + { + "epoch": 2.688888888888889, + "grad_norm": 0.3965534269809723, + "learning_rate": 0.0002, + "loss": 1.6994, + "step": 1210 + }, + { + "epoch": 2.7111111111111112, + "grad_norm": 0.43282169103622437, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 1220 + }, + { + "epoch": 2.7333333333333334, + "grad_norm": 0.45512479543685913, + "learning_rate": 0.0002, + "loss": 1.6455, + "step": 1230 + }, + { + "epoch": 2.7555555555555555, + "grad_norm": 0.44370076060295105, + "learning_rate": 0.0002, + "loss": 1.6444, + "step": 1240 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.4750686287879944, + "learning_rate": 0.0002, + "loss": 1.7106, + "step": 1250 + }, + { + "epoch": 2.8, + "grad_norm": 0.41953766345977783, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 1260 + }, + { + "epoch": 2.822222222222222, + "grad_norm": 0.4887140095233917, + "learning_rate": 0.0002, + "loss": 1.6203, + "step": 1270 + }, + { + "epoch": 2.8444444444444446, + "grad_norm": 0.46718958020210266, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 1280 + }, + { + "epoch": 2.8666666666666667, + "grad_norm": 0.48510900139808655, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 1290 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.4504084289073944, + "learning_rate": 0.0002, + "loss": 1.6825, + "step": 1300 + }, + { + "epoch": 2.911111111111111, + "grad_norm": 0.42119622230529785, + "learning_rate": 0.0002, + "loss": 1.7453, + "step": 1310 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 0.4763694107532501, + "learning_rate": 0.0002, + "loss": 1.6662, + "step": 1320 + }, + { + "epoch": 2.9555555555555557, + "grad_norm": 0.422810822725296, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 1330 + }, + { + "epoch": 2.977777777777778, + "grad_norm": 0.4768871068954468, + "learning_rate": 0.0002, + "loss": 1.6578, + "step": 1340 + }, + { + "epoch": 3.0, + "grad_norm": 0.48259881138801575, + "learning_rate": 0.0002, + "loss": 1.6167, + "step": 1350 + }, + { + "epoch": 3.0, + "eval_loss": 1.8452560901641846, + "eval_runtime": 38.8621, + "eval_samples_per_second": 13.252, + "eval_steps_per_second": 1.673, + "step": 1350 + }, + { + "epoch": 3.022222222222222, + "grad_norm": 0.6933313012123108, + "learning_rate": 0.0002, + "loss": 1.5351, + "step": 1360 + }, + { + "epoch": 3.0444444444444443, + "grad_norm": 0.5870710611343384, + "learning_rate": 0.0002, + "loss": 1.5542, + "step": 1370 + }, + { + "epoch": 3.066666666666667, + "grad_norm": 0.602210283279419, + "learning_rate": 0.0002, + "loss": 1.511, + "step": 1380 + }, + { + "epoch": 3.088888888888889, + "grad_norm": 0.6461787819862366, + "learning_rate": 0.0002, + "loss": 1.5272, + "step": 1390 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 0.5839587450027466, + "learning_rate": 0.0002, + "loss": 1.4813, + "step": 1400 + }, + { + "epoch": 3.1333333333333333, + "grad_norm": 0.5757876038551331, + "learning_rate": 0.0002, + "loss": 1.505, + "step": 1410 + }, + { + "epoch": 3.1555555555555554, + "grad_norm": 0.5862616300582886, + "learning_rate": 0.0002, + "loss": 1.4963, + "step": 1420 + }, + { + "epoch": 3.1777777777777776, + "grad_norm": 0.6103630065917969, + "learning_rate": 0.0002, + "loss": 1.5144, + "step": 1430 + }, + { + "epoch": 3.2, + "grad_norm": 0.9309254884719849, + "learning_rate": 0.0002, + "loss": 1.5406, + "step": 1440 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.5360018014907837, + "learning_rate": 0.0002, + "loss": 1.487, + "step": 1450 + }, + { + "epoch": 3.2444444444444445, + "grad_norm": 0.5448758602142334, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 1460 + }, + { + "epoch": 3.2666666666666666, + "grad_norm": 0.5973812341690063, + "learning_rate": 0.0002, + "loss": 1.5595, + "step": 1470 + }, + { + "epoch": 3.2888888888888888, + "grad_norm": 0.6245622038841248, + "learning_rate": 0.0002, + "loss": 1.5223, + "step": 1480 + }, + { + "epoch": 3.311111111111111, + "grad_norm": 0.6533768773078918, + "learning_rate": 0.0002, + "loss": 1.4795, + "step": 1490 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.5765811204910278, + "learning_rate": 0.0002, + "loss": 1.5562, + "step": 1500 + }, + { + "epoch": 3.3555555555555556, + "grad_norm": 0.591395378112793, + "learning_rate": 0.0002, + "loss": 1.5405, + "step": 1510 + }, + { + "epoch": 3.3777777777777778, + "grad_norm": 0.5842425227165222, + "learning_rate": 0.0002, + "loss": 1.5658, + "step": 1520 + }, + { + "epoch": 3.4, + "grad_norm": 0.5731365084648132, + "learning_rate": 0.0002, + "loss": 1.5065, + "step": 1530 + }, + { + "epoch": 3.422222222222222, + "grad_norm": 0.5841306447982788, + "learning_rate": 0.0002, + "loss": 1.5438, + "step": 1540 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.6503536701202393, + "learning_rate": 0.0002, + "loss": 1.4922, + "step": 1550 + }, + { + "epoch": 3.466666666666667, + "grad_norm": 0.6170967221260071, + "learning_rate": 0.0002, + "loss": 1.5493, + "step": 1560 + }, + { + "epoch": 3.488888888888889, + "grad_norm": 0.5576487183570862, + "learning_rate": 0.0002, + "loss": 1.5098, + "step": 1570 + }, + { + "epoch": 3.511111111111111, + "grad_norm": 0.7082911133766174, + "learning_rate": 0.0002, + "loss": 1.472, + "step": 1580 + }, + { + "epoch": 3.533333333333333, + "grad_norm": 0.6159376502037048, + "learning_rate": 0.0002, + "loss": 1.5594, + "step": 1590 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 0.5972959399223328, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 1600 + }, + { + "epoch": 3.5777777777777775, + "grad_norm": 0.5787310004234314, + "learning_rate": 0.0002, + "loss": 1.4876, + "step": 1610 + }, + { + "epoch": 3.6, + "grad_norm": 0.5846341252326965, + "learning_rate": 0.0002, + "loss": 1.4887, + "step": 1620 + }, + { + "epoch": 3.6222222222222222, + "grad_norm": 0.5906197428703308, + "learning_rate": 0.0002, + "loss": 1.542, + "step": 1630 + }, + { + "epoch": 3.6444444444444444, + "grad_norm": 0.6305760145187378, + "learning_rate": 0.0002, + "loss": 1.4941, + "step": 1640 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.7448979616165161, + "learning_rate": 0.0002, + "loss": 1.4677, + "step": 1650 + }, + { + "epoch": 3.688888888888889, + "grad_norm": 0.5906165242195129, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 1660 + }, + { + "epoch": 3.7111111111111112, + "grad_norm": 0.605032742023468, + "learning_rate": 0.0002, + "loss": 1.4882, + "step": 1670 + }, + { + "epoch": 3.7333333333333334, + "grad_norm": 0.6117229461669922, + "learning_rate": 0.0002, + "loss": 1.5804, + "step": 1680 + }, + { + "epoch": 3.7555555555555555, + "grad_norm": 0.613581120967865, + "learning_rate": 0.0002, + "loss": 1.5131, + "step": 1690 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 0.6244436502456665, + "learning_rate": 0.0002, + "loss": 1.5074, + "step": 1700 + }, + { + "epoch": 3.8, + "grad_norm": 0.6236702799797058, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 1710 + }, + { + "epoch": 3.822222222222222, + "grad_norm": 0.639141857624054, + "learning_rate": 0.0002, + "loss": 1.6542, + "step": 1720 + }, + { + "epoch": 3.8444444444444446, + "grad_norm": 0.5782344937324524, + "learning_rate": 0.0002, + "loss": 1.536, + "step": 1730 + }, + { + "epoch": 3.8666666666666667, + "grad_norm": 0.5952938795089722, + "learning_rate": 0.0002, + "loss": 1.5355, + "step": 1740 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.5573042035102844, + "learning_rate": 0.0002, + "loss": 1.5205, + "step": 1750 + }, + { + "epoch": 3.911111111111111, + "grad_norm": 0.6114351749420166, + "learning_rate": 0.0002, + "loss": 1.5066, + "step": 1760 + }, + { + "epoch": 3.9333333333333336, + "grad_norm": 0.5973817110061646, + "learning_rate": 0.0002, + "loss": 1.5706, + "step": 1770 + }, + { + "epoch": 3.9555555555555557, + "grad_norm": 0.602317750453949, + "learning_rate": 0.0002, + "loss": 1.5003, + "step": 1780 + }, + { + "epoch": 3.977777777777778, + "grad_norm": 0.5965437293052673, + "learning_rate": 0.0002, + "loss": 1.5022, + "step": 1790 + }, + { + "epoch": 4.0, + "grad_norm": 0.5641552209854126, + "learning_rate": 0.0002, + "loss": 1.5031, + "step": 1800 + }, + { + "epoch": 4.0, + "eval_loss": 1.892098069190979, + "eval_runtime": 38.8755, + "eval_samples_per_second": 13.247, + "eval_steps_per_second": 1.672, + "step": 1800 + }, + { + "epoch": 4.022222222222222, + "grad_norm": 0.8302594423294067, + "learning_rate": 0.0002, + "loss": 1.3894, + "step": 1810 + }, + { + "epoch": 4.044444444444444, + "grad_norm": 0.6695230603218079, + "learning_rate": 0.0002, + "loss": 1.3727, + "step": 1820 + }, + { + "epoch": 4.066666666666666, + "grad_norm": 0.7911471128463745, + "learning_rate": 0.0002, + "loss": 1.3064, + "step": 1830 + }, + { + "epoch": 4.088888888888889, + "grad_norm": 0.7044888138771057, + "learning_rate": 0.0002, + "loss": 1.4574, + "step": 1840 + }, + { + "epoch": 4.111111111111111, + "grad_norm": 0.7057249546051025, + "learning_rate": 0.0002, + "loss": 1.3941, + "step": 1850 + }, + { + "epoch": 4.133333333333334, + "grad_norm": 0.8762815594673157, + "learning_rate": 0.0002, + "loss": 1.4052, + "step": 1860 + }, + { + "epoch": 4.155555555555556, + "grad_norm": 0.7619158029556274, + "learning_rate": 0.0002, + "loss": 1.3784, + "step": 1870 + }, + { + "epoch": 4.177777777777778, + "grad_norm": 0.7711658477783203, + "learning_rate": 0.0002, + "loss": 1.3581, + "step": 1880 + }, + { + "epoch": 4.2, + "grad_norm": 0.9732598662376404, + "learning_rate": 0.0002, + "loss": 1.3995, + "step": 1890 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 0.9070265889167786, + "learning_rate": 0.0002, + "loss": 1.3353, + "step": 1900 + }, + { + "epoch": 4.2444444444444445, + "grad_norm": 0.8274767994880676, + "learning_rate": 0.0002, + "loss": 1.3947, + "step": 1910 + }, + { + "epoch": 4.266666666666667, + "grad_norm": 0.8514227271080017, + "learning_rate": 0.0002, + "loss": 1.3392, + "step": 1920 + }, + { + "epoch": 4.288888888888889, + "grad_norm": 0.7356534600257874, + "learning_rate": 0.0002, + "loss": 1.3492, + "step": 1930 + }, + { + "epoch": 4.311111111111111, + "grad_norm": 0.8226608037948608, + "learning_rate": 0.0002, + "loss": 1.3708, + "step": 1940 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 0.8347907066345215, + "learning_rate": 0.0002, + "loss": 1.3652, + "step": 1950 + }, + { + "epoch": 4.355555555555555, + "grad_norm": 0.8509323000907898, + "learning_rate": 0.0002, + "loss": 1.3415, + "step": 1960 + }, + { + "epoch": 4.377777777777778, + "grad_norm": 0.8776063323020935, + "learning_rate": 0.0002, + "loss": 1.3796, + "step": 1970 + }, + { + "epoch": 4.4, + "grad_norm": 0.8022271990776062, + "learning_rate": 0.0002, + "loss": 1.438, + "step": 1980 + }, + { + "epoch": 4.4222222222222225, + "grad_norm": 0.7984752058982849, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 1990 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.7349720001220703, + "learning_rate": 0.0002, + "loss": 1.4214, + "step": 2000 + }, + { + "epoch": 4.466666666666667, + "grad_norm": 0.7778817415237427, + "learning_rate": 0.0002, + "loss": 1.4174, + "step": 2010 + }, + { + "epoch": 4.488888888888889, + "grad_norm": 0.9361467361450195, + "learning_rate": 0.0002, + "loss": 1.3365, + "step": 2020 + }, + { + "epoch": 4.511111111111111, + "grad_norm": 0.7839348912239075, + "learning_rate": 0.0002, + "loss": 1.4129, + "step": 2030 + }, + { + "epoch": 4.533333333333333, + "grad_norm": 0.8361981511116028, + "learning_rate": 0.0002, + "loss": 1.3761, + "step": 2040 + }, + { + "epoch": 4.555555555555555, + "grad_norm": 1.9877147674560547, + "learning_rate": 0.0002, + "loss": 1.4085, + "step": 2050 + }, + { + "epoch": 4.5777777777777775, + "grad_norm": 0.7506140470504761, + "learning_rate": 0.0002, + "loss": 1.329, + "step": 2060 + }, + { + "epoch": 4.6, + "grad_norm": 0.9493570327758789, + "learning_rate": 0.0002, + "loss": 1.3557, + "step": 2070 + }, + { + "epoch": 4.622222222222222, + "grad_norm": 0.7198925018310547, + "learning_rate": 0.0002, + "loss": 1.438, + "step": 2080 + }, + { + "epoch": 4.644444444444445, + "grad_norm": 0.7521472573280334, + "learning_rate": 0.0002, + "loss": 1.3892, + "step": 2090 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.766718327999115, + "learning_rate": 0.0002, + "loss": 1.3833, + "step": 2100 + }, + { + "epoch": 4.688888888888889, + "grad_norm": 0.9162390232086182, + "learning_rate": 0.0002, + "loss": 1.3541, + "step": 2110 + }, + { + "epoch": 4.711111111111111, + "grad_norm": 0.8980328440666199, + "learning_rate": 0.0002, + "loss": 1.4603, + "step": 2120 + }, + { + "epoch": 4.733333333333333, + "grad_norm": 0.8109711408615112, + "learning_rate": 0.0002, + "loss": 1.4043, + "step": 2130 + }, + { + "epoch": 4.7555555555555555, + "grad_norm": 0.7372606992721558, + "learning_rate": 0.0002, + "loss": 1.373, + "step": 2140 + }, + { + "epoch": 4.777777777777778, + "grad_norm": 0.7527457475662231, + "learning_rate": 0.0002, + "loss": 1.4439, + "step": 2150 + }, + { + "epoch": 4.8, + "grad_norm": 1.0380001068115234, + "learning_rate": 0.0002, + "loss": 1.2999, + "step": 2160 + }, + { + "epoch": 4.822222222222222, + "grad_norm": 0.7166368365287781, + "learning_rate": 0.0002, + "loss": 1.3562, + "step": 2170 + }, + { + "epoch": 4.844444444444444, + "grad_norm": 0.784548282623291, + "learning_rate": 0.0002, + "loss": 1.3917, + "step": 2180 + }, + { + "epoch": 4.866666666666667, + "grad_norm": 0.7771317958831787, + "learning_rate": 0.0002, + "loss": 1.3376, + "step": 2190 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.7710300087928772, + "learning_rate": 0.0002, + "loss": 1.3315, + "step": 2200 + }, + { + "epoch": 4.911111111111111, + "grad_norm": 0.7715084552764893, + "learning_rate": 0.0002, + "loss": 1.3676, + "step": 2210 + }, + { + "epoch": 4.933333333333334, + "grad_norm": 0.7888006567955017, + "learning_rate": 0.0002, + "loss": 1.5352, + "step": 2220 + }, + { + "epoch": 4.955555555555556, + "grad_norm": 0.800684928894043, + "learning_rate": 0.0002, + "loss": 1.4139, + "step": 2230 + }, + { + "epoch": 4.977777777777778, + "grad_norm": 0.7710039019584656, + "learning_rate": 0.0002, + "loss": 1.4343, + "step": 2240 + }, + { + "epoch": 5.0, + "grad_norm": 0.8617033958435059, + "learning_rate": 0.0002, + "loss": 1.3501, + "step": 2250 + }, + { + "epoch": 5.0, + "eval_loss": 1.9718151092529297, + "eval_runtime": 38.8999, + "eval_samples_per_second": 13.239, + "eval_steps_per_second": 1.671, + "step": 2250 + }, + { + "epoch": 5.022222222222222, + "grad_norm": 1.07399582862854, + "learning_rate": 0.0002, + "loss": 1.19, + "step": 2260 + }, + { + "epoch": 5.044444444444444, + "grad_norm": 0.6598460674285889, + "learning_rate": 0.0002, + "loss": 1.2299, + "step": 2270 + }, + { + "epoch": 5.066666666666666, + "grad_norm": 1.1039506196975708, + "learning_rate": 0.0002, + "loss": 1.2333, + "step": 2280 + }, + { + "epoch": 5.088888888888889, + "grad_norm": 1.0624054670333862, + "learning_rate": 0.0002, + "loss": 1.2412, + "step": 2290 + }, + { + "epoch": 5.111111111111111, + "grad_norm": 0.849583625793457, + "learning_rate": 0.0002, + "loss": 1.184, + "step": 2300 + }, + { + "epoch": 5.133333333333334, + "grad_norm": 1.0143699645996094, + "learning_rate": 0.0002, + "loss": 1.1884, + "step": 2310 + }, + { + "epoch": 5.155555555555556, + "grad_norm": 0.8990702629089355, + "learning_rate": 0.0002, + "loss": 1.2133, + "step": 2320 + }, + { + "epoch": 5.177777777777778, + "grad_norm": 0.9822764992713928, + "learning_rate": 0.0002, + "loss": 1.2091, + "step": 2330 + }, + { + "epoch": 5.2, + "grad_norm": 0.9632459282875061, + "learning_rate": 0.0002, + "loss": 1.1775, + "step": 2340 + }, + { + "epoch": 5.222222222222222, + "grad_norm": 1.0897903442382812, + "learning_rate": 0.0002, + "loss": 1.1821, + "step": 2350 + }, + { + "epoch": 5.2444444444444445, + "grad_norm": 1.155950665473938, + "learning_rate": 0.0002, + "loss": 1.2976, + "step": 2360 + }, + { + "epoch": 5.266666666666667, + "grad_norm": 1.0566821098327637, + "learning_rate": 0.0002, + "loss": 1.1662, + "step": 2370 + }, + { + "epoch": 5.288888888888889, + "grad_norm": 1.191604733467102, + "learning_rate": 0.0002, + "loss": 1.2809, + "step": 2380 + }, + { + "epoch": 5.311111111111111, + "grad_norm": 0.852453887462616, + "learning_rate": 0.0002, + "loss": 1.2431, + "step": 2390 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.9649669528007507, + "learning_rate": 0.0002, + "loss": 1.2106, + "step": 2400 + }, + { + "epoch": 5.355555555555555, + "grad_norm": 1.0731003284454346, + "learning_rate": 0.0002, + "loss": 1.2433, + "step": 2410 + }, + { + "epoch": 5.377777777777778, + "grad_norm": 0.9628495573997498, + "learning_rate": 0.0002, + "loss": 1.1737, + "step": 2420 + }, + { + "epoch": 5.4, + "grad_norm": 0.9268819093704224, + "learning_rate": 0.0002, + "loss": 1.3166, + "step": 2430 + }, + { + "epoch": 5.4222222222222225, + "grad_norm": 1.1104000806808472, + "learning_rate": 0.0002, + "loss": 1.2114, + "step": 2440 + }, + { + "epoch": 5.444444444444445, + "grad_norm": 1.0439373254776, + "learning_rate": 0.0002, + "loss": 1.2151, + "step": 2450 + }, + { + "epoch": 5.466666666666667, + "grad_norm": 1.0366657972335815, + "learning_rate": 0.0002, + "loss": 1.2458, + "step": 2460 + }, + { + "epoch": 5.488888888888889, + "grad_norm": 1.0604808330535889, + "learning_rate": 0.0002, + "loss": 1.2021, + "step": 2470 + }, + { + "epoch": 5.511111111111111, + "grad_norm": 0.8845253586769104, + "learning_rate": 0.0002, + "loss": 1.2188, + "step": 2480 + }, + { + "epoch": 5.533333333333333, + "grad_norm": 0.8200256824493408, + "learning_rate": 0.0002, + "loss": 1.2296, + "step": 2490 + }, + { + "epoch": 5.555555555555555, + "grad_norm": 0.9628723859786987, + "learning_rate": 0.0002, + "loss": 1.2632, + "step": 2500 + }, + { + "epoch": 5.5777777777777775, + "grad_norm": 1.0758650302886963, + "learning_rate": 0.0002, + "loss": 1.2723, + "step": 2510 + }, + { + "epoch": 5.6, + "grad_norm": 1.0113487243652344, + "learning_rate": 0.0002, + "loss": 1.2298, + "step": 2520 + }, + { + "epoch": 5.622222222222222, + "grad_norm": 1.260536551475525, + "learning_rate": 0.0002, + "loss": 1.2226, + "step": 2530 + }, + { + "epoch": 5.644444444444445, + "grad_norm": 0.9229527115821838, + "learning_rate": 0.0002, + "loss": 1.227, + "step": 2540 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 0.9378697276115417, + "learning_rate": 0.0002, + "loss": 1.2223, + "step": 2550 + }, + { + "epoch": 5.688888888888889, + "grad_norm": 1.0404350757598877, + "learning_rate": 0.0002, + "loss": 1.2759, + "step": 2560 + }, + { + "epoch": 5.711111111111111, + "grad_norm": 1.1879961490631104, + "learning_rate": 0.0002, + "loss": 1.2132, + "step": 2570 + }, + { + "epoch": 5.733333333333333, + "grad_norm": 0.8881482481956482, + "learning_rate": 0.0002, + "loss": 1.2181, + "step": 2580 + }, + { + "epoch": 5.7555555555555555, + "grad_norm": 1.1428065299987793, + "learning_rate": 0.0002, + "loss": 1.2419, + "step": 2590 + }, + { + "epoch": 5.777777777777778, + "grad_norm": 0.8970609903335571, + "learning_rate": 0.0002, + "loss": 1.2682, + "step": 2600 + }, + { + "epoch": 5.8, + "grad_norm": 1.2084497213363647, + "learning_rate": 0.0002, + "loss": 1.2285, + "step": 2610 + }, + { + "epoch": 5.822222222222222, + "grad_norm": 1.04214608669281, + "learning_rate": 0.0002, + "loss": 1.2004, + "step": 2620 + }, + { + "epoch": 5.844444444444444, + "grad_norm": 1.0671849250793457, + "learning_rate": 0.0002, + "loss": 1.2388, + "step": 2630 + }, + { + "epoch": 5.866666666666667, + "grad_norm": 1.009602427482605, + "learning_rate": 0.0002, + "loss": 1.1714, + "step": 2640 + }, + { + "epoch": 5.888888888888889, + "grad_norm": 0.9787904024124146, + "learning_rate": 0.0002, + "loss": 1.2292, + "step": 2650 + }, + { + "epoch": 5.911111111111111, + "grad_norm": 1.0043761730194092, + "learning_rate": 0.0002, + "loss": 1.2404, + "step": 2660 + }, + { + "epoch": 5.933333333333334, + "grad_norm": 0.9855443239212036, + "learning_rate": 0.0002, + "loss": 1.2712, + "step": 2670 + }, + { + "epoch": 5.955555555555556, + "grad_norm": 1.1488507986068726, + "learning_rate": 0.0002, + "loss": 1.3112, + "step": 2680 + }, + { + "epoch": 5.977777777777778, + "grad_norm": 0.9939966797828674, + "learning_rate": 0.0002, + "loss": 1.2576, + "step": 2690 + }, + { + "epoch": 6.0, + "grad_norm": 1.0444952249526978, + "learning_rate": 0.0002, + "loss": 1.2847, + "step": 2700 + }, + { + "epoch": 6.0, + "eval_loss": 2.0881619453430176, + "eval_runtime": 39.6891, + "eval_samples_per_second": 12.976, + "eval_steps_per_second": 1.638, + "step": 2700 + } + ], + "logging_steps": 10, + "max_steps": 3600, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.249499544551424e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..08bcb50b9f534803ed2a4a4be696b74ae373982a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-2700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:333a3e3301276a753d48f5fabf4c169b8961c6e6ef7b7a7f931ba463d72983a4 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..338deb7586b2d831c4323ff615ee179615910524 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ed2ace76009f0684ea50d0b4390701c92f4bf5a92722a4506518e5aa4000944 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e0c43734ddad3831a451593190583f7e1ceab41 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a27859d9fc5c6bf573f703c1f407c7fe61923d547b8231f5f239e58dea6eccc +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..10f4aa536a9fbdbba5043278757bfc418c8e9bcb --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b70cabe7c8fd27eedcb0c00cadf1298f07a9ea51e2ce7a30f4d9e1373023d39 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..036287c3a952701af183368d1f2080b9121f1fd8 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5642552f8f0f6f77002f3173ac47fbca753af1a50348f3afb22784275f460142 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3db664b71aacd6f12937e64b40ad2487116c0340 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/trainer_state.json @@ -0,0 +1,2294 @@ +{ + "best_metric": 1.8250652551651, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900", + "epoch": 7.0, + "eval_steps": 10, + "global_step": 3150, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.022222222222222223, + "grad_norm": 0.5109436511993408, + "learning_rate": 0.0002, + "loss": 2.5874, + "step": 10 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 0.4870035946369171, + "learning_rate": 0.0002, + "loss": 2.2952, + "step": 20 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.535464882850647, + "learning_rate": 0.0002, + "loss": 2.0554, + "step": 30 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 0.49077996611595154, + "learning_rate": 0.0002, + "loss": 2.0067, + "step": 40 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.4671357572078705, + "learning_rate": 0.0002, + "loss": 2.0673, + "step": 50 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.4970313608646393, + "learning_rate": 0.0002, + "loss": 1.8751, + "step": 60 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 0.4438260495662689, + "learning_rate": 0.0002, + "loss": 1.9419, + "step": 70 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 0.5089705586433411, + "learning_rate": 0.0002, + "loss": 1.9856, + "step": 80 + }, + { + "epoch": 0.2, + "grad_norm": 0.4645078182220459, + "learning_rate": 0.0002, + "loss": 1.8805, + "step": 90 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.46095192432403564, + "learning_rate": 0.0002, + "loss": 1.807, + "step": 100 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 0.43338075280189514, + "learning_rate": 0.0002, + "loss": 1.891, + "step": 110 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.4433900713920593, + "learning_rate": 0.0002, + "loss": 1.84, + "step": 120 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 0.7018499970436096, + "learning_rate": 0.0002, + "loss": 1.8895, + "step": 130 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 0.37056994438171387, + "learning_rate": 0.0002, + "loss": 1.8273, + "step": 140 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.40634623169898987, + "learning_rate": 0.0002, + "loss": 1.8059, + "step": 150 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 0.41917353868484497, + "learning_rate": 0.0002, + "loss": 1.8393, + "step": 160 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 0.42392489314079285, + "learning_rate": 0.0002, + "loss": 1.8658, + "step": 170 + }, + { + "epoch": 0.4, + "grad_norm": 0.4281010627746582, + "learning_rate": 0.0002, + "loss": 1.8617, + "step": 180 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 0.38542497158050537, + "learning_rate": 0.0002, + "loss": 1.8163, + "step": 190 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.36003032326698303, + "learning_rate": 0.0002, + "loss": 1.8951, + "step": 200 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.37858229875564575, + "learning_rate": 0.0002, + "loss": 1.8611, + "step": 210 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 0.49986031651496887, + "learning_rate": 0.0002, + "loss": 1.8078, + "step": 220 + }, + { + "epoch": 0.5111111111111111, + "grad_norm": 0.3937094807624817, + "learning_rate": 0.0002, + "loss": 1.7413, + "step": 230 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.4566134512424469, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 240 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.3602476418018341, + "learning_rate": 0.0002, + "loss": 1.8303, + "step": 250 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 0.36321184039115906, + "learning_rate": 0.0002, + "loss": 1.7963, + "step": 260 + }, + { + "epoch": 0.6, + "grad_norm": 0.3808199167251587, + "learning_rate": 0.0002, + "loss": 1.9055, + "step": 270 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 0.38910621404647827, + "learning_rate": 0.0002, + "loss": 1.7916, + "step": 280 + }, + { + "epoch": 0.6444444444444445, + "grad_norm": 0.31913551688194275, + "learning_rate": 0.0002, + "loss": 1.7958, + "step": 290 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.34734025597572327, + "learning_rate": 0.0002, + "loss": 1.7468, + "step": 300 + }, + { + "epoch": 0.6888888888888889, + "grad_norm": 0.3517725467681885, + "learning_rate": 0.0002, + "loss": 1.8092, + "step": 310 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 0.3804526627063751, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 320 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.5592505931854248, + "learning_rate": 0.0002, + "loss": 1.8131, + "step": 330 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 0.36154472827911377, + "learning_rate": 0.0002, + "loss": 1.8827, + "step": 340 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.43970227241516113, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 350 + }, + { + "epoch": 0.8, + "grad_norm": 0.3525223731994629, + "learning_rate": 0.0002, + "loss": 1.8333, + "step": 360 + }, + { + "epoch": 0.8222222222222222, + "grad_norm": 0.3706997036933899, + "learning_rate": 0.0002, + "loss": 1.7802, + "step": 370 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 0.34138166904449463, + "learning_rate": 0.0002, + "loss": 1.7937, + "step": 380 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.4090622365474701, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 390 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.3729974031448364, + "learning_rate": 0.0002, + "loss": 1.7685, + "step": 400 + }, + { + "epoch": 0.9111111111111111, + "grad_norm": 0.3742152452468872, + "learning_rate": 0.0002, + "loss": 1.836, + "step": 410 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.37685129046440125, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 420 + }, + { + "epoch": 0.9555555555555556, + "grad_norm": 0.34339913725852966, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 430 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 0.31139856576919556, + "learning_rate": 0.0002, + "loss": 1.7651, + "step": 440 + }, + { + "epoch": 1.0, + "grad_norm": 0.3577502965927124, + "learning_rate": 0.0002, + "loss": 1.8471, + "step": 450 + }, + { + "epoch": 1.0, + "eval_loss": 1.8310279846191406, + "eval_runtime": 38.8374, + "eval_samples_per_second": 13.26, + "eval_steps_per_second": 1.674, + "step": 450 + }, + { + "epoch": 1.0222222222222221, + "grad_norm": 0.3257788419723511, + "learning_rate": 0.0002, + "loss": 1.7741, + "step": 460 + }, + { + "epoch": 1.0444444444444445, + "grad_norm": 0.3878970146179199, + "learning_rate": 0.0002, + "loss": 1.8031, + "step": 470 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 0.364427387714386, + "learning_rate": 0.0002, + "loss": 1.701, + "step": 480 + }, + { + "epoch": 1.0888888888888888, + "grad_norm": 0.3374682664871216, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 490 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.35822123289108276, + "learning_rate": 0.0002, + "loss": 1.7308, + "step": 500 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 0.3748345673084259, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 510 + }, + { + "epoch": 1.1555555555555554, + "grad_norm": 0.3422437012195587, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 520 + }, + { + "epoch": 1.1777777777777778, + "grad_norm": 0.4289326071739197, + "learning_rate": 0.0002, + "loss": 1.7649, + "step": 530 + }, + { + "epoch": 1.2, + "grad_norm": 0.3706769645214081, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 540 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.4024733603000641, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 550 + }, + { + "epoch": 1.2444444444444445, + "grad_norm": 0.3960128128528595, + "learning_rate": 0.0002, + "loss": 1.7641, + "step": 560 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 0.38222864270210266, + "learning_rate": 0.0002, + "loss": 1.755, + "step": 570 + }, + { + "epoch": 1.2888888888888888, + "grad_norm": 0.4073713421821594, + "learning_rate": 0.0002, + "loss": 1.735, + "step": 580 + }, + { + "epoch": 1.3111111111111111, + "grad_norm": 0.3875499963760376, + "learning_rate": 0.0002, + "loss": 1.7183, + "step": 590 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.39740806818008423, + "learning_rate": 0.0002, + "loss": 1.7492, + "step": 600 + }, + { + "epoch": 1.3555555555555556, + "grad_norm": 0.38432490825653076, + "learning_rate": 0.0002, + "loss": 1.8478, + "step": 610 + }, + { + "epoch": 1.3777777777777778, + "grad_norm": 0.402729868888855, + "learning_rate": 0.0002, + "loss": 1.7327, + "step": 620 + }, + { + "epoch": 1.4, + "grad_norm": 0.36683231592178345, + "learning_rate": 0.0002, + "loss": 1.6634, + "step": 630 + }, + { + "epoch": 1.4222222222222223, + "grad_norm": 0.3883286714553833, + "learning_rate": 0.0002, + "loss": 1.8059, + "step": 640 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.4087409973144531, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 650 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 0.4042017459869385, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 660 + }, + { + "epoch": 1.488888888888889, + "grad_norm": 0.40149256587028503, + "learning_rate": 0.0002, + "loss": 1.7466, + "step": 670 + }, + { + "epoch": 1.511111111111111, + "grad_norm": 0.45146510004997253, + "learning_rate": 0.0002, + "loss": 1.7398, + "step": 680 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.4098089039325714, + "learning_rate": 0.0002, + "loss": 1.6923, + "step": 690 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.4181336760520935, + "learning_rate": 0.0002, + "loss": 1.6847, + "step": 700 + }, + { + "epoch": 1.5777777777777777, + "grad_norm": 1.3722974061965942, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 710 + }, + { + "epoch": 1.6, + "grad_norm": 0.3965230882167816, + "learning_rate": 0.0002, + "loss": 1.7424, + "step": 720 + }, + { + "epoch": 1.6222222222222222, + "grad_norm": 0.3842000663280487, + "learning_rate": 0.0002, + "loss": 1.7782, + "step": 730 + }, + { + "epoch": 1.6444444444444444, + "grad_norm": 0.3603688180446625, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 740 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.39973509311676025, + "learning_rate": 0.0002, + "loss": 1.7476, + "step": 750 + }, + { + "epoch": 1.6888888888888889, + "grad_norm": 0.3687385618686676, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 760 + }, + { + "epoch": 1.7111111111111112, + "grad_norm": 0.4267722964286804, + "learning_rate": 0.0002, + "loss": 1.7987, + "step": 770 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 0.41301295161247253, + "learning_rate": 0.0002, + "loss": 1.8041, + "step": 780 + }, + { + "epoch": 1.7555555555555555, + "grad_norm": 0.3945430517196655, + "learning_rate": 0.0002, + "loss": 1.7284, + "step": 790 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.4037930965423584, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 800 + }, + { + "epoch": 1.8, + "grad_norm": 0.406893253326416, + "learning_rate": 0.0002, + "loss": 1.6874, + "step": 810 + }, + { + "epoch": 1.8222222222222222, + "grad_norm": 0.4600457549095154, + "learning_rate": 0.0002, + "loss": 1.7628, + "step": 820 + }, + { + "epoch": 1.8444444444444446, + "grad_norm": 0.4195384085178375, + "learning_rate": 0.0002, + "loss": 1.8222, + "step": 830 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 0.3854130506515503, + "learning_rate": 0.0002, + "loss": 1.7123, + "step": 840 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.38279038667678833, + "learning_rate": 0.0002, + "loss": 1.7227, + "step": 850 + }, + { + "epoch": 1.911111111111111, + "grad_norm": 0.38249439001083374, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 860 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 0.42977792024612427, + "learning_rate": 0.0002, + "loss": 1.8058, + "step": 870 + }, + { + "epoch": 1.9555555555555557, + "grad_norm": 0.4109351933002472, + "learning_rate": 0.0002, + "loss": 1.713, + "step": 880 + }, + { + "epoch": 1.9777777777777779, + "grad_norm": 0.3734486699104309, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 890 + }, + { + "epoch": 2.0, + "grad_norm": 0.3603087067604065, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 900 + }, + { + "epoch": 2.0, + "eval_loss": 1.8250652551651, + "eval_runtime": 38.8657, + "eval_samples_per_second": 13.251, + "eval_steps_per_second": 1.672, + "step": 900 + }, + { + "epoch": 2.022222222222222, + "grad_norm": 0.4014144241809845, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 910 + }, + { + "epoch": 2.0444444444444443, + "grad_norm": 0.4338063597679138, + "learning_rate": 0.0002, + "loss": 1.6604, + "step": 920 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 0.3693605065345764, + "learning_rate": 0.0002, + "loss": 1.6354, + "step": 930 + }, + { + "epoch": 2.088888888888889, + "grad_norm": 0.4040255844593048, + "learning_rate": 0.0002, + "loss": 1.6186, + "step": 940 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.43481820821762085, + "learning_rate": 0.0002, + "loss": 1.5976, + "step": 950 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 0.41632869839668274, + "learning_rate": 0.0002, + "loss": 1.6081, + "step": 960 + }, + { + "epoch": 2.1555555555555554, + "grad_norm": 0.4633755385875702, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 970 + }, + { + "epoch": 2.1777777777777776, + "grad_norm": 0.43926581740379333, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 980 + }, + { + "epoch": 2.2, + "grad_norm": 0.4757233262062073, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 990 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.5010586977005005, + "learning_rate": 0.0002, + "loss": 1.648, + "step": 1000 + }, + { + "epoch": 2.2444444444444445, + "grad_norm": 0.44900986552238464, + "learning_rate": 0.0002, + "loss": 1.6454, + "step": 1010 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 0.41274750232696533, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 1020 + }, + { + "epoch": 2.2888888888888888, + "grad_norm": 0.44672393798828125, + "learning_rate": 0.0002, + "loss": 1.6592, + "step": 1030 + }, + { + "epoch": 2.311111111111111, + "grad_norm": 0.4826269745826721, + "learning_rate": 0.0002, + "loss": 1.6349, + "step": 1040 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.4650685489177704, + "learning_rate": 0.0002, + "loss": 1.5988, + "step": 1050 + }, + { + "epoch": 2.3555555555555556, + "grad_norm": 0.42507848143577576, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 1060 + }, + { + "epoch": 2.3777777777777778, + "grad_norm": 0.45653030276298523, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 1070 + }, + { + "epoch": 2.4, + "grad_norm": 0.44534122943878174, + "learning_rate": 0.0002, + "loss": 1.6469, + "step": 1080 + }, + { + "epoch": 2.422222222222222, + "grad_norm": 0.4241289794445038, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 1090 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.5004808306694031, + "learning_rate": 0.0002, + "loss": 1.6935, + "step": 1100 + }, + { + "epoch": 2.466666666666667, + "grad_norm": 0.41425490379333496, + "learning_rate": 0.0002, + "loss": 1.6833, + "step": 1110 + }, + { + "epoch": 2.488888888888889, + "grad_norm": 0.44362279772758484, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 1120 + }, + { + "epoch": 2.511111111111111, + "grad_norm": 0.5530985593795776, + "learning_rate": 0.0002, + "loss": 1.6394, + "step": 1130 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 0.4290637969970703, + "learning_rate": 0.0002, + "loss": 1.64, + "step": 1140 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.4957487881183624, + "learning_rate": 0.0002, + "loss": 1.76, + "step": 1150 + }, + { + "epoch": 2.5777777777777775, + "grad_norm": 0.5082747340202332, + "learning_rate": 0.0002, + "loss": 1.613, + "step": 1160 + }, + { + "epoch": 2.6, + "grad_norm": 0.478722482919693, + "learning_rate": 0.0002, + "loss": 1.6702, + "step": 1170 + }, + { + "epoch": 2.6222222222222222, + "grad_norm": 0.436454176902771, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 1180 + }, + { + "epoch": 2.6444444444444444, + "grad_norm": 0.4905032515525818, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 1190 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.4815700054168701, + "learning_rate": 0.0002, + "loss": 1.6216, + "step": 1200 + }, + { + "epoch": 2.688888888888889, + "grad_norm": 0.3965534269809723, + "learning_rate": 0.0002, + "loss": 1.6994, + "step": 1210 + }, + { + "epoch": 2.7111111111111112, + "grad_norm": 0.43282169103622437, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 1220 + }, + { + "epoch": 2.7333333333333334, + "grad_norm": 0.45512479543685913, + "learning_rate": 0.0002, + "loss": 1.6455, + "step": 1230 + }, + { + "epoch": 2.7555555555555555, + "grad_norm": 0.44370076060295105, + "learning_rate": 0.0002, + "loss": 1.6444, + "step": 1240 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.4750686287879944, + "learning_rate": 0.0002, + "loss": 1.7106, + "step": 1250 + }, + { + "epoch": 2.8, + "grad_norm": 0.41953766345977783, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 1260 + }, + { + "epoch": 2.822222222222222, + "grad_norm": 0.4887140095233917, + "learning_rate": 0.0002, + "loss": 1.6203, + "step": 1270 + }, + { + "epoch": 2.8444444444444446, + "grad_norm": 0.46718958020210266, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 1280 + }, + { + "epoch": 2.8666666666666667, + "grad_norm": 0.48510900139808655, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 1290 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.4504084289073944, + "learning_rate": 0.0002, + "loss": 1.6825, + "step": 1300 + }, + { + "epoch": 2.911111111111111, + "grad_norm": 0.42119622230529785, + "learning_rate": 0.0002, + "loss": 1.7453, + "step": 1310 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 0.4763694107532501, + "learning_rate": 0.0002, + "loss": 1.6662, + "step": 1320 + }, + { + "epoch": 2.9555555555555557, + "grad_norm": 0.422810822725296, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 1330 + }, + { + "epoch": 2.977777777777778, + "grad_norm": 0.4768871068954468, + "learning_rate": 0.0002, + "loss": 1.6578, + "step": 1340 + }, + { + "epoch": 3.0, + "grad_norm": 0.48259881138801575, + "learning_rate": 0.0002, + "loss": 1.6167, + "step": 1350 + }, + { + "epoch": 3.0, + "eval_loss": 1.8452560901641846, + "eval_runtime": 38.8621, + "eval_samples_per_second": 13.252, + "eval_steps_per_second": 1.673, + "step": 1350 + }, + { + "epoch": 3.022222222222222, + "grad_norm": 0.6933313012123108, + "learning_rate": 0.0002, + "loss": 1.5351, + "step": 1360 + }, + { + "epoch": 3.0444444444444443, + "grad_norm": 0.5870710611343384, + "learning_rate": 0.0002, + "loss": 1.5542, + "step": 1370 + }, + { + "epoch": 3.066666666666667, + "grad_norm": 0.602210283279419, + "learning_rate": 0.0002, + "loss": 1.511, + "step": 1380 + }, + { + "epoch": 3.088888888888889, + "grad_norm": 0.6461787819862366, + "learning_rate": 0.0002, + "loss": 1.5272, + "step": 1390 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 0.5839587450027466, + "learning_rate": 0.0002, + "loss": 1.4813, + "step": 1400 + }, + { + "epoch": 3.1333333333333333, + "grad_norm": 0.5757876038551331, + "learning_rate": 0.0002, + "loss": 1.505, + "step": 1410 + }, + { + "epoch": 3.1555555555555554, + "grad_norm": 0.5862616300582886, + "learning_rate": 0.0002, + "loss": 1.4963, + "step": 1420 + }, + { + "epoch": 3.1777777777777776, + "grad_norm": 0.6103630065917969, + "learning_rate": 0.0002, + "loss": 1.5144, + "step": 1430 + }, + { + "epoch": 3.2, + "grad_norm": 0.9309254884719849, + "learning_rate": 0.0002, + "loss": 1.5406, + "step": 1440 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.5360018014907837, + "learning_rate": 0.0002, + "loss": 1.487, + "step": 1450 + }, + { + "epoch": 3.2444444444444445, + "grad_norm": 0.5448758602142334, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 1460 + }, + { + "epoch": 3.2666666666666666, + "grad_norm": 0.5973812341690063, + "learning_rate": 0.0002, + "loss": 1.5595, + "step": 1470 + }, + { + "epoch": 3.2888888888888888, + "grad_norm": 0.6245622038841248, + "learning_rate": 0.0002, + "loss": 1.5223, + "step": 1480 + }, + { + "epoch": 3.311111111111111, + "grad_norm": 0.6533768773078918, + "learning_rate": 0.0002, + "loss": 1.4795, + "step": 1490 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.5765811204910278, + "learning_rate": 0.0002, + "loss": 1.5562, + "step": 1500 + }, + { + "epoch": 3.3555555555555556, + "grad_norm": 0.591395378112793, + "learning_rate": 0.0002, + "loss": 1.5405, + "step": 1510 + }, + { + "epoch": 3.3777777777777778, + "grad_norm": 0.5842425227165222, + "learning_rate": 0.0002, + "loss": 1.5658, + "step": 1520 + }, + { + "epoch": 3.4, + "grad_norm": 0.5731365084648132, + "learning_rate": 0.0002, + "loss": 1.5065, + "step": 1530 + }, + { + "epoch": 3.422222222222222, + "grad_norm": 0.5841306447982788, + "learning_rate": 0.0002, + "loss": 1.5438, + "step": 1540 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.6503536701202393, + "learning_rate": 0.0002, + "loss": 1.4922, + "step": 1550 + }, + { + "epoch": 3.466666666666667, + "grad_norm": 0.6170967221260071, + "learning_rate": 0.0002, + "loss": 1.5493, + "step": 1560 + }, + { + "epoch": 3.488888888888889, + "grad_norm": 0.5576487183570862, + "learning_rate": 0.0002, + "loss": 1.5098, + "step": 1570 + }, + { + "epoch": 3.511111111111111, + "grad_norm": 0.7082911133766174, + "learning_rate": 0.0002, + "loss": 1.472, + "step": 1580 + }, + { + "epoch": 3.533333333333333, + "grad_norm": 0.6159376502037048, + "learning_rate": 0.0002, + "loss": 1.5594, + "step": 1590 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 0.5972959399223328, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 1600 + }, + { + "epoch": 3.5777777777777775, + "grad_norm": 0.5787310004234314, + "learning_rate": 0.0002, + "loss": 1.4876, + "step": 1610 + }, + { + "epoch": 3.6, + "grad_norm": 0.5846341252326965, + "learning_rate": 0.0002, + "loss": 1.4887, + "step": 1620 + }, + { + "epoch": 3.6222222222222222, + "grad_norm": 0.5906197428703308, + "learning_rate": 0.0002, + "loss": 1.542, + "step": 1630 + }, + { + "epoch": 3.6444444444444444, + "grad_norm": 0.6305760145187378, + "learning_rate": 0.0002, + "loss": 1.4941, + "step": 1640 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.7448979616165161, + "learning_rate": 0.0002, + "loss": 1.4677, + "step": 1650 + }, + { + "epoch": 3.688888888888889, + "grad_norm": 0.5906165242195129, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 1660 + }, + { + "epoch": 3.7111111111111112, + "grad_norm": 0.605032742023468, + "learning_rate": 0.0002, + "loss": 1.4882, + "step": 1670 + }, + { + "epoch": 3.7333333333333334, + "grad_norm": 0.6117229461669922, + "learning_rate": 0.0002, + "loss": 1.5804, + "step": 1680 + }, + { + "epoch": 3.7555555555555555, + "grad_norm": 0.613581120967865, + "learning_rate": 0.0002, + "loss": 1.5131, + "step": 1690 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 0.6244436502456665, + "learning_rate": 0.0002, + "loss": 1.5074, + "step": 1700 + }, + { + "epoch": 3.8, + "grad_norm": 0.6236702799797058, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 1710 + }, + { + "epoch": 3.822222222222222, + "grad_norm": 0.639141857624054, + "learning_rate": 0.0002, + "loss": 1.6542, + "step": 1720 + }, + { + "epoch": 3.8444444444444446, + "grad_norm": 0.5782344937324524, + "learning_rate": 0.0002, + "loss": 1.536, + "step": 1730 + }, + { + "epoch": 3.8666666666666667, + "grad_norm": 0.5952938795089722, + "learning_rate": 0.0002, + "loss": 1.5355, + "step": 1740 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.5573042035102844, + "learning_rate": 0.0002, + "loss": 1.5205, + "step": 1750 + }, + { + "epoch": 3.911111111111111, + "grad_norm": 0.6114351749420166, + "learning_rate": 0.0002, + "loss": 1.5066, + "step": 1760 + }, + { + "epoch": 3.9333333333333336, + "grad_norm": 0.5973817110061646, + "learning_rate": 0.0002, + "loss": 1.5706, + "step": 1770 + }, + { + "epoch": 3.9555555555555557, + "grad_norm": 0.602317750453949, + "learning_rate": 0.0002, + "loss": 1.5003, + "step": 1780 + }, + { + "epoch": 3.977777777777778, + "grad_norm": 0.5965437293052673, + "learning_rate": 0.0002, + "loss": 1.5022, + "step": 1790 + }, + { + "epoch": 4.0, + "grad_norm": 0.5641552209854126, + "learning_rate": 0.0002, + "loss": 1.5031, + "step": 1800 + }, + { + "epoch": 4.0, + "eval_loss": 1.892098069190979, + "eval_runtime": 38.8755, + "eval_samples_per_second": 13.247, + "eval_steps_per_second": 1.672, + "step": 1800 + }, + { + "epoch": 4.022222222222222, + "grad_norm": 0.8302594423294067, + "learning_rate": 0.0002, + "loss": 1.3894, + "step": 1810 + }, + { + "epoch": 4.044444444444444, + "grad_norm": 0.6695230603218079, + "learning_rate": 0.0002, + "loss": 1.3727, + "step": 1820 + }, + { + "epoch": 4.066666666666666, + "grad_norm": 0.7911471128463745, + "learning_rate": 0.0002, + "loss": 1.3064, + "step": 1830 + }, + { + "epoch": 4.088888888888889, + "grad_norm": 0.7044888138771057, + "learning_rate": 0.0002, + "loss": 1.4574, + "step": 1840 + }, + { + "epoch": 4.111111111111111, + "grad_norm": 0.7057249546051025, + "learning_rate": 0.0002, + "loss": 1.3941, + "step": 1850 + }, + { + "epoch": 4.133333333333334, + "grad_norm": 0.8762815594673157, + "learning_rate": 0.0002, + "loss": 1.4052, + "step": 1860 + }, + { + "epoch": 4.155555555555556, + "grad_norm": 0.7619158029556274, + "learning_rate": 0.0002, + "loss": 1.3784, + "step": 1870 + }, + { + "epoch": 4.177777777777778, + "grad_norm": 0.7711658477783203, + "learning_rate": 0.0002, + "loss": 1.3581, + "step": 1880 + }, + { + "epoch": 4.2, + "grad_norm": 0.9732598662376404, + "learning_rate": 0.0002, + "loss": 1.3995, + "step": 1890 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 0.9070265889167786, + "learning_rate": 0.0002, + "loss": 1.3353, + "step": 1900 + }, + { + "epoch": 4.2444444444444445, + "grad_norm": 0.8274767994880676, + "learning_rate": 0.0002, + "loss": 1.3947, + "step": 1910 + }, + { + "epoch": 4.266666666666667, + "grad_norm": 0.8514227271080017, + "learning_rate": 0.0002, + "loss": 1.3392, + "step": 1920 + }, + { + "epoch": 4.288888888888889, + "grad_norm": 0.7356534600257874, + "learning_rate": 0.0002, + "loss": 1.3492, + "step": 1930 + }, + { + "epoch": 4.311111111111111, + "grad_norm": 0.8226608037948608, + "learning_rate": 0.0002, + "loss": 1.3708, + "step": 1940 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 0.8347907066345215, + "learning_rate": 0.0002, + "loss": 1.3652, + "step": 1950 + }, + { + "epoch": 4.355555555555555, + "grad_norm": 0.8509323000907898, + "learning_rate": 0.0002, + "loss": 1.3415, + "step": 1960 + }, + { + "epoch": 4.377777777777778, + "grad_norm": 0.8776063323020935, + "learning_rate": 0.0002, + "loss": 1.3796, + "step": 1970 + }, + { + "epoch": 4.4, + "grad_norm": 0.8022271990776062, + "learning_rate": 0.0002, + "loss": 1.438, + "step": 1980 + }, + { + "epoch": 4.4222222222222225, + "grad_norm": 0.7984752058982849, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 1990 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.7349720001220703, + "learning_rate": 0.0002, + "loss": 1.4214, + "step": 2000 + }, + { + "epoch": 4.466666666666667, + "grad_norm": 0.7778817415237427, + "learning_rate": 0.0002, + "loss": 1.4174, + "step": 2010 + }, + { + "epoch": 4.488888888888889, + "grad_norm": 0.9361467361450195, + "learning_rate": 0.0002, + "loss": 1.3365, + "step": 2020 + }, + { + "epoch": 4.511111111111111, + "grad_norm": 0.7839348912239075, + "learning_rate": 0.0002, + "loss": 1.4129, + "step": 2030 + }, + { + "epoch": 4.533333333333333, + "grad_norm": 0.8361981511116028, + "learning_rate": 0.0002, + "loss": 1.3761, + "step": 2040 + }, + { + "epoch": 4.555555555555555, + "grad_norm": 1.9877147674560547, + "learning_rate": 0.0002, + "loss": 1.4085, + "step": 2050 + }, + { + "epoch": 4.5777777777777775, + "grad_norm": 0.7506140470504761, + "learning_rate": 0.0002, + "loss": 1.329, + "step": 2060 + }, + { + "epoch": 4.6, + "grad_norm": 0.9493570327758789, + "learning_rate": 0.0002, + "loss": 1.3557, + "step": 2070 + }, + { + "epoch": 4.622222222222222, + "grad_norm": 0.7198925018310547, + "learning_rate": 0.0002, + "loss": 1.438, + "step": 2080 + }, + { + "epoch": 4.644444444444445, + "grad_norm": 0.7521472573280334, + "learning_rate": 0.0002, + "loss": 1.3892, + "step": 2090 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.766718327999115, + "learning_rate": 0.0002, + "loss": 1.3833, + "step": 2100 + }, + { + "epoch": 4.688888888888889, + "grad_norm": 0.9162390232086182, + "learning_rate": 0.0002, + "loss": 1.3541, + "step": 2110 + }, + { + "epoch": 4.711111111111111, + "grad_norm": 0.8980328440666199, + "learning_rate": 0.0002, + "loss": 1.4603, + "step": 2120 + }, + { + "epoch": 4.733333333333333, + "grad_norm": 0.8109711408615112, + "learning_rate": 0.0002, + "loss": 1.4043, + "step": 2130 + }, + { + "epoch": 4.7555555555555555, + "grad_norm": 0.7372606992721558, + "learning_rate": 0.0002, + "loss": 1.373, + "step": 2140 + }, + { + "epoch": 4.777777777777778, + "grad_norm": 0.7527457475662231, + "learning_rate": 0.0002, + "loss": 1.4439, + "step": 2150 + }, + { + "epoch": 4.8, + "grad_norm": 1.0380001068115234, + "learning_rate": 0.0002, + "loss": 1.2999, + "step": 2160 + }, + { + "epoch": 4.822222222222222, + "grad_norm": 0.7166368365287781, + "learning_rate": 0.0002, + "loss": 1.3562, + "step": 2170 + }, + { + "epoch": 4.844444444444444, + "grad_norm": 0.784548282623291, + "learning_rate": 0.0002, + "loss": 1.3917, + "step": 2180 + }, + { + "epoch": 4.866666666666667, + "grad_norm": 0.7771317958831787, + "learning_rate": 0.0002, + "loss": 1.3376, + "step": 2190 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.7710300087928772, + "learning_rate": 0.0002, + "loss": 1.3315, + "step": 2200 + }, + { + "epoch": 4.911111111111111, + "grad_norm": 0.7715084552764893, + "learning_rate": 0.0002, + "loss": 1.3676, + "step": 2210 + }, + { + "epoch": 4.933333333333334, + "grad_norm": 0.7888006567955017, + "learning_rate": 0.0002, + "loss": 1.5352, + "step": 2220 + }, + { + "epoch": 4.955555555555556, + "grad_norm": 0.800684928894043, + "learning_rate": 0.0002, + "loss": 1.4139, + "step": 2230 + }, + { + "epoch": 4.977777777777778, + "grad_norm": 0.7710039019584656, + "learning_rate": 0.0002, + "loss": 1.4343, + "step": 2240 + }, + { + "epoch": 5.0, + "grad_norm": 0.8617033958435059, + "learning_rate": 0.0002, + "loss": 1.3501, + "step": 2250 + }, + { + "epoch": 5.0, + "eval_loss": 1.9718151092529297, + "eval_runtime": 38.8999, + "eval_samples_per_second": 13.239, + "eval_steps_per_second": 1.671, + "step": 2250 + }, + { + "epoch": 5.022222222222222, + "grad_norm": 1.07399582862854, + "learning_rate": 0.0002, + "loss": 1.19, + "step": 2260 + }, + { + "epoch": 5.044444444444444, + "grad_norm": 0.6598460674285889, + "learning_rate": 0.0002, + "loss": 1.2299, + "step": 2270 + }, + { + "epoch": 5.066666666666666, + "grad_norm": 1.1039506196975708, + "learning_rate": 0.0002, + "loss": 1.2333, + "step": 2280 + }, + { + "epoch": 5.088888888888889, + "grad_norm": 1.0624054670333862, + "learning_rate": 0.0002, + "loss": 1.2412, + "step": 2290 + }, + { + "epoch": 5.111111111111111, + "grad_norm": 0.849583625793457, + "learning_rate": 0.0002, + "loss": 1.184, + "step": 2300 + }, + { + "epoch": 5.133333333333334, + "grad_norm": 1.0143699645996094, + "learning_rate": 0.0002, + "loss": 1.1884, + "step": 2310 + }, + { + "epoch": 5.155555555555556, + "grad_norm": 0.8990702629089355, + "learning_rate": 0.0002, + "loss": 1.2133, + "step": 2320 + }, + { + "epoch": 5.177777777777778, + "grad_norm": 0.9822764992713928, + "learning_rate": 0.0002, + "loss": 1.2091, + "step": 2330 + }, + { + "epoch": 5.2, + "grad_norm": 0.9632459282875061, + "learning_rate": 0.0002, + "loss": 1.1775, + "step": 2340 + }, + { + "epoch": 5.222222222222222, + "grad_norm": 1.0897903442382812, + "learning_rate": 0.0002, + "loss": 1.1821, + "step": 2350 + }, + { + "epoch": 5.2444444444444445, + "grad_norm": 1.155950665473938, + "learning_rate": 0.0002, + "loss": 1.2976, + "step": 2360 + }, + { + "epoch": 5.266666666666667, + "grad_norm": 1.0566821098327637, + "learning_rate": 0.0002, + "loss": 1.1662, + "step": 2370 + }, + { + "epoch": 5.288888888888889, + "grad_norm": 1.191604733467102, + "learning_rate": 0.0002, + "loss": 1.2809, + "step": 2380 + }, + { + "epoch": 5.311111111111111, + "grad_norm": 0.852453887462616, + "learning_rate": 0.0002, + "loss": 1.2431, + "step": 2390 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.9649669528007507, + "learning_rate": 0.0002, + "loss": 1.2106, + "step": 2400 + }, + { + "epoch": 5.355555555555555, + "grad_norm": 1.0731003284454346, + "learning_rate": 0.0002, + "loss": 1.2433, + "step": 2410 + }, + { + "epoch": 5.377777777777778, + "grad_norm": 0.9628495573997498, + "learning_rate": 0.0002, + "loss": 1.1737, + "step": 2420 + }, + { + "epoch": 5.4, + "grad_norm": 0.9268819093704224, + "learning_rate": 0.0002, + "loss": 1.3166, + "step": 2430 + }, + { + "epoch": 5.4222222222222225, + "grad_norm": 1.1104000806808472, + "learning_rate": 0.0002, + "loss": 1.2114, + "step": 2440 + }, + { + "epoch": 5.444444444444445, + "grad_norm": 1.0439373254776, + "learning_rate": 0.0002, + "loss": 1.2151, + "step": 2450 + }, + { + "epoch": 5.466666666666667, + "grad_norm": 1.0366657972335815, + "learning_rate": 0.0002, + "loss": 1.2458, + "step": 2460 + }, + { + "epoch": 5.488888888888889, + "grad_norm": 1.0604808330535889, + "learning_rate": 0.0002, + "loss": 1.2021, + "step": 2470 + }, + { + "epoch": 5.511111111111111, + "grad_norm": 0.8845253586769104, + "learning_rate": 0.0002, + "loss": 1.2188, + "step": 2480 + }, + { + "epoch": 5.533333333333333, + "grad_norm": 0.8200256824493408, + "learning_rate": 0.0002, + "loss": 1.2296, + "step": 2490 + }, + { + "epoch": 5.555555555555555, + "grad_norm": 0.9628723859786987, + "learning_rate": 0.0002, + "loss": 1.2632, + "step": 2500 + }, + { + "epoch": 5.5777777777777775, + "grad_norm": 1.0758650302886963, + "learning_rate": 0.0002, + "loss": 1.2723, + "step": 2510 + }, + { + "epoch": 5.6, + "grad_norm": 1.0113487243652344, + "learning_rate": 0.0002, + "loss": 1.2298, + "step": 2520 + }, + { + "epoch": 5.622222222222222, + "grad_norm": 1.260536551475525, + "learning_rate": 0.0002, + "loss": 1.2226, + "step": 2530 + }, + { + "epoch": 5.644444444444445, + "grad_norm": 0.9229527115821838, + "learning_rate": 0.0002, + "loss": 1.227, + "step": 2540 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 0.9378697276115417, + "learning_rate": 0.0002, + "loss": 1.2223, + "step": 2550 + }, + { + "epoch": 5.688888888888889, + "grad_norm": 1.0404350757598877, + "learning_rate": 0.0002, + "loss": 1.2759, + "step": 2560 + }, + { + "epoch": 5.711111111111111, + "grad_norm": 1.1879961490631104, + "learning_rate": 0.0002, + "loss": 1.2132, + "step": 2570 + }, + { + "epoch": 5.733333333333333, + "grad_norm": 0.8881482481956482, + "learning_rate": 0.0002, + "loss": 1.2181, + "step": 2580 + }, + { + "epoch": 5.7555555555555555, + "grad_norm": 1.1428065299987793, + "learning_rate": 0.0002, + "loss": 1.2419, + "step": 2590 + }, + { + "epoch": 5.777777777777778, + "grad_norm": 0.8970609903335571, + "learning_rate": 0.0002, + "loss": 1.2682, + "step": 2600 + }, + { + "epoch": 5.8, + "grad_norm": 1.2084497213363647, + "learning_rate": 0.0002, + "loss": 1.2285, + "step": 2610 + }, + { + "epoch": 5.822222222222222, + "grad_norm": 1.04214608669281, + "learning_rate": 0.0002, + "loss": 1.2004, + "step": 2620 + }, + { + "epoch": 5.844444444444444, + "grad_norm": 1.0671849250793457, + "learning_rate": 0.0002, + "loss": 1.2388, + "step": 2630 + }, + { + "epoch": 5.866666666666667, + "grad_norm": 1.009602427482605, + "learning_rate": 0.0002, + "loss": 1.1714, + "step": 2640 + }, + { + "epoch": 5.888888888888889, + "grad_norm": 0.9787904024124146, + "learning_rate": 0.0002, + "loss": 1.2292, + "step": 2650 + }, + { + "epoch": 5.911111111111111, + "grad_norm": 1.0043761730194092, + "learning_rate": 0.0002, + "loss": 1.2404, + "step": 2660 + }, + { + "epoch": 5.933333333333334, + "grad_norm": 0.9855443239212036, + "learning_rate": 0.0002, + "loss": 1.2712, + "step": 2670 + }, + { + "epoch": 5.955555555555556, + "grad_norm": 1.1488507986068726, + "learning_rate": 0.0002, + "loss": 1.3112, + "step": 2680 + }, + { + "epoch": 5.977777777777778, + "grad_norm": 0.9939966797828674, + "learning_rate": 0.0002, + "loss": 1.2576, + "step": 2690 + }, + { + "epoch": 6.0, + "grad_norm": 1.0444952249526978, + "learning_rate": 0.0002, + "loss": 1.2847, + "step": 2700 + }, + { + "epoch": 6.0, + "eval_loss": 2.0881619453430176, + "eval_runtime": 39.6891, + "eval_samples_per_second": 12.976, + "eval_steps_per_second": 1.638, + "step": 2700 + }, + { + "epoch": 6.022222222222222, + "grad_norm": 1.3728636503219604, + "learning_rate": 0.0002, + "loss": 1.0764, + "step": 2710 + }, + { + "epoch": 6.044444444444444, + "grad_norm": 1.06633460521698, + "learning_rate": 0.0002, + "loss": 1.0778, + "step": 2720 + }, + { + "epoch": 6.066666666666666, + "grad_norm": 1.2068440914154053, + "learning_rate": 0.0002, + "loss": 1.0181, + "step": 2730 + }, + { + "epoch": 6.088888888888889, + "grad_norm": 1.248744010925293, + "learning_rate": 0.0002, + "loss": 1.0225, + "step": 2740 + }, + { + "epoch": 6.111111111111111, + "grad_norm": 1.1814687252044678, + "learning_rate": 0.0002, + "loss": 1.0885, + "step": 2750 + }, + { + "epoch": 6.133333333333334, + "grad_norm": 1.2335790395736694, + "learning_rate": 0.0002, + "loss": 0.973, + "step": 2760 + }, + { + "epoch": 6.155555555555556, + "grad_norm": 1.0661171674728394, + "learning_rate": 0.0002, + "loss": 1.0193, + "step": 2770 + }, + { + "epoch": 6.177777777777778, + "grad_norm": 1.345876932144165, + "learning_rate": 0.0002, + "loss": 1.0496, + "step": 2780 + }, + { + "epoch": 6.2, + "grad_norm": 1.2426252365112305, + "learning_rate": 0.0002, + "loss": 1.0252, + "step": 2790 + }, + { + "epoch": 6.222222222222222, + "grad_norm": 1.1970592737197876, + "learning_rate": 0.0002, + "loss": 1.0075, + "step": 2800 + }, + { + "epoch": 6.2444444444444445, + "grad_norm": 1.2484612464904785, + "learning_rate": 0.0002, + "loss": 1.1016, + "step": 2810 + }, + { + "epoch": 6.266666666666667, + "grad_norm": 1.2115106582641602, + "learning_rate": 0.0002, + "loss": 1.0032, + "step": 2820 + }, + { + "epoch": 6.288888888888889, + "grad_norm": 1.0024933815002441, + "learning_rate": 0.0002, + "loss": 1.0721, + "step": 2830 + }, + { + "epoch": 6.311111111111111, + "grad_norm": 1.1508114337921143, + "learning_rate": 0.0002, + "loss": 1.0705, + "step": 2840 + }, + { + "epoch": 6.333333333333333, + "grad_norm": 1.1686254739761353, + "learning_rate": 0.0002, + "loss": 1.0632, + "step": 2850 + }, + { + "epoch": 6.355555555555555, + "grad_norm": 1.2702640295028687, + "learning_rate": 0.0002, + "loss": 1.1031, + "step": 2860 + }, + { + "epoch": 6.377777777777778, + "grad_norm": 1.3344615697860718, + "learning_rate": 0.0002, + "loss": 1.1033, + "step": 2870 + }, + { + "epoch": 6.4, + "grad_norm": 1.27545964717865, + "learning_rate": 0.0002, + "loss": 1.1105, + "step": 2880 + }, + { + "epoch": 6.4222222222222225, + "grad_norm": 1.2365739345550537, + "learning_rate": 0.0002, + "loss": 1.0353, + "step": 2890 + }, + { + "epoch": 6.444444444444445, + "grad_norm": 1.3821545839309692, + "learning_rate": 0.0002, + "loss": 1.046, + "step": 2900 + }, + { + "epoch": 6.466666666666667, + "grad_norm": 1.1889359951019287, + "learning_rate": 0.0002, + "loss": 1.0643, + "step": 2910 + }, + { + "epoch": 6.488888888888889, + "grad_norm": 1.1324981451034546, + "learning_rate": 0.0002, + "loss": 1.0173, + "step": 2920 + }, + { + "epoch": 6.511111111111111, + "grad_norm": 1.154468297958374, + "learning_rate": 0.0002, + "loss": 1.0474, + "step": 2930 + }, + { + "epoch": 6.533333333333333, + "grad_norm": 1.211300253868103, + "learning_rate": 0.0002, + "loss": 1.1323, + "step": 2940 + }, + { + "epoch": 6.555555555555555, + "grad_norm": 1.3322433233261108, + "learning_rate": 0.0002, + "loss": 1.0901, + "step": 2950 + }, + { + "epoch": 6.5777777777777775, + "grad_norm": 1.2570568323135376, + "learning_rate": 0.0002, + "loss": 1.0636, + "step": 2960 + }, + { + "epoch": 6.6, + "grad_norm": 1.2037729024887085, + "learning_rate": 0.0002, + "loss": 1.1093, + "step": 2970 + }, + { + "epoch": 6.622222222222222, + "grad_norm": 1.2894154787063599, + "learning_rate": 0.0002, + "loss": 1.0355, + "step": 2980 + }, + { + "epoch": 6.644444444444445, + "grad_norm": 1.1682062149047852, + "learning_rate": 0.0002, + "loss": 0.9846, + "step": 2990 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.6112759113311768, + "learning_rate": 0.0002, + "loss": 1.1292, + "step": 3000 + }, + { + "epoch": 6.688888888888889, + "grad_norm": 1.227586269378662, + "learning_rate": 0.0002, + "loss": 1.1831, + "step": 3010 + }, + { + "epoch": 6.711111111111111, + "grad_norm": 1.2558735609054565, + "learning_rate": 0.0002, + "loss": 1.1656, + "step": 3020 + }, + { + "epoch": 6.733333333333333, + "grad_norm": 1.2739307880401611, + "learning_rate": 0.0002, + "loss": 1.1151, + "step": 3030 + }, + { + "epoch": 6.7555555555555555, + "grad_norm": 1.2761014699935913, + "learning_rate": 0.0002, + "loss": 1.0957, + "step": 3040 + }, + { + "epoch": 6.777777777777778, + "grad_norm": 1.308904767036438, + "learning_rate": 0.0002, + "loss": 1.0863, + "step": 3050 + }, + { + "epoch": 6.8, + "grad_norm": 1.6273704767227173, + "learning_rate": 0.0002, + "loss": 1.1072, + "step": 3060 + }, + { + "epoch": 6.822222222222222, + "grad_norm": 1.3006200790405273, + "learning_rate": 0.0002, + "loss": 1.0982, + "step": 3070 + }, + { + "epoch": 6.844444444444444, + "grad_norm": 1.2942757606506348, + "learning_rate": 0.0002, + "loss": 1.091, + "step": 3080 + }, + { + "epoch": 6.866666666666667, + "grad_norm": 1.3074650764465332, + "learning_rate": 0.0002, + "loss": 1.0371, + "step": 3090 + }, + { + "epoch": 6.888888888888889, + "grad_norm": 1.321811556816101, + "learning_rate": 0.0002, + "loss": 1.0782, + "step": 3100 + }, + { + "epoch": 6.911111111111111, + "grad_norm": 1.0926110744476318, + "learning_rate": 0.0002, + "loss": 1.1375, + "step": 3110 + }, + { + "epoch": 6.933333333333334, + "grad_norm": 1.3839191198349, + "learning_rate": 0.0002, + "loss": 1.0966, + "step": 3120 + }, + { + "epoch": 6.955555555555556, + "grad_norm": 1.084396481513977, + "learning_rate": 0.0002, + "loss": 1.111, + "step": 3130 + }, + { + "epoch": 6.977777777777778, + "grad_norm": 1.262983798980713, + "learning_rate": 0.0002, + "loss": 1.0947, + "step": 3140 + }, + { + "epoch": 7.0, + "grad_norm": 1.1751209497451782, + "learning_rate": 0.0002, + "loss": 1.099, + "step": 3150 + }, + { + "epoch": 7.0, + "eval_loss": 2.2316300868988037, + "eval_runtime": 81.7348, + "eval_samples_per_second": 6.301, + "eval_steps_per_second": 0.795, + "step": 3150 + } + ], + "logging_steps": 10, + "max_steps": 3600, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.457749468643328e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..08bcb50b9f534803ed2a4a4be696b74ae373982a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3150/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:333a3e3301276a753d48f5fabf4c169b8961c6e6ef7b7a7f931ba463d72983a4 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..371c755b4c79dc02d791c5fa8d7b45fe133ace94 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42e21562d073e8f0d254cbf703fed91c382cbd561560ec4075e679987a61792e +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0fff7b7a4ccbca9826adcaa1d4df742176095027 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:289fed7dbe2c80dfe02faaa03404af10eed2da741fda8617a9ca38a036740cb9 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..eabb46c999001644e3324175d15eca60d22ecced --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01597fb94a72ec49a93caac109aafb21c8217dedba6aee769cf85dbb247288ef +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..97b7783071e50eedc87fd81d2399f7ff270fb0cc --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00b89c1fc038733b108cf7127176d971b616fd9df15772969e3fc0a788bf7fd1 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..175ba188272c150e06fe5626caad448dba4b6fcf --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/trainer_state.json @@ -0,0 +1,2617 @@ +{ + "best_metric": 1.8250652551651, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900", + "epoch": 8.0, + "eval_steps": 10, + "global_step": 3600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.022222222222222223, + "grad_norm": 0.5109436511993408, + "learning_rate": 0.0002, + "loss": 2.5874, + "step": 10 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 0.4870035946369171, + "learning_rate": 0.0002, + "loss": 2.2952, + "step": 20 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.535464882850647, + "learning_rate": 0.0002, + "loss": 2.0554, + "step": 30 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 0.49077996611595154, + "learning_rate": 0.0002, + "loss": 2.0067, + "step": 40 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.4671357572078705, + "learning_rate": 0.0002, + "loss": 2.0673, + "step": 50 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.4970313608646393, + "learning_rate": 0.0002, + "loss": 1.8751, + "step": 60 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 0.4438260495662689, + "learning_rate": 0.0002, + "loss": 1.9419, + "step": 70 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 0.5089705586433411, + "learning_rate": 0.0002, + "loss": 1.9856, + "step": 80 + }, + { + "epoch": 0.2, + "grad_norm": 0.4645078182220459, + "learning_rate": 0.0002, + "loss": 1.8805, + "step": 90 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.46095192432403564, + "learning_rate": 0.0002, + "loss": 1.807, + "step": 100 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 0.43338075280189514, + "learning_rate": 0.0002, + "loss": 1.891, + "step": 110 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.4433900713920593, + "learning_rate": 0.0002, + "loss": 1.84, + "step": 120 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 0.7018499970436096, + "learning_rate": 0.0002, + "loss": 1.8895, + "step": 130 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 0.37056994438171387, + "learning_rate": 0.0002, + "loss": 1.8273, + "step": 140 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.40634623169898987, + "learning_rate": 0.0002, + "loss": 1.8059, + "step": 150 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 0.41917353868484497, + "learning_rate": 0.0002, + "loss": 1.8393, + "step": 160 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 0.42392489314079285, + "learning_rate": 0.0002, + "loss": 1.8658, + "step": 170 + }, + { + "epoch": 0.4, + "grad_norm": 0.4281010627746582, + "learning_rate": 0.0002, + "loss": 1.8617, + "step": 180 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 0.38542497158050537, + "learning_rate": 0.0002, + "loss": 1.8163, + "step": 190 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.36003032326698303, + "learning_rate": 0.0002, + "loss": 1.8951, + "step": 200 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.37858229875564575, + "learning_rate": 0.0002, + "loss": 1.8611, + "step": 210 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 0.49986031651496887, + "learning_rate": 0.0002, + "loss": 1.8078, + "step": 220 + }, + { + "epoch": 0.5111111111111111, + "grad_norm": 0.3937094807624817, + "learning_rate": 0.0002, + "loss": 1.7413, + "step": 230 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.4566134512424469, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 240 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.3602476418018341, + "learning_rate": 0.0002, + "loss": 1.8303, + "step": 250 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 0.36321184039115906, + "learning_rate": 0.0002, + "loss": 1.7963, + "step": 260 + }, + { + "epoch": 0.6, + "grad_norm": 0.3808199167251587, + "learning_rate": 0.0002, + "loss": 1.9055, + "step": 270 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 0.38910621404647827, + "learning_rate": 0.0002, + "loss": 1.7916, + "step": 280 + }, + { + "epoch": 0.6444444444444445, + "grad_norm": 0.31913551688194275, + "learning_rate": 0.0002, + "loss": 1.7958, + "step": 290 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.34734025597572327, + "learning_rate": 0.0002, + "loss": 1.7468, + "step": 300 + }, + { + "epoch": 0.6888888888888889, + "grad_norm": 0.3517725467681885, + "learning_rate": 0.0002, + "loss": 1.8092, + "step": 310 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 0.3804526627063751, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 320 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.5592505931854248, + "learning_rate": 0.0002, + "loss": 1.8131, + "step": 330 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 0.36154472827911377, + "learning_rate": 0.0002, + "loss": 1.8827, + "step": 340 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.43970227241516113, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 350 + }, + { + "epoch": 0.8, + "grad_norm": 0.3525223731994629, + "learning_rate": 0.0002, + "loss": 1.8333, + "step": 360 + }, + { + "epoch": 0.8222222222222222, + "grad_norm": 0.3706997036933899, + "learning_rate": 0.0002, + "loss": 1.7802, + "step": 370 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 0.34138166904449463, + "learning_rate": 0.0002, + "loss": 1.7937, + "step": 380 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.4090622365474701, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 390 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.3729974031448364, + "learning_rate": 0.0002, + "loss": 1.7685, + "step": 400 + }, + { + "epoch": 0.9111111111111111, + "grad_norm": 0.3742152452468872, + "learning_rate": 0.0002, + "loss": 1.836, + "step": 410 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.37685129046440125, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 420 + }, + { + "epoch": 0.9555555555555556, + "grad_norm": 0.34339913725852966, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 430 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 0.31139856576919556, + "learning_rate": 0.0002, + "loss": 1.7651, + "step": 440 + }, + { + "epoch": 1.0, + "grad_norm": 0.3577502965927124, + "learning_rate": 0.0002, + "loss": 1.8471, + "step": 450 + }, + { + "epoch": 1.0, + "eval_loss": 1.8310279846191406, + "eval_runtime": 38.8374, + "eval_samples_per_second": 13.26, + "eval_steps_per_second": 1.674, + "step": 450 + }, + { + "epoch": 1.0222222222222221, + "grad_norm": 0.3257788419723511, + "learning_rate": 0.0002, + "loss": 1.7741, + "step": 460 + }, + { + "epoch": 1.0444444444444445, + "grad_norm": 0.3878970146179199, + "learning_rate": 0.0002, + "loss": 1.8031, + "step": 470 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 0.364427387714386, + "learning_rate": 0.0002, + "loss": 1.701, + "step": 480 + }, + { + "epoch": 1.0888888888888888, + "grad_norm": 0.3374682664871216, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 490 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.35822123289108276, + "learning_rate": 0.0002, + "loss": 1.7308, + "step": 500 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 0.3748345673084259, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 510 + }, + { + "epoch": 1.1555555555555554, + "grad_norm": 0.3422437012195587, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 520 + }, + { + "epoch": 1.1777777777777778, + "grad_norm": 0.4289326071739197, + "learning_rate": 0.0002, + "loss": 1.7649, + "step": 530 + }, + { + "epoch": 1.2, + "grad_norm": 0.3706769645214081, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 540 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.4024733603000641, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 550 + }, + { + "epoch": 1.2444444444444445, + "grad_norm": 0.3960128128528595, + "learning_rate": 0.0002, + "loss": 1.7641, + "step": 560 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 0.38222864270210266, + "learning_rate": 0.0002, + "loss": 1.755, + "step": 570 + }, + { + "epoch": 1.2888888888888888, + "grad_norm": 0.4073713421821594, + "learning_rate": 0.0002, + "loss": 1.735, + "step": 580 + }, + { + "epoch": 1.3111111111111111, + "grad_norm": 0.3875499963760376, + "learning_rate": 0.0002, + "loss": 1.7183, + "step": 590 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.39740806818008423, + "learning_rate": 0.0002, + "loss": 1.7492, + "step": 600 + }, + { + "epoch": 1.3555555555555556, + "grad_norm": 0.38432490825653076, + "learning_rate": 0.0002, + "loss": 1.8478, + "step": 610 + }, + { + "epoch": 1.3777777777777778, + "grad_norm": 0.402729868888855, + "learning_rate": 0.0002, + "loss": 1.7327, + "step": 620 + }, + { + "epoch": 1.4, + "grad_norm": 0.36683231592178345, + "learning_rate": 0.0002, + "loss": 1.6634, + "step": 630 + }, + { + "epoch": 1.4222222222222223, + "grad_norm": 0.3883286714553833, + "learning_rate": 0.0002, + "loss": 1.8059, + "step": 640 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.4087409973144531, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 650 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 0.4042017459869385, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 660 + }, + { + "epoch": 1.488888888888889, + "grad_norm": 0.40149256587028503, + "learning_rate": 0.0002, + "loss": 1.7466, + "step": 670 + }, + { + "epoch": 1.511111111111111, + "grad_norm": 0.45146510004997253, + "learning_rate": 0.0002, + "loss": 1.7398, + "step": 680 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.4098089039325714, + "learning_rate": 0.0002, + "loss": 1.6923, + "step": 690 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.4181336760520935, + "learning_rate": 0.0002, + "loss": 1.6847, + "step": 700 + }, + { + "epoch": 1.5777777777777777, + "grad_norm": 1.3722974061965942, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 710 + }, + { + "epoch": 1.6, + "grad_norm": 0.3965230882167816, + "learning_rate": 0.0002, + "loss": 1.7424, + "step": 720 + }, + { + "epoch": 1.6222222222222222, + "grad_norm": 0.3842000663280487, + "learning_rate": 0.0002, + "loss": 1.7782, + "step": 730 + }, + { + "epoch": 1.6444444444444444, + "grad_norm": 0.3603688180446625, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 740 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.39973509311676025, + "learning_rate": 0.0002, + "loss": 1.7476, + "step": 750 + }, + { + "epoch": 1.6888888888888889, + "grad_norm": 0.3687385618686676, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 760 + }, + { + "epoch": 1.7111111111111112, + "grad_norm": 0.4267722964286804, + "learning_rate": 0.0002, + "loss": 1.7987, + "step": 770 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 0.41301295161247253, + "learning_rate": 0.0002, + "loss": 1.8041, + "step": 780 + }, + { + "epoch": 1.7555555555555555, + "grad_norm": 0.3945430517196655, + "learning_rate": 0.0002, + "loss": 1.7284, + "step": 790 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.4037930965423584, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 800 + }, + { + "epoch": 1.8, + "grad_norm": 0.406893253326416, + "learning_rate": 0.0002, + "loss": 1.6874, + "step": 810 + }, + { + "epoch": 1.8222222222222222, + "grad_norm": 0.4600457549095154, + "learning_rate": 0.0002, + "loss": 1.7628, + "step": 820 + }, + { + "epoch": 1.8444444444444446, + "grad_norm": 0.4195384085178375, + "learning_rate": 0.0002, + "loss": 1.8222, + "step": 830 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 0.3854130506515503, + "learning_rate": 0.0002, + "loss": 1.7123, + "step": 840 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.38279038667678833, + "learning_rate": 0.0002, + "loss": 1.7227, + "step": 850 + }, + { + "epoch": 1.911111111111111, + "grad_norm": 0.38249439001083374, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 860 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 0.42977792024612427, + "learning_rate": 0.0002, + "loss": 1.8058, + "step": 870 + }, + { + "epoch": 1.9555555555555557, + "grad_norm": 0.4109351933002472, + "learning_rate": 0.0002, + "loss": 1.713, + "step": 880 + }, + { + "epoch": 1.9777777777777779, + "grad_norm": 0.3734486699104309, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 890 + }, + { + "epoch": 2.0, + "grad_norm": 0.3603087067604065, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 900 + }, + { + "epoch": 2.0, + "eval_loss": 1.8250652551651, + "eval_runtime": 38.8657, + "eval_samples_per_second": 13.251, + "eval_steps_per_second": 1.672, + "step": 900 + }, + { + "epoch": 2.022222222222222, + "grad_norm": 0.4014144241809845, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 910 + }, + { + "epoch": 2.0444444444444443, + "grad_norm": 0.4338063597679138, + "learning_rate": 0.0002, + "loss": 1.6604, + "step": 920 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 0.3693605065345764, + "learning_rate": 0.0002, + "loss": 1.6354, + "step": 930 + }, + { + "epoch": 2.088888888888889, + "grad_norm": 0.4040255844593048, + "learning_rate": 0.0002, + "loss": 1.6186, + "step": 940 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.43481820821762085, + "learning_rate": 0.0002, + "loss": 1.5976, + "step": 950 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 0.41632869839668274, + "learning_rate": 0.0002, + "loss": 1.6081, + "step": 960 + }, + { + "epoch": 2.1555555555555554, + "grad_norm": 0.4633755385875702, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 970 + }, + { + "epoch": 2.1777777777777776, + "grad_norm": 0.43926581740379333, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 980 + }, + { + "epoch": 2.2, + "grad_norm": 0.4757233262062073, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 990 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.5010586977005005, + "learning_rate": 0.0002, + "loss": 1.648, + "step": 1000 + }, + { + "epoch": 2.2444444444444445, + "grad_norm": 0.44900986552238464, + "learning_rate": 0.0002, + "loss": 1.6454, + "step": 1010 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 0.41274750232696533, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 1020 + }, + { + "epoch": 2.2888888888888888, + "grad_norm": 0.44672393798828125, + "learning_rate": 0.0002, + "loss": 1.6592, + "step": 1030 + }, + { + "epoch": 2.311111111111111, + "grad_norm": 0.4826269745826721, + "learning_rate": 0.0002, + "loss": 1.6349, + "step": 1040 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.4650685489177704, + "learning_rate": 0.0002, + "loss": 1.5988, + "step": 1050 + }, + { + "epoch": 2.3555555555555556, + "grad_norm": 0.42507848143577576, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 1060 + }, + { + "epoch": 2.3777777777777778, + "grad_norm": 0.45653030276298523, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 1070 + }, + { + "epoch": 2.4, + "grad_norm": 0.44534122943878174, + "learning_rate": 0.0002, + "loss": 1.6469, + "step": 1080 + }, + { + "epoch": 2.422222222222222, + "grad_norm": 0.4241289794445038, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 1090 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.5004808306694031, + "learning_rate": 0.0002, + "loss": 1.6935, + "step": 1100 + }, + { + "epoch": 2.466666666666667, + "grad_norm": 0.41425490379333496, + "learning_rate": 0.0002, + "loss": 1.6833, + "step": 1110 + }, + { + "epoch": 2.488888888888889, + "grad_norm": 0.44362279772758484, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 1120 + }, + { + "epoch": 2.511111111111111, + "grad_norm": 0.5530985593795776, + "learning_rate": 0.0002, + "loss": 1.6394, + "step": 1130 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 0.4290637969970703, + "learning_rate": 0.0002, + "loss": 1.64, + "step": 1140 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.4957487881183624, + "learning_rate": 0.0002, + "loss": 1.76, + "step": 1150 + }, + { + "epoch": 2.5777777777777775, + "grad_norm": 0.5082747340202332, + "learning_rate": 0.0002, + "loss": 1.613, + "step": 1160 + }, + { + "epoch": 2.6, + "grad_norm": 0.478722482919693, + "learning_rate": 0.0002, + "loss": 1.6702, + "step": 1170 + }, + { + "epoch": 2.6222222222222222, + "grad_norm": 0.436454176902771, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 1180 + }, + { + "epoch": 2.6444444444444444, + "grad_norm": 0.4905032515525818, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 1190 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.4815700054168701, + "learning_rate": 0.0002, + "loss": 1.6216, + "step": 1200 + }, + { + "epoch": 2.688888888888889, + "grad_norm": 0.3965534269809723, + "learning_rate": 0.0002, + "loss": 1.6994, + "step": 1210 + }, + { + "epoch": 2.7111111111111112, + "grad_norm": 0.43282169103622437, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 1220 + }, + { + "epoch": 2.7333333333333334, + "grad_norm": 0.45512479543685913, + "learning_rate": 0.0002, + "loss": 1.6455, + "step": 1230 + }, + { + "epoch": 2.7555555555555555, + "grad_norm": 0.44370076060295105, + "learning_rate": 0.0002, + "loss": 1.6444, + "step": 1240 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.4750686287879944, + "learning_rate": 0.0002, + "loss": 1.7106, + "step": 1250 + }, + { + "epoch": 2.8, + "grad_norm": 0.41953766345977783, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 1260 + }, + { + "epoch": 2.822222222222222, + "grad_norm": 0.4887140095233917, + "learning_rate": 0.0002, + "loss": 1.6203, + "step": 1270 + }, + { + "epoch": 2.8444444444444446, + "grad_norm": 0.46718958020210266, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 1280 + }, + { + "epoch": 2.8666666666666667, + "grad_norm": 0.48510900139808655, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 1290 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.4504084289073944, + "learning_rate": 0.0002, + "loss": 1.6825, + "step": 1300 + }, + { + "epoch": 2.911111111111111, + "grad_norm": 0.42119622230529785, + "learning_rate": 0.0002, + "loss": 1.7453, + "step": 1310 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 0.4763694107532501, + "learning_rate": 0.0002, + "loss": 1.6662, + "step": 1320 + }, + { + "epoch": 2.9555555555555557, + "grad_norm": 0.422810822725296, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 1330 + }, + { + "epoch": 2.977777777777778, + "grad_norm": 0.4768871068954468, + "learning_rate": 0.0002, + "loss": 1.6578, + "step": 1340 + }, + { + "epoch": 3.0, + "grad_norm": 0.48259881138801575, + "learning_rate": 0.0002, + "loss": 1.6167, + "step": 1350 + }, + { + "epoch": 3.0, + "eval_loss": 1.8452560901641846, + "eval_runtime": 38.8621, + "eval_samples_per_second": 13.252, + "eval_steps_per_second": 1.673, + "step": 1350 + }, + { + "epoch": 3.022222222222222, + "grad_norm": 0.6933313012123108, + "learning_rate": 0.0002, + "loss": 1.5351, + "step": 1360 + }, + { + "epoch": 3.0444444444444443, + "grad_norm": 0.5870710611343384, + "learning_rate": 0.0002, + "loss": 1.5542, + "step": 1370 + }, + { + "epoch": 3.066666666666667, + "grad_norm": 0.602210283279419, + "learning_rate": 0.0002, + "loss": 1.511, + "step": 1380 + }, + { + "epoch": 3.088888888888889, + "grad_norm": 0.6461787819862366, + "learning_rate": 0.0002, + "loss": 1.5272, + "step": 1390 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 0.5839587450027466, + "learning_rate": 0.0002, + "loss": 1.4813, + "step": 1400 + }, + { + "epoch": 3.1333333333333333, + "grad_norm": 0.5757876038551331, + "learning_rate": 0.0002, + "loss": 1.505, + "step": 1410 + }, + { + "epoch": 3.1555555555555554, + "grad_norm": 0.5862616300582886, + "learning_rate": 0.0002, + "loss": 1.4963, + "step": 1420 + }, + { + "epoch": 3.1777777777777776, + "grad_norm": 0.6103630065917969, + "learning_rate": 0.0002, + "loss": 1.5144, + "step": 1430 + }, + { + "epoch": 3.2, + "grad_norm": 0.9309254884719849, + "learning_rate": 0.0002, + "loss": 1.5406, + "step": 1440 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.5360018014907837, + "learning_rate": 0.0002, + "loss": 1.487, + "step": 1450 + }, + { + "epoch": 3.2444444444444445, + "grad_norm": 0.5448758602142334, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 1460 + }, + { + "epoch": 3.2666666666666666, + "grad_norm": 0.5973812341690063, + "learning_rate": 0.0002, + "loss": 1.5595, + "step": 1470 + }, + { + "epoch": 3.2888888888888888, + "grad_norm": 0.6245622038841248, + "learning_rate": 0.0002, + "loss": 1.5223, + "step": 1480 + }, + { + "epoch": 3.311111111111111, + "grad_norm": 0.6533768773078918, + "learning_rate": 0.0002, + "loss": 1.4795, + "step": 1490 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.5765811204910278, + "learning_rate": 0.0002, + "loss": 1.5562, + "step": 1500 + }, + { + "epoch": 3.3555555555555556, + "grad_norm": 0.591395378112793, + "learning_rate": 0.0002, + "loss": 1.5405, + "step": 1510 + }, + { + "epoch": 3.3777777777777778, + "grad_norm": 0.5842425227165222, + "learning_rate": 0.0002, + "loss": 1.5658, + "step": 1520 + }, + { + "epoch": 3.4, + "grad_norm": 0.5731365084648132, + "learning_rate": 0.0002, + "loss": 1.5065, + "step": 1530 + }, + { + "epoch": 3.422222222222222, + "grad_norm": 0.5841306447982788, + "learning_rate": 0.0002, + "loss": 1.5438, + "step": 1540 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.6503536701202393, + "learning_rate": 0.0002, + "loss": 1.4922, + "step": 1550 + }, + { + "epoch": 3.466666666666667, + "grad_norm": 0.6170967221260071, + "learning_rate": 0.0002, + "loss": 1.5493, + "step": 1560 + }, + { + "epoch": 3.488888888888889, + "grad_norm": 0.5576487183570862, + "learning_rate": 0.0002, + "loss": 1.5098, + "step": 1570 + }, + { + "epoch": 3.511111111111111, + "grad_norm": 0.7082911133766174, + "learning_rate": 0.0002, + "loss": 1.472, + "step": 1580 + }, + { + "epoch": 3.533333333333333, + "grad_norm": 0.6159376502037048, + "learning_rate": 0.0002, + "loss": 1.5594, + "step": 1590 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 0.5972959399223328, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 1600 + }, + { + "epoch": 3.5777777777777775, + "grad_norm": 0.5787310004234314, + "learning_rate": 0.0002, + "loss": 1.4876, + "step": 1610 + }, + { + "epoch": 3.6, + "grad_norm": 0.5846341252326965, + "learning_rate": 0.0002, + "loss": 1.4887, + "step": 1620 + }, + { + "epoch": 3.6222222222222222, + "grad_norm": 0.5906197428703308, + "learning_rate": 0.0002, + "loss": 1.542, + "step": 1630 + }, + { + "epoch": 3.6444444444444444, + "grad_norm": 0.6305760145187378, + "learning_rate": 0.0002, + "loss": 1.4941, + "step": 1640 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.7448979616165161, + "learning_rate": 0.0002, + "loss": 1.4677, + "step": 1650 + }, + { + "epoch": 3.688888888888889, + "grad_norm": 0.5906165242195129, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 1660 + }, + { + "epoch": 3.7111111111111112, + "grad_norm": 0.605032742023468, + "learning_rate": 0.0002, + "loss": 1.4882, + "step": 1670 + }, + { + "epoch": 3.7333333333333334, + "grad_norm": 0.6117229461669922, + "learning_rate": 0.0002, + "loss": 1.5804, + "step": 1680 + }, + { + "epoch": 3.7555555555555555, + "grad_norm": 0.613581120967865, + "learning_rate": 0.0002, + "loss": 1.5131, + "step": 1690 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 0.6244436502456665, + "learning_rate": 0.0002, + "loss": 1.5074, + "step": 1700 + }, + { + "epoch": 3.8, + "grad_norm": 0.6236702799797058, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 1710 + }, + { + "epoch": 3.822222222222222, + "grad_norm": 0.639141857624054, + "learning_rate": 0.0002, + "loss": 1.6542, + "step": 1720 + }, + { + "epoch": 3.8444444444444446, + "grad_norm": 0.5782344937324524, + "learning_rate": 0.0002, + "loss": 1.536, + "step": 1730 + }, + { + "epoch": 3.8666666666666667, + "grad_norm": 0.5952938795089722, + "learning_rate": 0.0002, + "loss": 1.5355, + "step": 1740 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.5573042035102844, + "learning_rate": 0.0002, + "loss": 1.5205, + "step": 1750 + }, + { + "epoch": 3.911111111111111, + "grad_norm": 0.6114351749420166, + "learning_rate": 0.0002, + "loss": 1.5066, + "step": 1760 + }, + { + "epoch": 3.9333333333333336, + "grad_norm": 0.5973817110061646, + "learning_rate": 0.0002, + "loss": 1.5706, + "step": 1770 + }, + { + "epoch": 3.9555555555555557, + "grad_norm": 0.602317750453949, + "learning_rate": 0.0002, + "loss": 1.5003, + "step": 1780 + }, + { + "epoch": 3.977777777777778, + "grad_norm": 0.5965437293052673, + "learning_rate": 0.0002, + "loss": 1.5022, + "step": 1790 + }, + { + "epoch": 4.0, + "grad_norm": 0.5641552209854126, + "learning_rate": 0.0002, + "loss": 1.5031, + "step": 1800 + }, + { + "epoch": 4.0, + "eval_loss": 1.892098069190979, + "eval_runtime": 38.8755, + "eval_samples_per_second": 13.247, + "eval_steps_per_second": 1.672, + "step": 1800 + }, + { + "epoch": 4.022222222222222, + "grad_norm": 0.8302594423294067, + "learning_rate": 0.0002, + "loss": 1.3894, + "step": 1810 + }, + { + "epoch": 4.044444444444444, + "grad_norm": 0.6695230603218079, + "learning_rate": 0.0002, + "loss": 1.3727, + "step": 1820 + }, + { + "epoch": 4.066666666666666, + "grad_norm": 0.7911471128463745, + "learning_rate": 0.0002, + "loss": 1.3064, + "step": 1830 + }, + { + "epoch": 4.088888888888889, + "grad_norm": 0.7044888138771057, + "learning_rate": 0.0002, + "loss": 1.4574, + "step": 1840 + }, + { + "epoch": 4.111111111111111, + "grad_norm": 0.7057249546051025, + "learning_rate": 0.0002, + "loss": 1.3941, + "step": 1850 + }, + { + "epoch": 4.133333333333334, + "grad_norm": 0.8762815594673157, + "learning_rate": 0.0002, + "loss": 1.4052, + "step": 1860 + }, + { + "epoch": 4.155555555555556, + "grad_norm": 0.7619158029556274, + "learning_rate": 0.0002, + "loss": 1.3784, + "step": 1870 + }, + { + "epoch": 4.177777777777778, + "grad_norm": 0.7711658477783203, + "learning_rate": 0.0002, + "loss": 1.3581, + "step": 1880 + }, + { + "epoch": 4.2, + "grad_norm": 0.9732598662376404, + "learning_rate": 0.0002, + "loss": 1.3995, + "step": 1890 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 0.9070265889167786, + "learning_rate": 0.0002, + "loss": 1.3353, + "step": 1900 + }, + { + "epoch": 4.2444444444444445, + "grad_norm": 0.8274767994880676, + "learning_rate": 0.0002, + "loss": 1.3947, + "step": 1910 + }, + { + "epoch": 4.266666666666667, + "grad_norm": 0.8514227271080017, + "learning_rate": 0.0002, + "loss": 1.3392, + "step": 1920 + }, + { + "epoch": 4.288888888888889, + "grad_norm": 0.7356534600257874, + "learning_rate": 0.0002, + "loss": 1.3492, + "step": 1930 + }, + { + "epoch": 4.311111111111111, + "grad_norm": 0.8226608037948608, + "learning_rate": 0.0002, + "loss": 1.3708, + "step": 1940 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 0.8347907066345215, + "learning_rate": 0.0002, + "loss": 1.3652, + "step": 1950 + }, + { + "epoch": 4.355555555555555, + "grad_norm": 0.8509323000907898, + "learning_rate": 0.0002, + "loss": 1.3415, + "step": 1960 + }, + { + "epoch": 4.377777777777778, + "grad_norm": 0.8776063323020935, + "learning_rate": 0.0002, + "loss": 1.3796, + "step": 1970 + }, + { + "epoch": 4.4, + "grad_norm": 0.8022271990776062, + "learning_rate": 0.0002, + "loss": 1.438, + "step": 1980 + }, + { + "epoch": 4.4222222222222225, + "grad_norm": 0.7984752058982849, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 1990 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.7349720001220703, + "learning_rate": 0.0002, + "loss": 1.4214, + "step": 2000 + }, + { + "epoch": 4.466666666666667, + "grad_norm": 0.7778817415237427, + "learning_rate": 0.0002, + "loss": 1.4174, + "step": 2010 + }, + { + "epoch": 4.488888888888889, + "grad_norm": 0.9361467361450195, + "learning_rate": 0.0002, + "loss": 1.3365, + "step": 2020 + }, + { + "epoch": 4.511111111111111, + "grad_norm": 0.7839348912239075, + "learning_rate": 0.0002, + "loss": 1.4129, + "step": 2030 + }, + { + "epoch": 4.533333333333333, + "grad_norm": 0.8361981511116028, + "learning_rate": 0.0002, + "loss": 1.3761, + "step": 2040 + }, + { + "epoch": 4.555555555555555, + "grad_norm": 1.9877147674560547, + "learning_rate": 0.0002, + "loss": 1.4085, + "step": 2050 + }, + { + "epoch": 4.5777777777777775, + "grad_norm": 0.7506140470504761, + "learning_rate": 0.0002, + "loss": 1.329, + "step": 2060 + }, + { + "epoch": 4.6, + "grad_norm": 0.9493570327758789, + "learning_rate": 0.0002, + "loss": 1.3557, + "step": 2070 + }, + { + "epoch": 4.622222222222222, + "grad_norm": 0.7198925018310547, + "learning_rate": 0.0002, + "loss": 1.438, + "step": 2080 + }, + { + "epoch": 4.644444444444445, + "grad_norm": 0.7521472573280334, + "learning_rate": 0.0002, + "loss": 1.3892, + "step": 2090 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.766718327999115, + "learning_rate": 0.0002, + "loss": 1.3833, + "step": 2100 + }, + { + "epoch": 4.688888888888889, + "grad_norm": 0.9162390232086182, + "learning_rate": 0.0002, + "loss": 1.3541, + "step": 2110 + }, + { + "epoch": 4.711111111111111, + "grad_norm": 0.8980328440666199, + "learning_rate": 0.0002, + "loss": 1.4603, + "step": 2120 + }, + { + "epoch": 4.733333333333333, + "grad_norm": 0.8109711408615112, + "learning_rate": 0.0002, + "loss": 1.4043, + "step": 2130 + }, + { + "epoch": 4.7555555555555555, + "grad_norm": 0.7372606992721558, + "learning_rate": 0.0002, + "loss": 1.373, + "step": 2140 + }, + { + "epoch": 4.777777777777778, + "grad_norm": 0.7527457475662231, + "learning_rate": 0.0002, + "loss": 1.4439, + "step": 2150 + }, + { + "epoch": 4.8, + "grad_norm": 1.0380001068115234, + "learning_rate": 0.0002, + "loss": 1.2999, + "step": 2160 + }, + { + "epoch": 4.822222222222222, + "grad_norm": 0.7166368365287781, + "learning_rate": 0.0002, + "loss": 1.3562, + "step": 2170 + }, + { + "epoch": 4.844444444444444, + "grad_norm": 0.784548282623291, + "learning_rate": 0.0002, + "loss": 1.3917, + "step": 2180 + }, + { + "epoch": 4.866666666666667, + "grad_norm": 0.7771317958831787, + "learning_rate": 0.0002, + "loss": 1.3376, + "step": 2190 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.7710300087928772, + "learning_rate": 0.0002, + "loss": 1.3315, + "step": 2200 + }, + { + "epoch": 4.911111111111111, + "grad_norm": 0.7715084552764893, + "learning_rate": 0.0002, + "loss": 1.3676, + "step": 2210 + }, + { + "epoch": 4.933333333333334, + "grad_norm": 0.7888006567955017, + "learning_rate": 0.0002, + "loss": 1.5352, + "step": 2220 + }, + { + "epoch": 4.955555555555556, + "grad_norm": 0.800684928894043, + "learning_rate": 0.0002, + "loss": 1.4139, + "step": 2230 + }, + { + "epoch": 4.977777777777778, + "grad_norm": 0.7710039019584656, + "learning_rate": 0.0002, + "loss": 1.4343, + "step": 2240 + }, + { + "epoch": 5.0, + "grad_norm": 0.8617033958435059, + "learning_rate": 0.0002, + "loss": 1.3501, + "step": 2250 + }, + { + "epoch": 5.0, + "eval_loss": 1.9718151092529297, + "eval_runtime": 38.8999, + "eval_samples_per_second": 13.239, + "eval_steps_per_second": 1.671, + "step": 2250 + }, + { + "epoch": 5.022222222222222, + "grad_norm": 1.07399582862854, + "learning_rate": 0.0002, + "loss": 1.19, + "step": 2260 + }, + { + "epoch": 5.044444444444444, + "grad_norm": 0.6598460674285889, + "learning_rate": 0.0002, + "loss": 1.2299, + "step": 2270 + }, + { + "epoch": 5.066666666666666, + "grad_norm": 1.1039506196975708, + "learning_rate": 0.0002, + "loss": 1.2333, + "step": 2280 + }, + { + "epoch": 5.088888888888889, + "grad_norm": 1.0624054670333862, + "learning_rate": 0.0002, + "loss": 1.2412, + "step": 2290 + }, + { + "epoch": 5.111111111111111, + "grad_norm": 0.849583625793457, + "learning_rate": 0.0002, + "loss": 1.184, + "step": 2300 + }, + { + "epoch": 5.133333333333334, + "grad_norm": 1.0143699645996094, + "learning_rate": 0.0002, + "loss": 1.1884, + "step": 2310 + }, + { + "epoch": 5.155555555555556, + "grad_norm": 0.8990702629089355, + "learning_rate": 0.0002, + "loss": 1.2133, + "step": 2320 + }, + { + "epoch": 5.177777777777778, + "grad_norm": 0.9822764992713928, + "learning_rate": 0.0002, + "loss": 1.2091, + "step": 2330 + }, + { + "epoch": 5.2, + "grad_norm": 0.9632459282875061, + "learning_rate": 0.0002, + "loss": 1.1775, + "step": 2340 + }, + { + "epoch": 5.222222222222222, + "grad_norm": 1.0897903442382812, + "learning_rate": 0.0002, + "loss": 1.1821, + "step": 2350 + }, + { + "epoch": 5.2444444444444445, + "grad_norm": 1.155950665473938, + "learning_rate": 0.0002, + "loss": 1.2976, + "step": 2360 + }, + { + "epoch": 5.266666666666667, + "grad_norm": 1.0566821098327637, + "learning_rate": 0.0002, + "loss": 1.1662, + "step": 2370 + }, + { + "epoch": 5.288888888888889, + "grad_norm": 1.191604733467102, + "learning_rate": 0.0002, + "loss": 1.2809, + "step": 2380 + }, + { + "epoch": 5.311111111111111, + "grad_norm": 0.852453887462616, + "learning_rate": 0.0002, + "loss": 1.2431, + "step": 2390 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.9649669528007507, + "learning_rate": 0.0002, + "loss": 1.2106, + "step": 2400 + }, + { + "epoch": 5.355555555555555, + "grad_norm": 1.0731003284454346, + "learning_rate": 0.0002, + "loss": 1.2433, + "step": 2410 + }, + { + "epoch": 5.377777777777778, + "grad_norm": 0.9628495573997498, + "learning_rate": 0.0002, + "loss": 1.1737, + "step": 2420 + }, + { + "epoch": 5.4, + "grad_norm": 0.9268819093704224, + "learning_rate": 0.0002, + "loss": 1.3166, + "step": 2430 + }, + { + "epoch": 5.4222222222222225, + "grad_norm": 1.1104000806808472, + "learning_rate": 0.0002, + "loss": 1.2114, + "step": 2440 + }, + { + "epoch": 5.444444444444445, + "grad_norm": 1.0439373254776, + "learning_rate": 0.0002, + "loss": 1.2151, + "step": 2450 + }, + { + "epoch": 5.466666666666667, + "grad_norm": 1.0366657972335815, + "learning_rate": 0.0002, + "loss": 1.2458, + "step": 2460 + }, + { + "epoch": 5.488888888888889, + "grad_norm": 1.0604808330535889, + "learning_rate": 0.0002, + "loss": 1.2021, + "step": 2470 + }, + { + "epoch": 5.511111111111111, + "grad_norm": 0.8845253586769104, + "learning_rate": 0.0002, + "loss": 1.2188, + "step": 2480 + }, + { + "epoch": 5.533333333333333, + "grad_norm": 0.8200256824493408, + "learning_rate": 0.0002, + "loss": 1.2296, + "step": 2490 + }, + { + "epoch": 5.555555555555555, + "grad_norm": 0.9628723859786987, + "learning_rate": 0.0002, + "loss": 1.2632, + "step": 2500 + }, + { + "epoch": 5.5777777777777775, + "grad_norm": 1.0758650302886963, + "learning_rate": 0.0002, + "loss": 1.2723, + "step": 2510 + }, + { + "epoch": 5.6, + "grad_norm": 1.0113487243652344, + "learning_rate": 0.0002, + "loss": 1.2298, + "step": 2520 + }, + { + "epoch": 5.622222222222222, + "grad_norm": 1.260536551475525, + "learning_rate": 0.0002, + "loss": 1.2226, + "step": 2530 + }, + { + "epoch": 5.644444444444445, + "grad_norm": 0.9229527115821838, + "learning_rate": 0.0002, + "loss": 1.227, + "step": 2540 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 0.9378697276115417, + "learning_rate": 0.0002, + "loss": 1.2223, + "step": 2550 + }, + { + "epoch": 5.688888888888889, + "grad_norm": 1.0404350757598877, + "learning_rate": 0.0002, + "loss": 1.2759, + "step": 2560 + }, + { + "epoch": 5.711111111111111, + "grad_norm": 1.1879961490631104, + "learning_rate": 0.0002, + "loss": 1.2132, + "step": 2570 + }, + { + "epoch": 5.733333333333333, + "grad_norm": 0.8881482481956482, + "learning_rate": 0.0002, + "loss": 1.2181, + "step": 2580 + }, + { + "epoch": 5.7555555555555555, + "grad_norm": 1.1428065299987793, + "learning_rate": 0.0002, + "loss": 1.2419, + "step": 2590 + }, + { + "epoch": 5.777777777777778, + "grad_norm": 0.8970609903335571, + "learning_rate": 0.0002, + "loss": 1.2682, + "step": 2600 + }, + { + "epoch": 5.8, + "grad_norm": 1.2084497213363647, + "learning_rate": 0.0002, + "loss": 1.2285, + "step": 2610 + }, + { + "epoch": 5.822222222222222, + "grad_norm": 1.04214608669281, + "learning_rate": 0.0002, + "loss": 1.2004, + "step": 2620 + }, + { + "epoch": 5.844444444444444, + "grad_norm": 1.0671849250793457, + "learning_rate": 0.0002, + "loss": 1.2388, + "step": 2630 + }, + { + "epoch": 5.866666666666667, + "grad_norm": 1.009602427482605, + "learning_rate": 0.0002, + "loss": 1.1714, + "step": 2640 + }, + { + "epoch": 5.888888888888889, + "grad_norm": 0.9787904024124146, + "learning_rate": 0.0002, + "loss": 1.2292, + "step": 2650 + }, + { + "epoch": 5.911111111111111, + "grad_norm": 1.0043761730194092, + "learning_rate": 0.0002, + "loss": 1.2404, + "step": 2660 + }, + { + "epoch": 5.933333333333334, + "grad_norm": 0.9855443239212036, + "learning_rate": 0.0002, + "loss": 1.2712, + "step": 2670 + }, + { + "epoch": 5.955555555555556, + "grad_norm": 1.1488507986068726, + "learning_rate": 0.0002, + "loss": 1.3112, + "step": 2680 + }, + { + "epoch": 5.977777777777778, + "grad_norm": 0.9939966797828674, + "learning_rate": 0.0002, + "loss": 1.2576, + "step": 2690 + }, + { + "epoch": 6.0, + "grad_norm": 1.0444952249526978, + "learning_rate": 0.0002, + "loss": 1.2847, + "step": 2700 + }, + { + "epoch": 6.0, + "eval_loss": 2.0881619453430176, + "eval_runtime": 39.6891, + "eval_samples_per_second": 12.976, + "eval_steps_per_second": 1.638, + "step": 2700 + }, + { + "epoch": 6.022222222222222, + "grad_norm": 1.3728636503219604, + "learning_rate": 0.0002, + "loss": 1.0764, + "step": 2710 + }, + { + "epoch": 6.044444444444444, + "grad_norm": 1.06633460521698, + "learning_rate": 0.0002, + "loss": 1.0778, + "step": 2720 + }, + { + "epoch": 6.066666666666666, + "grad_norm": 1.2068440914154053, + "learning_rate": 0.0002, + "loss": 1.0181, + "step": 2730 + }, + { + "epoch": 6.088888888888889, + "grad_norm": 1.248744010925293, + "learning_rate": 0.0002, + "loss": 1.0225, + "step": 2740 + }, + { + "epoch": 6.111111111111111, + "grad_norm": 1.1814687252044678, + "learning_rate": 0.0002, + "loss": 1.0885, + "step": 2750 + }, + { + "epoch": 6.133333333333334, + "grad_norm": 1.2335790395736694, + "learning_rate": 0.0002, + "loss": 0.973, + "step": 2760 + }, + { + "epoch": 6.155555555555556, + "grad_norm": 1.0661171674728394, + "learning_rate": 0.0002, + "loss": 1.0193, + "step": 2770 + }, + { + "epoch": 6.177777777777778, + "grad_norm": 1.345876932144165, + "learning_rate": 0.0002, + "loss": 1.0496, + "step": 2780 + }, + { + "epoch": 6.2, + "grad_norm": 1.2426252365112305, + "learning_rate": 0.0002, + "loss": 1.0252, + "step": 2790 + }, + { + "epoch": 6.222222222222222, + "grad_norm": 1.1970592737197876, + "learning_rate": 0.0002, + "loss": 1.0075, + "step": 2800 + }, + { + "epoch": 6.2444444444444445, + "grad_norm": 1.2484612464904785, + "learning_rate": 0.0002, + "loss": 1.1016, + "step": 2810 + }, + { + "epoch": 6.266666666666667, + "grad_norm": 1.2115106582641602, + "learning_rate": 0.0002, + "loss": 1.0032, + "step": 2820 + }, + { + "epoch": 6.288888888888889, + "grad_norm": 1.0024933815002441, + "learning_rate": 0.0002, + "loss": 1.0721, + "step": 2830 + }, + { + "epoch": 6.311111111111111, + "grad_norm": 1.1508114337921143, + "learning_rate": 0.0002, + "loss": 1.0705, + "step": 2840 + }, + { + "epoch": 6.333333333333333, + "grad_norm": 1.1686254739761353, + "learning_rate": 0.0002, + "loss": 1.0632, + "step": 2850 + }, + { + "epoch": 6.355555555555555, + "grad_norm": 1.2702640295028687, + "learning_rate": 0.0002, + "loss": 1.1031, + "step": 2860 + }, + { + "epoch": 6.377777777777778, + "grad_norm": 1.3344615697860718, + "learning_rate": 0.0002, + "loss": 1.1033, + "step": 2870 + }, + { + "epoch": 6.4, + "grad_norm": 1.27545964717865, + "learning_rate": 0.0002, + "loss": 1.1105, + "step": 2880 + }, + { + "epoch": 6.4222222222222225, + "grad_norm": 1.2365739345550537, + "learning_rate": 0.0002, + "loss": 1.0353, + "step": 2890 + }, + { + "epoch": 6.444444444444445, + "grad_norm": 1.3821545839309692, + "learning_rate": 0.0002, + "loss": 1.046, + "step": 2900 + }, + { + "epoch": 6.466666666666667, + "grad_norm": 1.1889359951019287, + "learning_rate": 0.0002, + "loss": 1.0643, + "step": 2910 + }, + { + "epoch": 6.488888888888889, + "grad_norm": 1.1324981451034546, + "learning_rate": 0.0002, + "loss": 1.0173, + "step": 2920 + }, + { + "epoch": 6.511111111111111, + "grad_norm": 1.154468297958374, + "learning_rate": 0.0002, + "loss": 1.0474, + "step": 2930 + }, + { + "epoch": 6.533333333333333, + "grad_norm": 1.211300253868103, + "learning_rate": 0.0002, + "loss": 1.1323, + "step": 2940 + }, + { + "epoch": 6.555555555555555, + "grad_norm": 1.3322433233261108, + "learning_rate": 0.0002, + "loss": 1.0901, + "step": 2950 + }, + { + "epoch": 6.5777777777777775, + "grad_norm": 1.2570568323135376, + "learning_rate": 0.0002, + "loss": 1.0636, + "step": 2960 + }, + { + "epoch": 6.6, + "grad_norm": 1.2037729024887085, + "learning_rate": 0.0002, + "loss": 1.1093, + "step": 2970 + }, + { + "epoch": 6.622222222222222, + "grad_norm": 1.2894154787063599, + "learning_rate": 0.0002, + "loss": 1.0355, + "step": 2980 + }, + { + "epoch": 6.644444444444445, + "grad_norm": 1.1682062149047852, + "learning_rate": 0.0002, + "loss": 0.9846, + "step": 2990 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.6112759113311768, + "learning_rate": 0.0002, + "loss": 1.1292, + "step": 3000 + }, + { + "epoch": 6.688888888888889, + "grad_norm": 1.227586269378662, + "learning_rate": 0.0002, + "loss": 1.1831, + "step": 3010 + }, + { + "epoch": 6.711111111111111, + "grad_norm": 1.2558735609054565, + "learning_rate": 0.0002, + "loss": 1.1656, + "step": 3020 + }, + { + "epoch": 6.733333333333333, + "grad_norm": 1.2739307880401611, + "learning_rate": 0.0002, + "loss": 1.1151, + "step": 3030 + }, + { + "epoch": 6.7555555555555555, + "grad_norm": 1.2761014699935913, + "learning_rate": 0.0002, + "loss": 1.0957, + "step": 3040 + }, + { + "epoch": 6.777777777777778, + "grad_norm": 1.308904767036438, + "learning_rate": 0.0002, + "loss": 1.0863, + "step": 3050 + }, + { + "epoch": 6.8, + "grad_norm": 1.6273704767227173, + "learning_rate": 0.0002, + "loss": 1.1072, + "step": 3060 + }, + { + "epoch": 6.822222222222222, + "grad_norm": 1.3006200790405273, + "learning_rate": 0.0002, + "loss": 1.0982, + "step": 3070 + }, + { + "epoch": 6.844444444444444, + "grad_norm": 1.2942757606506348, + "learning_rate": 0.0002, + "loss": 1.091, + "step": 3080 + }, + { + "epoch": 6.866666666666667, + "grad_norm": 1.3074650764465332, + "learning_rate": 0.0002, + "loss": 1.0371, + "step": 3090 + }, + { + "epoch": 6.888888888888889, + "grad_norm": 1.321811556816101, + "learning_rate": 0.0002, + "loss": 1.0782, + "step": 3100 + }, + { + "epoch": 6.911111111111111, + "grad_norm": 1.0926110744476318, + "learning_rate": 0.0002, + "loss": 1.1375, + "step": 3110 + }, + { + "epoch": 6.933333333333334, + "grad_norm": 1.3839191198349, + "learning_rate": 0.0002, + "loss": 1.0966, + "step": 3120 + }, + { + "epoch": 6.955555555555556, + "grad_norm": 1.084396481513977, + "learning_rate": 0.0002, + "loss": 1.111, + "step": 3130 + }, + { + "epoch": 6.977777777777778, + "grad_norm": 1.262983798980713, + "learning_rate": 0.0002, + "loss": 1.0947, + "step": 3140 + }, + { + "epoch": 7.0, + "grad_norm": 1.1751209497451782, + "learning_rate": 0.0002, + "loss": 1.099, + "step": 3150 + }, + { + "epoch": 7.0, + "eval_loss": 2.2316300868988037, + "eval_runtime": 81.7348, + "eval_samples_per_second": 6.301, + "eval_steps_per_second": 0.795, + "step": 3150 + }, + { + "epoch": 7.022222222222222, + "grad_norm": 1.7097322940826416, + "learning_rate": 0.0002, + "loss": 0.9085, + "step": 3160 + }, + { + "epoch": 7.044444444444444, + "grad_norm": 1.287734031677246, + "learning_rate": 0.0002, + "loss": 0.8524, + "step": 3170 + }, + { + "epoch": 7.066666666666666, + "grad_norm": 1.680770993232727, + "learning_rate": 0.0002, + "loss": 0.9244, + "step": 3180 + }, + { + "epoch": 7.088888888888889, + "grad_norm": 1.3358803987503052, + "learning_rate": 0.0002, + "loss": 0.8847, + "step": 3190 + }, + { + "epoch": 7.111111111111111, + "grad_norm": 1.5450502634048462, + "learning_rate": 0.0002, + "loss": 0.9036, + "step": 3200 + }, + { + "epoch": 7.133333333333334, + "grad_norm": 1.5816127061843872, + "learning_rate": 0.0002, + "loss": 0.8995, + "step": 3210 + }, + { + "epoch": 7.155555555555556, + "grad_norm": 1.4042329788208008, + "learning_rate": 0.0002, + "loss": 0.8622, + "step": 3220 + }, + { + "epoch": 7.177777777777778, + "grad_norm": 1.3045488595962524, + "learning_rate": 0.0002, + "loss": 0.9034, + "step": 3230 + }, + { + "epoch": 7.2, + "grad_norm": 1.4329142570495605, + "learning_rate": 0.0002, + "loss": 0.8673, + "step": 3240 + }, + { + "epoch": 7.222222222222222, + "grad_norm": 1.4555209875106812, + "learning_rate": 0.0002, + "loss": 0.8642, + "step": 3250 + }, + { + "epoch": 7.2444444444444445, + "grad_norm": 1.4156484603881836, + "learning_rate": 0.0002, + "loss": 0.8753, + "step": 3260 + }, + { + "epoch": 7.266666666666667, + "grad_norm": 1.3839219808578491, + "learning_rate": 0.0002, + "loss": 0.9189, + "step": 3270 + }, + { + "epoch": 7.288888888888889, + "grad_norm": 1.409365177154541, + "learning_rate": 0.0002, + "loss": 0.9091, + "step": 3280 + }, + { + "epoch": 7.311111111111111, + "grad_norm": 1.3349004983901978, + "learning_rate": 0.0002, + "loss": 0.8671, + "step": 3290 + }, + { + "epoch": 7.333333333333333, + "grad_norm": 1.602988839149475, + "learning_rate": 0.0002, + "loss": 0.9099, + "step": 3300 + }, + { + "epoch": 7.355555555555555, + "grad_norm": 1.492713451385498, + "learning_rate": 0.0002, + "loss": 0.8603, + "step": 3310 + }, + { + "epoch": 7.377777777777778, + "grad_norm": 1.4347516298294067, + "learning_rate": 0.0002, + "loss": 0.8906, + "step": 3320 + }, + { + "epoch": 7.4, + "grad_norm": 1.5181629657745361, + "learning_rate": 0.0002, + "loss": 0.9412, + "step": 3330 + }, + { + "epoch": 7.4222222222222225, + "grad_norm": 1.339322805404663, + "learning_rate": 0.0002, + "loss": 0.8748, + "step": 3340 + }, + { + "epoch": 7.444444444444445, + "grad_norm": 1.6582218408584595, + "learning_rate": 0.0002, + "loss": 0.9323, + "step": 3350 + }, + { + "epoch": 7.466666666666667, + "grad_norm": 1.3226500749588013, + "learning_rate": 0.0002, + "loss": 0.8823, + "step": 3360 + }, + { + "epoch": 7.488888888888889, + "grad_norm": 1.6935880184173584, + "learning_rate": 0.0002, + "loss": 0.9468, + "step": 3370 + }, + { + "epoch": 7.511111111111111, + "grad_norm": 1.2704429626464844, + "learning_rate": 0.0002, + "loss": 0.9078, + "step": 3380 + }, + { + "epoch": 7.533333333333333, + "grad_norm": 1.4228342771530151, + "learning_rate": 0.0002, + "loss": 0.8829, + "step": 3390 + }, + { + "epoch": 7.555555555555555, + "grad_norm": 1.8575019836425781, + "learning_rate": 0.0002, + "loss": 0.9053, + "step": 3400 + }, + { + "epoch": 7.5777777777777775, + "grad_norm": 1.4379228353500366, + "learning_rate": 0.0002, + "loss": 0.9372, + "step": 3410 + }, + { + "epoch": 7.6, + "grad_norm": 1.4535613059997559, + "learning_rate": 0.0002, + "loss": 0.9009, + "step": 3420 + }, + { + "epoch": 7.622222222222222, + "grad_norm": 1.485689401626587, + "learning_rate": 0.0002, + "loss": 0.9669, + "step": 3430 + }, + { + "epoch": 7.644444444444445, + "grad_norm": 1.6231895685195923, + "learning_rate": 0.0002, + "loss": 0.9765, + "step": 3440 + }, + { + "epoch": 7.666666666666667, + "grad_norm": 1.5033475160598755, + "learning_rate": 0.0002, + "loss": 0.9607, + "step": 3450 + }, + { + "epoch": 7.688888888888889, + "grad_norm": 1.2845245599746704, + "learning_rate": 0.0002, + "loss": 0.9834, + "step": 3460 + }, + { + "epoch": 7.711111111111111, + "grad_norm": 1.3614885807037354, + "learning_rate": 0.0002, + "loss": 0.9956, + "step": 3470 + }, + { + "epoch": 7.733333333333333, + "grad_norm": 1.876365303993225, + "learning_rate": 0.0002, + "loss": 0.9207, + "step": 3480 + }, + { + "epoch": 7.7555555555555555, + "grad_norm": 1.5048887729644775, + "learning_rate": 0.0002, + "loss": 0.9616, + "step": 3490 + }, + { + "epoch": 7.777777777777778, + "grad_norm": 1.401036024093628, + "learning_rate": 0.0002, + "loss": 0.9256, + "step": 3500 + }, + { + "epoch": 7.8, + "grad_norm": 1.4172956943511963, + "learning_rate": 0.0002, + "loss": 0.8659, + "step": 3510 + }, + { + "epoch": 7.822222222222222, + "grad_norm": 1.3779038190841675, + "learning_rate": 0.0002, + "loss": 0.9575, + "step": 3520 + }, + { + "epoch": 7.844444444444444, + "grad_norm": 1.2683740854263306, + "learning_rate": 0.0002, + "loss": 0.9083, + "step": 3530 + }, + { + "epoch": 7.866666666666667, + "grad_norm": 1.3728152513504028, + "learning_rate": 0.0002, + "loss": 1.0045, + "step": 3540 + }, + { + "epoch": 7.888888888888889, + "grad_norm": 1.5868757963180542, + "learning_rate": 0.0002, + "loss": 0.9687, + "step": 3550 + }, + { + "epoch": 7.911111111111111, + "grad_norm": 1.520365595817566, + "learning_rate": 0.0002, + "loss": 1.0928, + "step": 3560 + }, + { + "epoch": 7.933333333333334, + "grad_norm": 1.6288018226623535, + "learning_rate": 0.0002, + "loss": 0.9233, + "step": 3570 + }, + { + "epoch": 7.955555555555556, + "grad_norm": 1.3921650648117065, + "learning_rate": 0.0002, + "loss": 1.0147, + "step": 3580 + }, + { + "epoch": 7.977777777777778, + "grad_norm": 1.486502766609192, + "learning_rate": 0.0002, + "loss": 0.9452, + "step": 3590 + }, + { + "epoch": 8.0, + "grad_norm": 1.4413995742797852, + "learning_rate": 0.0002, + "loss": 1.0061, + "step": 3600 + }, + { + "epoch": 8.0, + "eval_loss": 2.388571262359619, + "eval_runtime": 102.0801, + "eval_samples_per_second": 5.045, + "eval_steps_per_second": 0.637, + "step": 3600 + } + ], + "logging_steps": 10, + "max_steps": 3600, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.665999392735232e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..08bcb50b9f534803ed2a4a4be696b74ae373982a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-3600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:333a3e3301276a753d48f5fabf4c169b8961c6e6ef7b7a7f931ba463d72983a4 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..06afd51579b66923b33639ad60961eb341399e77 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:649a028eb9cd893add84964d42ea9e52315f8c0fe932aec2fa2032f55a567bf7 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..14bffde7c2ae91e4a1c471feb631367ba8e5b084 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5322a63a4a032549c8d1545df35de8003c7b7f04002fc68673a080840a007eab +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..c177b3c1738c2b5fd06881062619d054b9b39627 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:accd8a1f32ea6eae6977db6c4081072553876a48e71633a79faa6441d93e8391 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e3e2a6de4d9ee56a0c7ae745bdab726b119ed18f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e558e3f62da6fe1227d986e18a0ebb0d00f3e27b52e7f3a5386706bfe1f1a699 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..728cdec0c8e21983e7289cd7f8946e026f9260e8 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/trainer_state.json @@ -0,0 +1,356 @@ +{ + "best_metric": 1.8310279846191406, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450", + "epoch": 1.0, + "eval_steps": 10, + "global_step": 450, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.022222222222222223, + "grad_norm": 0.5109436511993408, + "learning_rate": 0.0002, + "loss": 2.5874, + "step": 10 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 0.4870035946369171, + "learning_rate": 0.0002, + "loss": 2.2952, + "step": 20 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.535464882850647, + "learning_rate": 0.0002, + "loss": 2.0554, + "step": 30 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 0.49077996611595154, + "learning_rate": 0.0002, + "loss": 2.0067, + "step": 40 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.4671357572078705, + "learning_rate": 0.0002, + "loss": 2.0673, + "step": 50 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.4970313608646393, + "learning_rate": 0.0002, + "loss": 1.8751, + "step": 60 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 0.4438260495662689, + "learning_rate": 0.0002, + "loss": 1.9419, + "step": 70 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 0.5089705586433411, + "learning_rate": 0.0002, + "loss": 1.9856, + "step": 80 + }, + { + "epoch": 0.2, + "grad_norm": 0.4645078182220459, + "learning_rate": 0.0002, + "loss": 1.8805, + "step": 90 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.46095192432403564, + "learning_rate": 0.0002, + "loss": 1.807, + "step": 100 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 0.43338075280189514, + "learning_rate": 0.0002, + "loss": 1.891, + "step": 110 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.4433900713920593, + "learning_rate": 0.0002, + "loss": 1.84, + "step": 120 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 0.7018499970436096, + "learning_rate": 0.0002, + "loss": 1.8895, + "step": 130 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 0.37056994438171387, + "learning_rate": 0.0002, + "loss": 1.8273, + "step": 140 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.40634623169898987, + "learning_rate": 0.0002, + "loss": 1.8059, + "step": 150 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 0.41917353868484497, + "learning_rate": 0.0002, + "loss": 1.8393, + "step": 160 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 0.42392489314079285, + "learning_rate": 0.0002, + "loss": 1.8658, + "step": 170 + }, + { + "epoch": 0.4, + "grad_norm": 0.4281010627746582, + "learning_rate": 0.0002, + "loss": 1.8617, + "step": 180 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 0.38542497158050537, + "learning_rate": 0.0002, + "loss": 1.8163, + "step": 190 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.36003032326698303, + "learning_rate": 0.0002, + "loss": 1.8951, + "step": 200 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.37858229875564575, + "learning_rate": 0.0002, + "loss": 1.8611, + "step": 210 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 0.49986031651496887, + "learning_rate": 0.0002, + "loss": 1.8078, + "step": 220 + }, + { + "epoch": 0.5111111111111111, + "grad_norm": 0.3937094807624817, + "learning_rate": 0.0002, + "loss": 1.7413, + "step": 230 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.4566134512424469, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 240 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.3602476418018341, + "learning_rate": 0.0002, + "loss": 1.8303, + "step": 250 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 0.36321184039115906, + "learning_rate": 0.0002, + "loss": 1.7963, + "step": 260 + }, + { + "epoch": 0.6, + "grad_norm": 0.3808199167251587, + "learning_rate": 0.0002, + "loss": 1.9055, + "step": 270 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 0.38910621404647827, + "learning_rate": 0.0002, + "loss": 1.7916, + "step": 280 + }, + { + "epoch": 0.6444444444444445, + "grad_norm": 0.31913551688194275, + "learning_rate": 0.0002, + "loss": 1.7958, + "step": 290 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.34734025597572327, + "learning_rate": 0.0002, + "loss": 1.7468, + "step": 300 + }, + { + "epoch": 0.6888888888888889, + "grad_norm": 0.3517725467681885, + "learning_rate": 0.0002, + "loss": 1.8092, + "step": 310 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 0.3804526627063751, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 320 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.5592505931854248, + "learning_rate": 0.0002, + "loss": 1.8131, + "step": 330 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 0.36154472827911377, + "learning_rate": 0.0002, + "loss": 1.8827, + "step": 340 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.43970227241516113, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 350 + }, + { + "epoch": 0.8, + "grad_norm": 0.3525223731994629, + "learning_rate": 0.0002, + "loss": 1.8333, + "step": 360 + }, + { + "epoch": 0.8222222222222222, + "grad_norm": 0.3706997036933899, + "learning_rate": 0.0002, + "loss": 1.7802, + "step": 370 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 0.34138166904449463, + "learning_rate": 0.0002, + "loss": 1.7937, + "step": 380 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.4090622365474701, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 390 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.3729974031448364, + "learning_rate": 0.0002, + "loss": 1.7685, + "step": 400 + }, + { + "epoch": 0.9111111111111111, + "grad_norm": 0.3742152452468872, + "learning_rate": 0.0002, + "loss": 1.836, + "step": 410 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.37685129046440125, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 420 + }, + { + "epoch": 0.9555555555555556, + "grad_norm": 0.34339913725852966, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 430 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 0.31139856576919556, + "learning_rate": 0.0002, + "loss": 1.7651, + "step": 440 + }, + { + "epoch": 1.0, + "grad_norm": 0.3577502965927124, + "learning_rate": 0.0002, + "loss": 1.8471, + "step": 450 + }, + { + "epoch": 1.0, + "eval_loss": 1.8310279846191406, + "eval_runtime": 38.8374, + "eval_samples_per_second": 13.26, + "eval_steps_per_second": 1.674, + "step": 450 + } + ], + "logging_steps": 10, + "max_steps": 3600, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.08249924091904e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..08bcb50b9f534803ed2a4a4be696b74ae373982a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:333a3e3301276a753d48f5fabf4c169b8961c6e6ef7b7a7f931ba463d72983a4 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..89a48e8f3f4598a00630f71918711bed7f28a108 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e013a1c133e2c4b36abf2e4a3ad2e255709b344b3cda721d8cb898dc5d1a8eb8 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a25530f0fab3f57ead47e47b4eea55ceb4707219 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fdfacbbcd67442b25d15195fe8d3a4dd6c9f8297ac7e48337933c5e76caa16b +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..640fb2f2bfe5f5946d6db899ea32753d56782cd1 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8a5b4bdd1888dd62c1c437d5e0671ad84c31384ae96f0abbbf8844b571d3dfc +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..68a8750d59d4ea679dfcc0a10ed5b747b65b3dfa --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2edd15f3d3c2c6faad012b28b0ad0632bc5db50d5cf58f9f717dbe1dc8061e00 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..983c226f4a952708bbb9e5046d61cb8a7bf86b5f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/trainer_state.json @@ -0,0 +1,679 @@ +{ + "best_metric": 1.8250652551651, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.022222222222222223, + "grad_norm": 0.5109436511993408, + "learning_rate": 0.0002, + "loss": 2.5874, + "step": 10 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 0.4870035946369171, + "learning_rate": 0.0002, + "loss": 2.2952, + "step": 20 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.535464882850647, + "learning_rate": 0.0002, + "loss": 2.0554, + "step": 30 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 0.49077996611595154, + "learning_rate": 0.0002, + "loss": 2.0067, + "step": 40 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.4671357572078705, + "learning_rate": 0.0002, + "loss": 2.0673, + "step": 50 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.4970313608646393, + "learning_rate": 0.0002, + "loss": 1.8751, + "step": 60 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 0.4438260495662689, + "learning_rate": 0.0002, + "loss": 1.9419, + "step": 70 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 0.5089705586433411, + "learning_rate": 0.0002, + "loss": 1.9856, + "step": 80 + }, + { + "epoch": 0.2, + "grad_norm": 0.4645078182220459, + "learning_rate": 0.0002, + "loss": 1.8805, + "step": 90 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.46095192432403564, + "learning_rate": 0.0002, + "loss": 1.807, + "step": 100 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 0.43338075280189514, + "learning_rate": 0.0002, + "loss": 1.891, + "step": 110 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.4433900713920593, + "learning_rate": 0.0002, + "loss": 1.84, + "step": 120 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 0.7018499970436096, + "learning_rate": 0.0002, + "loss": 1.8895, + "step": 130 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 0.37056994438171387, + "learning_rate": 0.0002, + "loss": 1.8273, + "step": 140 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.40634623169898987, + "learning_rate": 0.0002, + "loss": 1.8059, + "step": 150 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 0.41917353868484497, + "learning_rate": 0.0002, + "loss": 1.8393, + "step": 160 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 0.42392489314079285, + "learning_rate": 0.0002, + "loss": 1.8658, + "step": 170 + }, + { + "epoch": 0.4, + "grad_norm": 0.4281010627746582, + "learning_rate": 0.0002, + "loss": 1.8617, + "step": 180 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 0.38542497158050537, + "learning_rate": 0.0002, + "loss": 1.8163, + "step": 190 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.36003032326698303, + "learning_rate": 0.0002, + "loss": 1.8951, + "step": 200 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.37858229875564575, + "learning_rate": 0.0002, + "loss": 1.8611, + "step": 210 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 0.49986031651496887, + "learning_rate": 0.0002, + "loss": 1.8078, + "step": 220 + }, + { + "epoch": 0.5111111111111111, + "grad_norm": 0.3937094807624817, + "learning_rate": 0.0002, + "loss": 1.7413, + "step": 230 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.4566134512424469, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 240 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.3602476418018341, + "learning_rate": 0.0002, + "loss": 1.8303, + "step": 250 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 0.36321184039115906, + "learning_rate": 0.0002, + "loss": 1.7963, + "step": 260 + }, + { + "epoch": 0.6, + "grad_norm": 0.3808199167251587, + "learning_rate": 0.0002, + "loss": 1.9055, + "step": 270 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 0.38910621404647827, + "learning_rate": 0.0002, + "loss": 1.7916, + "step": 280 + }, + { + "epoch": 0.6444444444444445, + "grad_norm": 0.31913551688194275, + "learning_rate": 0.0002, + "loss": 1.7958, + "step": 290 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.34734025597572327, + "learning_rate": 0.0002, + "loss": 1.7468, + "step": 300 + }, + { + "epoch": 0.6888888888888889, + "grad_norm": 0.3517725467681885, + "learning_rate": 0.0002, + "loss": 1.8092, + "step": 310 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 0.3804526627063751, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 320 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.5592505931854248, + "learning_rate": 0.0002, + "loss": 1.8131, + "step": 330 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 0.36154472827911377, + "learning_rate": 0.0002, + "loss": 1.8827, + "step": 340 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.43970227241516113, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 350 + }, + { + "epoch": 0.8, + "grad_norm": 0.3525223731994629, + "learning_rate": 0.0002, + "loss": 1.8333, + "step": 360 + }, + { + "epoch": 0.8222222222222222, + "grad_norm": 0.3706997036933899, + "learning_rate": 0.0002, + "loss": 1.7802, + "step": 370 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 0.34138166904449463, + "learning_rate": 0.0002, + "loss": 1.7937, + "step": 380 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.4090622365474701, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 390 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.3729974031448364, + "learning_rate": 0.0002, + "loss": 1.7685, + "step": 400 + }, + { + "epoch": 0.9111111111111111, + "grad_norm": 0.3742152452468872, + "learning_rate": 0.0002, + "loss": 1.836, + "step": 410 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.37685129046440125, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 420 + }, + { + "epoch": 0.9555555555555556, + "grad_norm": 0.34339913725852966, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 430 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 0.31139856576919556, + "learning_rate": 0.0002, + "loss": 1.7651, + "step": 440 + }, + { + "epoch": 1.0, + "grad_norm": 0.3577502965927124, + "learning_rate": 0.0002, + "loss": 1.8471, + "step": 450 + }, + { + "epoch": 1.0, + "eval_loss": 1.8310279846191406, + "eval_runtime": 38.8374, + "eval_samples_per_second": 13.26, + "eval_steps_per_second": 1.674, + "step": 450 + }, + { + "epoch": 1.0222222222222221, + "grad_norm": 0.3257788419723511, + "learning_rate": 0.0002, + "loss": 1.7741, + "step": 460 + }, + { + "epoch": 1.0444444444444445, + "grad_norm": 0.3878970146179199, + "learning_rate": 0.0002, + "loss": 1.8031, + "step": 470 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 0.364427387714386, + "learning_rate": 0.0002, + "loss": 1.701, + "step": 480 + }, + { + "epoch": 1.0888888888888888, + "grad_norm": 0.3374682664871216, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 490 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.35822123289108276, + "learning_rate": 0.0002, + "loss": 1.7308, + "step": 500 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 0.3748345673084259, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 510 + }, + { + "epoch": 1.1555555555555554, + "grad_norm": 0.3422437012195587, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 520 + }, + { + "epoch": 1.1777777777777778, + "grad_norm": 0.4289326071739197, + "learning_rate": 0.0002, + "loss": 1.7649, + "step": 530 + }, + { + "epoch": 1.2, + "grad_norm": 0.3706769645214081, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 540 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.4024733603000641, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 550 + }, + { + "epoch": 1.2444444444444445, + "grad_norm": 0.3960128128528595, + "learning_rate": 0.0002, + "loss": 1.7641, + "step": 560 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 0.38222864270210266, + "learning_rate": 0.0002, + "loss": 1.755, + "step": 570 + }, + { + "epoch": 1.2888888888888888, + "grad_norm": 0.4073713421821594, + "learning_rate": 0.0002, + "loss": 1.735, + "step": 580 + }, + { + "epoch": 1.3111111111111111, + "grad_norm": 0.3875499963760376, + "learning_rate": 0.0002, + "loss": 1.7183, + "step": 590 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.39740806818008423, + "learning_rate": 0.0002, + "loss": 1.7492, + "step": 600 + }, + { + "epoch": 1.3555555555555556, + "grad_norm": 0.38432490825653076, + "learning_rate": 0.0002, + "loss": 1.8478, + "step": 610 + }, + { + "epoch": 1.3777777777777778, + "grad_norm": 0.402729868888855, + "learning_rate": 0.0002, + "loss": 1.7327, + "step": 620 + }, + { + "epoch": 1.4, + "grad_norm": 0.36683231592178345, + "learning_rate": 0.0002, + "loss": 1.6634, + "step": 630 + }, + { + "epoch": 1.4222222222222223, + "grad_norm": 0.3883286714553833, + "learning_rate": 0.0002, + "loss": 1.8059, + "step": 640 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.4087409973144531, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 650 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 0.4042017459869385, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 660 + }, + { + "epoch": 1.488888888888889, + "grad_norm": 0.40149256587028503, + "learning_rate": 0.0002, + "loss": 1.7466, + "step": 670 + }, + { + "epoch": 1.511111111111111, + "grad_norm": 0.45146510004997253, + "learning_rate": 0.0002, + "loss": 1.7398, + "step": 680 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.4098089039325714, + "learning_rate": 0.0002, + "loss": 1.6923, + "step": 690 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.4181336760520935, + "learning_rate": 0.0002, + "loss": 1.6847, + "step": 700 + }, + { + "epoch": 1.5777777777777777, + "grad_norm": 1.3722974061965942, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 710 + }, + { + "epoch": 1.6, + "grad_norm": 0.3965230882167816, + "learning_rate": 0.0002, + "loss": 1.7424, + "step": 720 + }, + { + "epoch": 1.6222222222222222, + "grad_norm": 0.3842000663280487, + "learning_rate": 0.0002, + "loss": 1.7782, + "step": 730 + }, + { + "epoch": 1.6444444444444444, + "grad_norm": 0.3603688180446625, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 740 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.39973509311676025, + "learning_rate": 0.0002, + "loss": 1.7476, + "step": 750 + }, + { + "epoch": 1.6888888888888889, + "grad_norm": 0.3687385618686676, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 760 + }, + { + "epoch": 1.7111111111111112, + "grad_norm": 0.4267722964286804, + "learning_rate": 0.0002, + "loss": 1.7987, + "step": 770 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 0.41301295161247253, + "learning_rate": 0.0002, + "loss": 1.8041, + "step": 780 + }, + { + "epoch": 1.7555555555555555, + "grad_norm": 0.3945430517196655, + "learning_rate": 0.0002, + "loss": 1.7284, + "step": 790 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.4037930965423584, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 800 + }, + { + "epoch": 1.8, + "grad_norm": 0.406893253326416, + "learning_rate": 0.0002, + "loss": 1.6874, + "step": 810 + }, + { + "epoch": 1.8222222222222222, + "grad_norm": 0.4600457549095154, + "learning_rate": 0.0002, + "loss": 1.7628, + "step": 820 + }, + { + "epoch": 1.8444444444444446, + "grad_norm": 0.4195384085178375, + "learning_rate": 0.0002, + "loss": 1.8222, + "step": 830 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 0.3854130506515503, + "learning_rate": 0.0002, + "loss": 1.7123, + "step": 840 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.38279038667678833, + "learning_rate": 0.0002, + "loss": 1.7227, + "step": 850 + }, + { + "epoch": 1.911111111111111, + "grad_norm": 0.38249439001083374, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 860 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 0.42977792024612427, + "learning_rate": 0.0002, + "loss": 1.8058, + "step": 870 + }, + { + "epoch": 1.9555555555555557, + "grad_norm": 0.4109351933002472, + "learning_rate": 0.0002, + "loss": 1.713, + "step": 880 + }, + { + "epoch": 1.9777777777777779, + "grad_norm": 0.3734486699104309, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 890 + }, + { + "epoch": 2.0, + "grad_norm": 0.3603087067604065, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 900 + }, + { + "epoch": 2.0, + "eval_loss": 1.8250652551651, + "eval_runtime": 38.8657, + "eval_samples_per_second": 13.251, + "eval_steps_per_second": 1.672, + "step": 900 + } + ], + "logging_steps": 10, + "max_steps": 3600, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.16499848183808e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..08bcb50b9f534803ed2a4a4be696b74ae373982a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:333a3e3301276a753d48f5fabf4c169b8961c6e6ef7b7a7f931ba463d72983a4 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..08bcb50b9f534803ed2a4a4be696b74ae373982a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:333a3e3301276a753d48f5fabf4c169b8961c6e6ef7b7a7f931ba463d72983a4 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/training_log.jsonl b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..21d9fc5b3c1bdbbaf0171d29a44d1674e7b4b609 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/training_log.jsonl @@ -0,0 +1,8 @@ +{"epoch": 1.0, "step": 450, "epoch_duration": 484.0347971916199, "total_accumulated_duration": 484.0347971916199, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 9688.99365234375}, "avg_memory_reserved": {"GPU_0": 10406.0}, "peak_memory_reserved": {"GPU_0": 10406.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5874, "grad_norm": 0.5109436511993408, "learning_rate": 0.0002, "epoch": 0.022222222222222223, "step": 10}, {"loss": 2.2952, "grad_norm": 0.4870035946369171, "learning_rate": 0.0002, "epoch": 0.044444444444444446, "step": 20}, {"loss": 2.0554, "grad_norm": 0.535464882850647, "learning_rate": 0.0002, "epoch": 0.06666666666666667, "step": 30}, {"loss": 2.0067, "grad_norm": 0.49077996611595154, "learning_rate": 0.0002, "epoch": 0.08888888888888889, "step": 40}, {"loss": 2.0673, "grad_norm": 0.4671357572078705, "learning_rate": 0.0002, "epoch": 0.1111111111111111, "step": 50}, {"loss": 1.8751, "grad_norm": 0.4970313608646393, "learning_rate": 0.0002, "epoch": 0.13333333333333333, "step": 60}, {"loss": 1.9419, "grad_norm": 0.4438260495662689, "learning_rate": 0.0002, "epoch": 0.15555555555555556, "step": 70}, {"loss": 1.9856, "grad_norm": 0.5089705586433411, "learning_rate": 0.0002, "epoch": 0.17777777777777778, "step": 80}, {"loss": 1.8805, "grad_norm": 0.4645078182220459, "learning_rate": 0.0002, "epoch": 0.2, "step": 90}, {"loss": 1.807, "grad_norm": 0.46095192432403564, "learning_rate": 0.0002, "epoch": 0.2222222222222222, "step": 100}, {"loss": 1.891, "grad_norm": 0.43338075280189514, "learning_rate": 0.0002, "epoch": 0.24444444444444444, "step": 110}, {"loss": 1.84, "grad_norm": 0.4433900713920593, "learning_rate": 0.0002, "epoch": 0.26666666666666666, "step": 120}, {"loss": 1.8895, "grad_norm": 0.7018499970436096, "learning_rate": 0.0002, "epoch": 0.28888888888888886, "step": 130}, {"loss": 1.8273, "grad_norm": 0.37056994438171387, "learning_rate": 0.0002, "epoch": 0.3111111111111111, "step": 140}, {"loss": 1.8059, "grad_norm": 0.40634623169898987, "learning_rate": 0.0002, "epoch": 0.3333333333333333, "step": 150}, {"loss": 1.8393, "grad_norm": 0.41917353868484497, "learning_rate": 0.0002, "epoch": 0.35555555555555557, "step": 160}, {"loss": 1.8658, "grad_norm": 0.42392489314079285, "learning_rate": 0.0002, "epoch": 0.37777777777777777, "step": 170}, {"loss": 1.8617, "grad_norm": 0.4281010627746582, "learning_rate": 0.0002, "epoch": 0.4, "step": 180}, {"loss": 1.8163, "grad_norm": 0.38542497158050537, "learning_rate": 0.0002, "epoch": 0.4222222222222222, "step": 190}, {"loss": 1.8951, "grad_norm": 0.36003032326698303, "learning_rate": 0.0002, "epoch": 0.4444444444444444, "step": 200}, {"loss": 1.8611, "grad_norm": 0.37858229875564575, "learning_rate": 0.0002, "epoch": 0.4666666666666667, "step": 210}, {"loss": 1.8078, "grad_norm": 0.49986031651496887, "learning_rate": 0.0002, "epoch": 0.4888888888888889, "step": 220}, {"loss": 1.7413, "grad_norm": 0.3937094807624817, "learning_rate": 0.0002, "epoch": 0.5111111111111111, "step": 230}, {"loss": 1.8386, "grad_norm": 0.4566134512424469, "learning_rate": 0.0002, "epoch": 0.5333333333333333, "step": 240}, {"loss": 1.8303, "grad_norm": 0.3602476418018341, "learning_rate": 0.0002, "epoch": 0.5555555555555556, "step": 250}, {"loss": 1.7963, "grad_norm": 0.36321184039115906, "learning_rate": 0.0002, "epoch": 0.5777777777777777, "step": 260}, {"loss": 1.9055, "grad_norm": 0.3808199167251587, "learning_rate": 0.0002, "epoch": 0.6, "step": 270}, {"loss": 1.7916, "grad_norm": 0.38910621404647827, "learning_rate": 0.0002, "epoch": 0.6222222222222222, "step": 280}, {"loss": 1.7958, "grad_norm": 0.31913551688194275, "learning_rate": 0.0002, "epoch": 0.6444444444444445, "step": 290}, {"loss": 1.7468, "grad_norm": 0.34734025597572327, "learning_rate": 0.0002, "epoch": 0.6666666666666666, "step": 300}, {"loss": 1.8092, "grad_norm": 0.3517725467681885, "learning_rate": 0.0002, "epoch": 0.6888888888888889, "step": 310}, {"loss": 1.7847, "grad_norm": 0.3804526627063751, "learning_rate": 0.0002, "epoch": 0.7111111111111111, "step": 320}, {"loss": 1.8131, "grad_norm": 0.5592505931854248, "learning_rate": 0.0002, "epoch": 0.7333333333333333, "step": 330}, {"loss": 1.8827, "grad_norm": 0.36154472827911377, "learning_rate": 0.0002, "epoch": 0.7555555555555555, "step": 340}, {"loss": 1.7764, "grad_norm": 0.43970227241516113, "learning_rate": 0.0002, "epoch": 0.7777777777777778, "step": 350}, {"loss": 1.8333, "grad_norm": 0.3525223731994629, "learning_rate": 0.0002, "epoch": 0.8, "step": 360}, {"loss": 1.7802, "grad_norm": 0.3706997036933899, "learning_rate": 0.0002, "epoch": 0.8222222222222222, "step": 370}, {"loss": 1.7937, "grad_norm": 0.34138166904449463, "learning_rate": 0.0002, "epoch": 0.8444444444444444, "step": 380}, {"loss": 1.8251, "grad_norm": 0.4090622365474701, "learning_rate": 0.0002, "epoch": 0.8666666666666667, "step": 390}, {"loss": 1.7685, "grad_norm": 0.3729974031448364, "learning_rate": 0.0002, "epoch": 0.8888888888888888, "step": 400}, {"loss": 1.836, "grad_norm": 0.3742152452468872, "learning_rate": 0.0002, "epoch": 0.9111111111111111, "step": 410}, {"loss": 1.7998, "grad_norm": 0.37685129046440125, "learning_rate": 0.0002, "epoch": 0.9333333333333333, "step": 420}, {"loss": 1.7601, "grad_norm": 0.34339913725852966, "learning_rate": 0.0002, "epoch": 0.9555555555555556, "step": 430}, {"loss": 1.7651, "grad_norm": 0.31139856576919556, "learning_rate": 0.0002, "epoch": 0.9777777777777777, "step": 440}, {"loss": 1.8471, "grad_norm": 0.3577502965927124, "learning_rate": 0.0002, "epoch": 1.0, "step": 450}]} +{"epoch": 2.0, "step": 900, "epoch_duration": 485.03746914863586, "total_accumulated_duration": 969.0722663402557, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-450", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5874, "grad_norm": 0.5109436511993408, "learning_rate": 0.0002, "epoch": 0.022222222222222223, "step": 10}, {"loss": 2.2952, "grad_norm": 0.4870035946369171, "learning_rate": 0.0002, "epoch": 0.044444444444444446, "step": 20}, {"loss": 2.0554, "grad_norm": 0.535464882850647, "learning_rate": 0.0002, "epoch": 0.06666666666666667, "step": 30}, {"loss": 2.0067, "grad_norm": 0.49077996611595154, "learning_rate": 0.0002, "epoch": 0.08888888888888889, "step": 40}, {"loss": 2.0673, "grad_norm": 0.4671357572078705, "learning_rate": 0.0002, "epoch": 0.1111111111111111, "step": 50}, {"loss": 1.8751, "grad_norm": 0.4970313608646393, "learning_rate": 0.0002, "epoch": 0.13333333333333333, "step": 60}, {"loss": 1.9419, "grad_norm": 0.4438260495662689, "learning_rate": 0.0002, "epoch": 0.15555555555555556, "step": 70}, {"loss": 1.9856, "grad_norm": 0.5089705586433411, "learning_rate": 0.0002, "epoch": 0.17777777777777778, "step": 80}, {"loss": 1.8805, "grad_norm": 0.4645078182220459, "learning_rate": 0.0002, "epoch": 0.2, "step": 90}, {"loss": 1.807, "grad_norm": 0.46095192432403564, "learning_rate": 0.0002, "epoch": 0.2222222222222222, "step": 100}, {"loss": 1.891, "grad_norm": 0.43338075280189514, "learning_rate": 0.0002, "epoch": 0.24444444444444444, "step": 110}, {"loss": 1.84, "grad_norm": 0.4433900713920593, "learning_rate": 0.0002, "epoch": 0.26666666666666666, "step": 120}, {"loss": 1.8895, "grad_norm": 0.7018499970436096, "learning_rate": 0.0002, "epoch": 0.28888888888888886, "step": 130}, {"loss": 1.8273, "grad_norm": 0.37056994438171387, "learning_rate": 0.0002, "epoch": 0.3111111111111111, "step": 140}, {"loss": 1.8059, "grad_norm": 0.40634623169898987, "learning_rate": 0.0002, "epoch": 0.3333333333333333, "step": 150}, {"loss": 1.8393, "grad_norm": 0.41917353868484497, "learning_rate": 0.0002, "epoch": 0.35555555555555557, "step": 160}, {"loss": 1.8658, "grad_norm": 0.42392489314079285, "learning_rate": 0.0002, "epoch": 0.37777777777777777, "step": 170}, {"loss": 1.8617, "grad_norm": 0.4281010627746582, "learning_rate": 0.0002, "epoch": 0.4, "step": 180}, {"loss": 1.8163, "grad_norm": 0.38542497158050537, "learning_rate": 0.0002, "epoch": 0.4222222222222222, "step": 190}, {"loss": 1.8951, "grad_norm": 0.36003032326698303, "learning_rate": 0.0002, "epoch": 0.4444444444444444, "step": 200}, {"loss": 1.8611, "grad_norm": 0.37858229875564575, "learning_rate": 0.0002, "epoch": 0.4666666666666667, "step": 210}, {"loss": 1.8078, "grad_norm": 0.49986031651496887, "learning_rate": 0.0002, "epoch": 0.4888888888888889, "step": 220}, {"loss": 1.7413, "grad_norm": 0.3937094807624817, "learning_rate": 0.0002, "epoch": 0.5111111111111111, "step": 230}, {"loss": 1.8386, "grad_norm": 0.4566134512424469, "learning_rate": 0.0002, "epoch": 0.5333333333333333, "step": 240}, {"loss": 1.8303, "grad_norm": 0.3602476418018341, "learning_rate": 0.0002, "epoch": 0.5555555555555556, "step": 250}, {"loss": 1.7963, "grad_norm": 0.36321184039115906, "learning_rate": 0.0002, "epoch": 0.5777777777777777, "step": 260}, {"loss": 1.9055, "grad_norm": 0.3808199167251587, "learning_rate": 0.0002, "epoch": 0.6, "step": 270}, {"loss": 1.7916, "grad_norm": 0.38910621404647827, "learning_rate": 0.0002, "epoch": 0.6222222222222222, "step": 280}, {"loss": 1.7958, "grad_norm": 0.31913551688194275, "learning_rate": 0.0002, "epoch": 0.6444444444444445, "step": 290}, {"loss": 1.7468, "grad_norm": 0.34734025597572327, "learning_rate": 0.0002, "epoch": 0.6666666666666666, "step": 300}, {"loss": 1.8092, "grad_norm": 0.3517725467681885, "learning_rate": 0.0002, "epoch": 0.6888888888888889, "step": 310}, {"loss": 1.7847, "grad_norm": 0.3804526627063751, "learning_rate": 0.0002, "epoch": 0.7111111111111111, "step": 320}, {"loss": 1.8131, "grad_norm": 0.5592505931854248, "learning_rate": 0.0002, "epoch": 0.7333333333333333, "step": 330}, {"loss": 1.8827, "grad_norm": 0.36154472827911377, "learning_rate": 0.0002, "epoch": 0.7555555555555555, "step": 340}, {"loss": 1.7764, "grad_norm": 0.43970227241516113, "learning_rate": 0.0002, "epoch": 0.7777777777777778, "step": 350}, {"loss": 1.8333, "grad_norm": 0.3525223731994629, "learning_rate": 0.0002, "epoch": 0.8, "step": 360}, {"loss": 1.7802, "grad_norm": 0.3706997036933899, "learning_rate": 0.0002, "epoch": 0.8222222222222222, "step": 370}, {"loss": 1.7937, "grad_norm": 0.34138166904449463, "learning_rate": 0.0002, "epoch": 0.8444444444444444, "step": 380}, {"loss": 1.8251, "grad_norm": 0.4090622365474701, "learning_rate": 0.0002, "epoch": 0.8666666666666667, "step": 390}, {"loss": 1.7685, "grad_norm": 0.3729974031448364, "learning_rate": 0.0002, "epoch": 0.8888888888888888, "step": 400}, {"loss": 1.836, "grad_norm": 0.3742152452468872, "learning_rate": 0.0002, "epoch": 0.9111111111111111, "step": 410}, {"loss": 1.7998, "grad_norm": 0.37685129046440125, "learning_rate": 0.0002, "epoch": 0.9333333333333333, "step": 420}, {"loss": 1.7601, "grad_norm": 0.34339913725852966, "learning_rate": 0.0002, "epoch": 0.9555555555555556, "step": 430}, {"loss": 1.7651, "grad_norm": 0.31139856576919556, "learning_rate": 0.0002, "epoch": 0.9777777777777777, "step": 440}, {"loss": 1.8471, "grad_norm": 0.3577502965927124, "learning_rate": 0.0002, "epoch": 1.0, "step": 450}, {"eval_loss": 1.8310279846191406, "eval_runtime": 38.8374, "eval_samples_per_second": 13.26, "eval_steps_per_second": 1.674, "epoch": 1.0, "step": 450}, {"loss": 1.7741, "grad_norm": 0.3257788419723511, "learning_rate": 0.0002, "epoch": 1.0222222222222221, "step": 460}, {"loss": 1.8031, "grad_norm": 0.3878970146179199, "learning_rate": 0.0002, "epoch": 1.0444444444444445, "step": 470}, {"loss": 1.701, "grad_norm": 0.364427387714386, "learning_rate": 0.0002, "epoch": 1.0666666666666667, "step": 480}, {"loss": 1.7832, "grad_norm": 0.3374682664871216, "learning_rate": 0.0002, "epoch": 1.0888888888888888, "step": 490}, {"loss": 1.7308, "grad_norm": 0.35822123289108276, "learning_rate": 0.0002, "epoch": 1.1111111111111112, "step": 500}, {"loss": 1.7753, "grad_norm": 0.3748345673084259, "learning_rate": 0.0002, "epoch": 1.1333333333333333, "step": 510}, {"loss": 1.7225, "grad_norm": 0.3422437012195587, "learning_rate": 0.0002, "epoch": 1.1555555555555554, "step": 520}, {"loss": 1.7649, "grad_norm": 0.4289326071739197, "learning_rate": 0.0002, "epoch": 1.1777777777777778, "step": 530}, {"loss": 1.7548, "grad_norm": 0.3706769645214081, "learning_rate": 0.0002, "epoch": 1.2, "step": 540}, {"loss": 1.6988, "grad_norm": 0.4024733603000641, "learning_rate": 0.0002, "epoch": 1.2222222222222223, "step": 550}, {"loss": 1.7641, "grad_norm": 0.3960128128528595, "learning_rate": 0.0002, "epoch": 1.2444444444444445, "step": 560}, {"loss": 1.755, "grad_norm": 0.38222864270210266, "learning_rate": 0.0002, "epoch": 1.2666666666666666, "step": 570}, {"loss": 1.735, "grad_norm": 0.4073713421821594, "learning_rate": 0.0002, "epoch": 1.2888888888888888, "step": 580}, {"loss": 1.7183, "grad_norm": 0.3875499963760376, "learning_rate": 0.0002, "epoch": 1.3111111111111111, "step": 590}, {"loss": 1.7492, "grad_norm": 0.39740806818008423, "learning_rate": 0.0002, "epoch": 1.3333333333333333, "step": 600}, {"loss": 1.8478, "grad_norm": 0.38432490825653076, "learning_rate": 0.0002, "epoch": 1.3555555555555556, "step": 610}, {"loss": 1.7327, "grad_norm": 0.402729868888855, "learning_rate": 0.0002, "epoch": 1.3777777777777778, "step": 620}, {"loss": 1.6634, "grad_norm": 0.36683231592178345, "learning_rate": 0.0002, "epoch": 1.4, "step": 630}, {"loss": 1.8059, "grad_norm": 0.3883286714553833, "learning_rate": 0.0002, "epoch": 1.4222222222222223, "step": 640}, {"loss": 1.7953, "grad_norm": 0.4087409973144531, "learning_rate": 0.0002, "epoch": 1.4444444444444444, "step": 650}, {"loss": 1.7491, "grad_norm": 0.4042017459869385, "learning_rate": 0.0002, "epoch": 1.4666666666666668, "step": 660}, {"loss": 1.7466, "grad_norm": 0.40149256587028503, "learning_rate": 0.0002, "epoch": 1.488888888888889, "step": 670}, {"loss": 1.7398, "grad_norm": 0.45146510004997253, "learning_rate": 0.0002, "epoch": 1.511111111111111, "step": 680}, {"loss": 1.6923, "grad_norm": 0.4098089039325714, "learning_rate": 0.0002, "epoch": 1.5333333333333332, "step": 690}, {"loss": 1.6847, "grad_norm": 0.4181336760520935, "learning_rate": 0.0002, "epoch": 1.5555555555555556, "step": 700}, {"loss": 1.7862, "grad_norm": 1.3722974061965942, "learning_rate": 0.0002, "epoch": 1.5777777777777777, "step": 710}, {"loss": 1.7424, "grad_norm": 0.3965230882167816, "learning_rate": 0.0002, "epoch": 1.6, "step": 720}, {"loss": 1.7782, "grad_norm": 0.3842000663280487, "learning_rate": 0.0002, "epoch": 1.6222222222222222, "step": 730}, {"loss": 1.7682, "grad_norm": 0.3603688180446625, "learning_rate": 0.0002, "epoch": 1.6444444444444444, "step": 740}, {"loss": 1.7476, "grad_norm": 0.39973509311676025, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 750}, {"loss": 1.7329, "grad_norm": 0.3687385618686676, "learning_rate": 0.0002, "epoch": 1.6888888888888889, "step": 760}, {"loss": 1.7987, "grad_norm": 0.4267722964286804, "learning_rate": 0.0002, "epoch": 1.7111111111111112, "step": 770}, {"loss": 1.8041, "grad_norm": 0.41301295161247253, "learning_rate": 0.0002, "epoch": 1.7333333333333334, "step": 780}, {"loss": 1.7284, "grad_norm": 0.3945430517196655, "learning_rate": 0.0002, "epoch": 1.7555555555555555, "step": 790}, {"loss": 1.6691, "grad_norm": 0.4037930965423584, "learning_rate": 0.0002, "epoch": 1.7777777777777777, "step": 800}, {"loss": 1.6874, "grad_norm": 0.406893253326416, "learning_rate": 0.0002, "epoch": 1.8, "step": 810}, {"loss": 1.7628, "grad_norm": 0.4600457549095154, "learning_rate": 0.0002, "epoch": 1.8222222222222222, "step": 820}, {"loss": 1.8222, "grad_norm": 0.4195384085178375, "learning_rate": 0.0002, "epoch": 1.8444444444444446, "step": 830}, {"loss": 1.7123, "grad_norm": 0.3854130506515503, "learning_rate": 0.0002, "epoch": 1.8666666666666667, "step": 840}, {"loss": 1.7227, "grad_norm": 0.38279038667678833, "learning_rate": 0.0002, "epoch": 1.8888888888888888, "step": 850}, {"loss": 1.6942, "grad_norm": 0.38249439001083374, "learning_rate": 0.0002, "epoch": 1.911111111111111, "step": 860}, {"loss": 1.8058, "grad_norm": 0.42977792024612427, "learning_rate": 0.0002, "epoch": 1.9333333333333333, "step": 870}, {"loss": 1.713, "grad_norm": 0.4109351933002472, "learning_rate": 0.0002, "epoch": 1.9555555555555557, "step": 880}, {"loss": 1.7035, "grad_norm": 0.3734486699104309, "learning_rate": 0.0002, "epoch": 1.9777777777777779, "step": 890}, {"loss": 1.7502, "grad_norm": 0.3603087067604065, "learning_rate": 0.0002, "epoch": 2.0, "step": 900}]} +{"epoch": 3.0, "step": 1350, "epoch_duration": 486.2263672351837, "total_accumulated_duration": 1455.2986335754395, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5874, "grad_norm": 0.5109436511993408, "learning_rate": 0.0002, "epoch": 0.022222222222222223, "step": 10}, {"loss": 2.2952, "grad_norm": 0.4870035946369171, "learning_rate": 0.0002, "epoch": 0.044444444444444446, "step": 20}, {"loss": 2.0554, "grad_norm": 0.535464882850647, "learning_rate": 0.0002, "epoch": 0.06666666666666667, "step": 30}, {"loss": 2.0067, "grad_norm": 0.49077996611595154, "learning_rate": 0.0002, "epoch": 0.08888888888888889, "step": 40}, {"loss": 2.0673, "grad_norm": 0.4671357572078705, "learning_rate": 0.0002, "epoch": 0.1111111111111111, "step": 50}, {"loss": 1.8751, "grad_norm": 0.4970313608646393, "learning_rate": 0.0002, "epoch": 0.13333333333333333, "step": 60}, {"loss": 1.9419, "grad_norm": 0.4438260495662689, "learning_rate": 0.0002, "epoch": 0.15555555555555556, "step": 70}, {"loss": 1.9856, "grad_norm": 0.5089705586433411, "learning_rate": 0.0002, "epoch": 0.17777777777777778, "step": 80}, {"loss": 1.8805, "grad_norm": 0.4645078182220459, "learning_rate": 0.0002, "epoch": 0.2, "step": 90}, {"loss": 1.807, "grad_norm": 0.46095192432403564, "learning_rate": 0.0002, "epoch": 0.2222222222222222, "step": 100}, {"loss": 1.891, "grad_norm": 0.43338075280189514, "learning_rate": 0.0002, "epoch": 0.24444444444444444, "step": 110}, {"loss": 1.84, "grad_norm": 0.4433900713920593, "learning_rate": 0.0002, "epoch": 0.26666666666666666, "step": 120}, {"loss": 1.8895, "grad_norm": 0.7018499970436096, "learning_rate": 0.0002, "epoch": 0.28888888888888886, "step": 130}, {"loss": 1.8273, "grad_norm": 0.37056994438171387, "learning_rate": 0.0002, "epoch": 0.3111111111111111, "step": 140}, {"loss": 1.8059, "grad_norm": 0.40634623169898987, "learning_rate": 0.0002, "epoch": 0.3333333333333333, "step": 150}, {"loss": 1.8393, "grad_norm": 0.41917353868484497, "learning_rate": 0.0002, "epoch": 0.35555555555555557, "step": 160}, {"loss": 1.8658, "grad_norm": 0.42392489314079285, "learning_rate": 0.0002, "epoch": 0.37777777777777777, "step": 170}, {"loss": 1.8617, "grad_norm": 0.4281010627746582, "learning_rate": 0.0002, "epoch": 0.4, "step": 180}, {"loss": 1.8163, "grad_norm": 0.38542497158050537, "learning_rate": 0.0002, "epoch": 0.4222222222222222, "step": 190}, {"loss": 1.8951, "grad_norm": 0.36003032326698303, "learning_rate": 0.0002, "epoch": 0.4444444444444444, "step": 200}, {"loss": 1.8611, "grad_norm": 0.37858229875564575, "learning_rate": 0.0002, "epoch": 0.4666666666666667, "step": 210}, {"loss": 1.8078, "grad_norm": 0.49986031651496887, "learning_rate": 0.0002, "epoch": 0.4888888888888889, "step": 220}, {"loss": 1.7413, "grad_norm": 0.3937094807624817, "learning_rate": 0.0002, "epoch": 0.5111111111111111, "step": 230}, {"loss": 1.8386, "grad_norm": 0.4566134512424469, "learning_rate": 0.0002, "epoch": 0.5333333333333333, "step": 240}, {"loss": 1.8303, "grad_norm": 0.3602476418018341, "learning_rate": 0.0002, "epoch": 0.5555555555555556, "step": 250}, {"loss": 1.7963, "grad_norm": 0.36321184039115906, "learning_rate": 0.0002, "epoch": 0.5777777777777777, "step": 260}, {"loss": 1.9055, "grad_norm": 0.3808199167251587, "learning_rate": 0.0002, "epoch": 0.6, "step": 270}, {"loss": 1.7916, "grad_norm": 0.38910621404647827, "learning_rate": 0.0002, "epoch": 0.6222222222222222, "step": 280}, {"loss": 1.7958, "grad_norm": 0.31913551688194275, "learning_rate": 0.0002, "epoch": 0.6444444444444445, "step": 290}, {"loss": 1.7468, "grad_norm": 0.34734025597572327, "learning_rate": 0.0002, "epoch": 0.6666666666666666, "step": 300}, {"loss": 1.8092, "grad_norm": 0.3517725467681885, "learning_rate": 0.0002, "epoch": 0.6888888888888889, "step": 310}, {"loss": 1.7847, "grad_norm": 0.3804526627063751, "learning_rate": 0.0002, "epoch": 0.7111111111111111, "step": 320}, {"loss": 1.8131, "grad_norm": 0.5592505931854248, "learning_rate": 0.0002, "epoch": 0.7333333333333333, "step": 330}, {"loss": 1.8827, "grad_norm": 0.36154472827911377, "learning_rate": 0.0002, "epoch": 0.7555555555555555, "step": 340}, {"loss": 1.7764, "grad_norm": 0.43970227241516113, "learning_rate": 0.0002, "epoch": 0.7777777777777778, "step": 350}, {"loss": 1.8333, "grad_norm": 0.3525223731994629, "learning_rate": 0.0002, "epoch": 0.8, "step": 360}, {"loss": 1.7802, "grad_norm": 0.3706997036933899, "learning_rate": 0.0002, "epoch": 0.8222222222222222, "step": 370}, {"loss": 1.7937, "grad_norm": 0.34138166904449463, "learning_rate": 0.0002, "epoch": 0.8444444444444444, "step": 380}, {"loss": 1.8251, "grad_norm": 0.4090622365474701, "learning_rate": 0.0002, "epoch": 0.8666666666666667, "step": 390}, {"loss": 1.7685, "grad_norm": 0.3729974031448364, "learning_rate": 0.0002, "epoch": 0.8888888888888888, "step": 400}, {"loss": 1.836, "grad_norm": 0.3742152452468872, "learning_rate": 0.0002, "epoch": 0.9111111111111111, "step": 410}, {"loss": 1.7998, "grad_norm": 0.37685129046440125, "learning_rate": 0.0002, "epoch": 0.9333333333333333, "step": 420}, {"loss": 1.7601, "grad_norm": 0.34339913725852966, "learning_rate": 0.0002, "epoch": 0.9555555555555556, "step": 430}, {"loss": 1.7651, "grad_norm": 0.31139856576919556, "learning_rate": 0.0002, "epoch": 0.9777777777777777, "step": 440}, {"loss": 1.8471, "grad_norm": 0.3577502965927124, "learning_rate": 0.0002, "epoch": 1.0, "step": 450}, {"eval_loss": 1.8310279846191406, "eval_runtime": 38.8374, "eval_samples_per_second": 13.26, "eval_steps_per_second": 1.674, "epoch": 1.0, "step": 450}, {"loss": 1.7741, "grad_norm": 0.3257788419723511, "learning_rate": 0.0002, "epoch": 1.0222222222222221, "step": 460}, {"loss": 1.8031, "grad_norm": 0.3878970146179199, "learning_rate": 0.0002, "epoch": 1.0444444444444445, "step": 470}, {"loss": 1.701, "grad_norm": 0.364427387714386, "learning_rate": 0.0002, "epoch": 1.0666666666666667, "step": 480}, {"loss": 1.7832, "grad_norm": 0.3374682664871216, "learning_rate": 0.0002, "epoch": 1.0888888888888888, "step": 490}, {"loss": 1.7308, "grad_norm": 0.35822123289108276, "learning_rate": 0.0002, "epoch": 1.1111111111111112, "step": 500}, {"loss": 1.7753, "grad_norm": 0.3748345673084259, "learning_rate": 0.0002, "epoch": 1.1333333333333333, "step": 510}, {"loss": 1.7225, "grad_norm": 0.3422437012195587, "learning_rate": 0.0002, "epoch": 1.1555555555555554, "step": 520}, {"loss": 1.7649, "grad_norm": 0.4289326071739197, "learning_rate": 0.0002, "epoch": 1.1777777777777778, "step": 530}, {"loss": 1.7548, "grad_norm": 0.3706769645214081, "learning_rate": 0.0002, "epoch": 1.2, "step": 540}, {"loss": 1.6988, "grad_norm": 0.4024733603000641, "learning_rate": 0.0002, "epoch": 1.2222222222222223, "step": 550}, {"loss": 1.7641, "grad_norm": 0.3960128128528595, "learning_rate": 0.0002, "epoch": 1.2444444444444445, "step": 560}, {"loss": 1.755, "grad_norm": 0.38222864270210266, "learning_rate": 0.0002, "epoch": 1.2666666666666666, "step": 570}, {"loss": 1.735, "grad_norm": 0.4073713421821594, "learning_rate": 0.0002, "epoch": 1.2888888888888888, "step": 580}, {"loss": 1.7183, "grad_norm": 0.3875499963760376, "learning_rate": 0.0002, "epoch": 1.3111111111111111, "step": 590}, {"loss": 1.7492, "grad_norm": 0.39740806818008423, "learning_rate": 0.0002, "epoch": 1.3333333333333333, "step": 600}, {"loss": 1.8478, "grad_norm": 0.38432490825653076, "learning_rate": 0.0002, "epoch": 1.3555555555555556, "step": 610}, {"loss": 1.7327, "grad_norm": 0.402729868888855, "learning_rate": 0.0002, "epoch": 1.3777777777777778, "step": 620}, {"loss": 1.6634, "grad_norm": 0.36683231592178345, "learning_rate": 0.0002, "epoch": 1.4, "step": 630}, {"loss": 1.8059, "grad_norm": 0.3883286714553833, "learning_rate": 0.0002, "epoch": 1.4222222222222223, "step": 640}, {"loss": 1.7953, "grad_norm": 0.4087409973144531, "learning_rate": 0.0002, "epoch": 1.4444444444444444, "step": 650}, {"loss": 1.7491, "grad_norm": 0.4042017459869385, "learning_rate": 0.0002, "epoch": 1.4666666666666668, "step": 660}, {"loss": 1.7466, "grad_norm": 0.40149256587028503, "learning_rate": 0.0002, "epoch": 1.488888888888889, "step": 670}, {"loss": 1.7398, "grad_norm": 0.45146510004997253, "learning_rate": 0.0002, "epoch": 1.511111111111111, "step": 680}, {"loss": 1.6923, "grad_norm": 0.4098089039325714, "learning_rate": 0.0002, "epoch": 1.5333333333333332, "step": 690}, {"loss": 1.6847, "grad_norm": 0.4181336760520935, "learning_rate": 0.0002, "epoch": 1.5555555555555556, "step": 700}, {"loss": 1.7862, "grad_norm": 1.3722974061965942, "learning_rate": 0.0002, "epoch": 1.5777777777777777, "step": 710}, {"loss": 1.7424, "grad_norm": 0.3965230882167816, "learning_rate": 0.0002, "epoch": 1.6, "step": 720}, {"loss": 1.7782, "grad_norm": 0.3842000663280487, "learning_rate": 0.0002, "epoch": 1.6222222222222222, "step": 730}, {"loss": 1.7682, "grad_norm": 0.3603688180446625, "learning_rate": 0.0002, "epoch": 1.6444444444444444, "step": 740}, {"loss": 1.7476, "grad_norm": 0.39973509311676025, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 750}, {"loss": 1.7329, "grad_norm": 0.3687385618686676, "learning_rate": 0.0002, "epoch": 1.6888888888888889, "step": 760}, {"loss": 1.7987, "grad_norm": 0.4267722964286804, "learning_rate": 0.0002, "epoch": 1.7111111111111112, "step": 770}, {"loss": 1.8041, "grad_norm": 0.41301295161247253, "learning_rate": 0.0002, "epoch": 1.7333333333333334, "step": 780}, {"loss": 1.7284, "grad_norm": 0.3945430517196655, "learning_rate": 0.0002, "epoch": 1.7555555555555555, "step": 790}, {"loss": 1.6691, "grad_norm": 0.4037930965423584, "learning_rate": 0.0002, "epoch": 1.7777777777777777, "step": 800}, {"loss": 1.6874, "grad_norm": 0.406893253326416, "learning_rate": 0.0002, "epoch": 1.8, "step": 810}, {"loss": 1.7628, "grad_norm": 0.4600457549095154, "learning_rate": 0.0002, "epoch": 1.8222222222222222, "step": 820}, {"loss": 1.8222, "grad_norm": 0.4195384085178375, "learning_rate": 0.0002, "epoch": 1.8444444444444446, "step": 830}, {"loss": 1.7123, "grad_norm": 0.3854130506515503, "learning_rate": 0.0002, "epoch": 1.8666666666666667, "step": 840}, {"loss": 1.7227, "grad_norm": 0.38279038667678833, "learning_rate": 0.0002, "epoch": 1.8888888888888888, "step": 850}, {"loss": 1.6942, "grad_norm": 0.38249439001083374, "learning_rate": 0.0002, "epoch": 1.911111111111111, "step": 860}, {"loss": 1.8058, "grad_norm": 0.42977792024612427, "learning_rate": 0.0002, "epoch": 1.9333333333333333, "step": 870}, {"loss": 1.713, "grad_norm": 0.4109351933002472, "learning_rate": 0.0002, "epoch": 1.9555555555555557, "step": 880}, {"loss": 1.7035, "grad_norm": 0.3734486699104309, "learning_rate": 0.0002, "epoch": 1.9777777777777779, "step": 890}, {"loss": 1.7502, "grad_norm": 0.3603087067604065, "learning_rate": 0.0002, "epoch": 2.0, "step": 900}, {"eval_loss": 1.8250652551651, "eval_runtime": 38.8657, "eval_samples_per_second": 13.251, "eval_steps_per_second": 1.672, "epoch": 2.0, "step": 900}, {"loss": 1.6063, "grad_norm": 0.4014144241809845, "learning_rate": 0.0002, "epoch": 2.022222222222222, "step": 910}, {"loss": 1.6604, "grad_norm": 0.4338063597679138, "learning_rate": 0.0002, "epoch": 2.0444444444444443, "step": 920}, {"loss": 1.6354, "grad_norm": 0.3693605065345764, "learning_rate": 0.0002, "epoch": 2.066666666666667, "step": 930}, {"loss": 1.6186, "grad_norm": 0.4040255844593048, "learning_rate": 0.0002, "epoch": 2.088888888888889, "step": 940}, {"loss": 1.5976, "grad_norm": 0.43481820821762085, "learning_rate": 0.0002, "epoch": 2.111111111111111, "step": 950}, {"loss": 1.6081, "grad_norm": 0.41632869839668274, "learning_rate": 0.0002, "epoch": 2.1333333333333333, "step": 960}, {"loss": 1.6586, "grad_norm": 0.4633755385875702, "learning_rate": 0.0002, "epoch": 2.1555555555555554, "step": 970}, {"loss": 1.7396, "grad_norm": 0.43926581740379333, "learning_rate": 0.0002, "epoch": 2.1777777777777776, "step": 980}, {"loss": 1.5909, "grad_norm": 0.4757233262062073, "learning_rate": 0.0002, "epoch": 2.2, "step": 990}, {"loss": 1.648, "grad_norm": 0.5010586977005005, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 1000}, {"loss": 1.6454, "grad_norm": 0.44900986552238464, "learning_rate": 0.0002, "epoch": 2.2444444444444445, "step": 1010}, {"loss": 1.6493, "grad_norm": 0.41274750232696533, "learning_rate": 0.0002, "epoch": 2.2666666666666666, "step": 1020}, {"loss": 1.6592, "grad_norm": 0.44672393798828125, "learning_rate": 0.0002, "epoch": 2.2888888888888888, "step": 1030}, {"loss": 1.6349, "grad_norm": 0.4826269745826721, "learning_rate": 0.0002, "epoch": 2.311111111111111, "step": 1040}, {"loss": 1.5988, "grad_norm": 0.4650685489177704, "learning_rate": 0.0002, "epoch": 2.3333333333333335, "step": 1050}, {"loss": 1.5859, "grad_norm": 0.42507848143577576, "learning_rate": 0.0002, "epoch": 2.3555555555555556, "step": 1060}, {"loss": 1.5932, "grad_norm": 0.45653030276298523, "learning_rate": 0.0002, "epoch": 2.3777777777777778, "step": 1070}, {"loss": 1.6469, "grad_norm": 0.44534122943878174, "learning_rate": 0.0002, "epoch": 2.4, "step": 1080}, {"loss": 1.6115, "grad_norm": 0.4241289794445038, "learning_rate": 0.0002, "epoch": 2.422222222222222, "step": 1090}, {"loss": 1.6935, "grad_norm": 0.5004808306694031, "learning_rate": 0.0002, "epoch": 2.4444444444444446, "step": 1100}, {"loss": 1.6833, "grad_norm": 0.41425490379333496, "learning_rate": 0.0002, "epoch": 2.466666666666667, "step": 1110}, {"loss": 1.6151, "grad_norm": 0.44362279772758484, "learning_rate": 0.0002, "epoch": 2.488888888888889, "step": 1120}, {"loss": 1.6394, "grad_norm": 0.5530985593795776, "learning_rate": 0.0002, "epoch": 2.511111111111111, "step": 1130}, {"loss": 1.64, "grad_norm": 0.4290637969970703, "learning_rate": 0.0002, "epoch": 2.533333333333333, "step": 1140}, {"loss": 1.76, "grad_norm": 0.4957487881183624, "learning_rate": 0.0002, "epoch": 2.5555555555555554, "step": 1150}, {"loss": 1.613, "grad_norm": 0.5082747340202332, "learning_rate": 0.0002, "epoch": 2.5777777777777775, "step": 1160}, {"loss": 1.6702, "grad_norm": 0.478722482919693, "learning_rate": 0.0002, "epoch": 2.6, "step": 1170}, {"loss": 1.6198, "grad_norm": 0.436454176902771, "learning_rate": 0.0002, "epoch": 2.6222222222222222, "step": 1180}, {"loss": 1.663, "grad_norm": 0.4905032515525818, "learning_rate": 0.0002, "epoch": 2.6444444444444444, "step": 1190}, {"loss": 1.6216, "grad_norm": 0.4815700054168701, "learning_rate": 0.0002, "epoch": 2.6666666666666665, "step": 1200}, {"loss": 1.6994, "grad_norm": 0.3965534269809723, "learning_rate": 0.0002, "epoch": 2.688888888888889, "step": 1210}, {"loss": 1.744, "grad_norm": 0.43282169103622437, "learning_rate": 0.0002, "epoch": 2.7111111111111112, "step": 1220}, {"loss": 1.6455, "grad_norm": 0.45512479543685913, "learning_rate": 0.0002, "epoch": 2.7333333333333334, "step": 1230}, {"loss": 1.6444, "grad_norm": 0.44370076060295105, "learning_rate": 0.0002, "epoch": 2.7555555555555555, "step": 1240}, {"loss": 1.7106, "grad_norm": 0.4750686287879944, "learning_rate": 0.0002, "epoch": 2.7777777777777777, "step": 1250}, {"loss": 1.7122, "grad_norm": 0.41953766345977783, "learning_rate": 0.0002, "epoch": 2.8, "step": 1260}, {"loss": 1.6203, "grad_norm": 0.4887140095233917, "learning_rate": 0.0002, "epoch": 2.822222222222222, "step": 1270}, {"loss": 1.6691, "grad_norm": 0.46718958020210266, "learning_rate": 0.0002, "epoch": 2.8444444444444446, "step": 1280}, {"loss": 1.6257, "grad_norm": 0.48510900139808655, "learning_rate": 0.0002, "epoch": 2.8666666666666667, "step": 1290}, {"loss": 1.6825, "grad_norm": 0.4504084289073944, "learning_rate": 0.0002, "epoch": 2.888888888888889, "step": 1300}, {"loss": 1.7453, "grad_norm": 0.42119622230529785, "learning_rate": 0.0002, "epoch": 2.911111111111111, "step": 1310}, {"loss": 1.6662, "grad_norm": 0.4763694107532501, "learning_rate": 0.0002, "epoch": 2.9333333333333336, "step": 1320}, {"loss": 1.6151, "grad_norm": 0.422810822725296, "learning_rate": 0.0002, "epoch": 2.9555555555555557, "step": 1330}, {"loss": 1.6578, "grad_norm": 0.4768871068954468, "learning_rate": 0.0002, "epoch": 2.977777777777778, "step": 1340}, {"loss": 1.6167, "grad_norm": 0.48259881138801575, "learning_rate": 0.0002, "epoch": 3.0, "step": 1350}]} +{"epoch": 4.0, "step": 1800, "epoch_duration": 487.8836693763733, "total_accumulated_duration": 1943.1823029518127, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5874, "grad_norm": 0.5109436511993408, "learning_rate": 0.0002, "epoch": 0.022222222222222223, "step": 10}, {"loss": 2.2952, "grad_norm": 0.4870035946369171, "learning_rate": 0.0002, "epoch": 0.044444444444444446, "step": 20}, {"loss": 2.0554, "grad_norm": 0.535464882850647, "learning_rate": 0.0002, "epoch": 0.06666666666666667, "step": 30}, {"loss": 2.0067, "grad_norm": 0.49077996611595154, "learning_rate": 0.0002, "epoch": 0.08888888888888889, "step": 40}, {"loss": 2.0673, "grad_norm": 0.4671357572078705, "learning_rate": 0.0002, "epoch": 0.1111111111111111, "step": 50}, {"loss": 1.8751, "grad_norm": 0.4970313608646393, "learning_rate": 0.0002, "epoch": 0.13333333333333333, "step": 60}, {"loss": 1.9419, "grad_norm": 0.4438260495662689, "learning_rate": 0.0002, "epoch": 0.15555555555555556, "step": 70}, {"loss": 1.9856, "grad_norm": 0.5089705586433411, "learning_rate": 0.0002, "epoch": 0.17777777777777778, "step": 80}, {"loss": 1.8805, "grad_norm": 0.4645078182220459, "learning_rate": 0.0002, "epoch": 0.2, "step": 90}, {"loss": 1.807, "grad_norm": 0.46095192432403564, "learning_rate": 0.0002, "epoch": 0.2222222222222222, "step": 100}, {"loss": 1.891, "grad_norm": 0.43338075280189514, "learning_rate": 0.0002, "epoch": 0.24444444444444444, "step": 110}, {"loss": 1.84, "grad_norm": 0.4433900713920593, "learning_rate": 0.0002, "epoch": 0.26666666666666666, "step": 120}, {"loss": 1.8895, "grad_norm": 0.7018499970436096, "learning_rate": 0.0002, "epoch": 0.28888888888888886, "step": 130}, {"loss": 1.8273, "grad_norm": 0.37056994438171387, "learning_rate": 0.0002, "epoch": 0.3111111111111111, "step": 140}, {"loss": 1.8059, "grad_norm": 0.40634623169898987, "learning_rate": 0.0002, "epoch": 0.3333333333333333, "step": 150}, {"loss": 1.8393, "grad_norm": 0.41917353868484497, "learning_rate": 0.0002, "epoch": 0.35555555555555557, "step": 160}, {"loss": 1.8658, "grad_norm": 0.42392489314079285, "learning_rate": 0.0002, "epoch": 0.37777777777777777, "step": 170}, {"loss": 1.8617, "grad_norm": 0.4281010627746582, "learning_rate": 0.0002, "epoch": 0.4, "step": 180}, {"loss": 1.8163, "grad_norm": 0.38542497158050537, "learning_rate": 0.0002, "epoch": 0.4222222222222222, "step": 190}, {"loss": 1.8951, "grad_norm": 0.36003032326698303, "learning_rate": 0.0002, "epoch": 0.4444444444444444, "step": 200}, {"loss": 1.8611, "grad_norm": 0.37858229875564575, "learning_rate": 0.0002, "epoch": 0.4666666666666667, "step": 210}, {"loss": 1.8078, "grad_norm": 0.49986031651496887, "learning_rate": 0.0002, "epoch": 0.4888888888888889, "step": 220}, {"loss": 1.7413, "grad_norm": 0.3937094807624817, "learning_rate": 0.0002, "epoch": 0.5111111111111111, "step": 230}, {"loss": 1.8386, "grad_norm": 0.4566134512424469, "learning_rate": 0.0002, "epoch": 0.5333333333333333, "step": 240}, {"loss": 1.8303, "grad_norm": 0.3602476418018341, "learning_rate": 0.0002, "epoch": 0.5555555555555556, "step": 250}, {"loss": 1.7963, "grad_norm": 0.36321184039115906, "learning_rate": 0.0002, "epoch": 0.5777777777777777, "step": 260}, {"loss": 1.9055, "grad_norm": 0.3808199167251587, "learning_rate": 0.0002, "epoch": 0.6, "step": 270}, {"loss": 1.7916, "grad_norm": 0.38910621404647827, "learning_rate": 0.0002, "epoch": 0.6222222222222222, "step": 280}, {"loss": 1.7958, "grad_norm": 0.31913551688194275, "learning_rate": 0.0002, "epoch": 0.6444444444444445, "step": 290}, {"loss": 1.7468, "grad_norm": 0.34734025597572327, "learning_rate": 0.0002, "epoch": 0.6666666666666666, "step": 300}, {"loss": 1.8092, "grad_norm": 0.3517725467681885, "learning_rate": 0.0002, "epoch": 0.6888888888888889, "step": 310}, {"loss": 1.7847, "grad_norm": 0.3804526627063751, "learning_rate": 0.0002, "epoch": 0.7111111111111111, "step": 320}, {"loss": 1.8131, "grad_norm": 0.5592505931854248, "learning_rate": 0.0002, "epoch": 0.7333333333333333, "step": 330}, {"loss": 1.8827, "grad_norm": 0.36154472827911377, "learning_rate": 0.0002, "epoch": 0.7555555555555555, "step": 340}, {"loss": 1.7764, "grad_norm": 0.43970227241516113, "learning_rate": 0.0002, "epoch": 0.7777777777777778, "step": 350}, {"loss": 1.8333, "grad_norm": 0.3525223731994629, "learning_rate": 0.0002, "epoch": 0.8, "step": 360}, {"loss": 1.7802, "grad_norm": 0.3706997036933899, "learning_rate": 0.0002, "epoch": 0.8222222222222222, "step": 370}, {"loss": 1.7937, "grad_norm": 0.34138166904449463, "learning_rate": 0.0002, "epoch": 0.8444444444444444, "step": 380}, {"loss": 1.8251, "grad_norm": 0.4090622365474701, "learning_rate": 0.0002, "epoch": 0.8666666666666667, "step": 390}, {"loss": 1.7685, "grad_norm": 0.3729974031448364, "learning_rate": 0.0002, "epoch": 0.8888888888888888, "step": 400}, {"loss": 1.836, "grad_norm": 0.3742152452468872, "learning_rate": 0.0002, "epoch": 0.9111111111111111, "step": 410}, {"loss": 1.7998, "grad_norm": 0.37685129046440125, "learning_rate": 0.0002, "epoch": 0.9333333333333333, "step": 420}, {"loss": 1.7601, "grad_norm": 0.34339913725852966, "learning_rate": 0.0002, "epoch": 0.9555555555555556, "step": 430}, {"loss": 1.7651, "grad_norm": 0.31139856576919556, "learning_rate": 0.0002, "epoch": 0.9777777777777777, "step": 440}, {"loss": 1.8471, "grad_norm": 0.3577502965927124, "learning_rate": 0.0002, "epoch": 1.0, "step": 450}, {"eval_loss": 1.8310279846191406, "eval_runtime": 38.8374, "eval_samples_per_second": 13.26, "eval_steps_per_second": 1.674, "epoch": 1.0, "step": 450}, {"loss": 1.7741, "grad_norm": 0.3257788419723511, "learning_rate": 0.0002, "epoch": 1.0222222222222221, "step": 460}, {"loss": 1.8031, "grad_norm": 0.3878970146179199, "learning_rate": 0.0002, "epoch": 1.0444444444444445, "step": 470}, {"loss": 1.701, "grad_norm": 0.364427387714386, "learning_rate": 0.0002, "epoch": 1.0666666666666667, "step": 480}, {"loss": 1.7832, "grad_norm": 0.3374682664871216, "learning_rate": 0.0002, "epoch": 1.0888888888888888, "step": 490}, {"loss": 1.7308, "grad_norm": 0.35822123289108276, "learning_rate": 0.0002, "epoch": 1.1111111111111112, "step": 500}, {"loss": 1.7753, "grad_norm": 0.3748345673084259, "learning_rate": 0.0002, "epoch": 1.1333333333333333, "step": 510}, {"loss": 1.7225, "grad_norm": 0.3422437012195587, "learning_rate": 0.0002, "epoch": 1.1555555555555554, "step": 520}, {"loss": 1.7649, "grad_norm": 0.4289326071739197, "learning_rate": 0.0002, "epoch": 1.1777777777777778, "step": 530}, {"loss": 1.7548, "grad_norm": 0.3706769645214081, "learning_rate": 0.0002, "epoch": 1.2, "step": 540}, {"loss": 1.6988, "grad_norm": 0.4024733603000641, "learning_rate": 0.0002, "epoch": 1.2222222222222223, "step": 550}, {"loss": 1.7641, "grad_norm": 0.3960128128528595, "learning_rate": 0.0002, "epoch": 1.2444444444444445, "step": 560}, {"loss": 1.755, "grad_norm": 0.38222864270210266, "learning_rate": 0.0002, "epoch": 1.2666666666666666, "step": 570}, {"loss": 1.735, "grad_norm": 0.4073713421821594, "learning_rate": 0.0002, "epoch": 1.2888888888888888, "step": 580}, {"loss": 1.7183, "grad_norm": 0.3875499963760376, "learning_rate": 0.0002, "epoch": 1.3111111111111111, "step": 590}, {"loss": 1.7492, "grad_norm": 0.39740806818008423, "learning_rate": 0.0002, "epoch": 1.3333333333333333, "step": 600}, {"loss": 1.8478, "grad_norm": 0.38432490825653076, "learning_rate": 0.0002, "epoch": 1.3555555555555556, "step": 610}, {"loss": 1.7327, "grad_norm": 0.402729868888855, "learning_rate": 0.0002, "epoch": 1.3777777777777778, "step": 620}, {"loss": 1.6634, "grad_norm": 0.36683231592178345, "learning_rate": 0.0002, "epoch": 1.4, "step": 630}, {"loss": 1.8059, "grad_norm": 0.3883286714553833, "learning_rate": 0.0002, "epoch": 1.4222222222222223, "step": 640}, {"loss": 1.7953, "grad_norm": 0.4087409973144531, "learning_rate": 0.0002, "epoch": 1.4444444444444444, "step": 650}, {"loss": 1.7491, "grad_norm": 0.4042017459869385, "learning_rate": 0.0002, "epoch": 1.4666666666666668, "step": 660}, {"loss": 1.7466, "grad_norm": 0.40149256587028503, "learning_rate": 0.0002, "epoch": 1.488888888888889, "step": 670}, {"loss": 1.7398, "grad_norm": 0.45146510004997253, "learning_rate": 0.0002, "epoch": 1.511111111111111, "step": 680}, {"loss": 1.6923, "grad_norm": 0.4098089039325714, "learning_rate": 0.0002, "epoch": 1.5333333333333332, "step": 690}, {"loss": 1.6847, "grad_norm": 0.4181336760520935, "learning_rate": 0.0002, "epoch": 1.5555555555555556, "step": 700}, {"loss": 1.7862, "grad_norm": 1.3722974061965942, "learning_rate": 0.0002, "epoch": 1.5777777777777777, "step": 710}, {"loss": 1.7424, "grad_norm": 0.3965230882167816, "learning_rate": 0.0002, "epoch": 1.6, "step": 720}, {"loss": 1.7782, "grad_norm": 0.3842000663280487, "learning_rate": 0.0002, "epoch": 1.6222222222222222, "step": 730}, {"loss": 1.7682, "grad_norm": 0.3603688180446625, "learning_rate": 0.0002, "epoch": 1.6444444444444444, "step": 740}, {"loss": 1.7476, "grad_norm": 0.39973509311676025, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 750}, {"loss": 1.7329, "grad_norm": 0.3687385618686676, "learning_rate": 0.0002, "epoch": 1.6888888888888889, "step": 760}, {"loss": 1.7987, "grad_norm": 0.4267722964286804, "learning_rate": 0.0002, "epoch": 1.7111111111111112, "step": 770}, {"loss": 1.8041, "grad_norm": 0.41301295161247253, "learning_rate": 0.0002, "epoch": 1.7333333333333334, "step": 780}, {"loss": 1.7284, "grad_norm": 0.3945430517196655, "learning_rate": 0.0002, "epoch": 1.7555555555555555, "step": 790}, {"loss": 1.6691, "grad_norm": 0.4037930965423584, "learning_rate": 0.0002, "epoch": 1.7777777777777777, "step": 800}, {"loss": 1.6874, "grad_norm": 0.406893253326416, "learning_rate": 0.0002, "epoch": 1.8, "step": 810}, {"loss": 1.7628, "grad_norm": 0.4600457549095154, "learning_rate": 0.0002, "epoch": 1.8222222222222222, "step": 820}, {"loss": 1.8222, "grad_norm": 0.4195384085178375, "learning_rate": 0.0002, "epoch": 1.8444444444444446, "step": 830}, {"loss": 1.7123, "grad_norm": 0.3854130506515503, "learning_rate": 0.0002, "epoch": 1.8666666666666667, "step": 840}, {"loss": 1.7227, "grad_norm": 0.38279038667678833, "learning_rate": 0.0002, "epoch": 1.8888888888888888, "step": 850}, {"loss": 1.6942, "grad_norm": 0.38249439001083374, "learning_rate": 0.0002, "epoch": 1.911111111111111, "step": 860}, {"loss": 1.8058, "grad_norm": 0.42977792024612427, "learning_rate": 0.0002, "epoch": 1.9333333333333333, "step": 870}, {"loss": 1.713, "grad_norm": 0.4109351933002472, "learning_rate": 0.0002, "epoch": 1.9555555555555557, "step": 880}, {"loss": 1.7035, "grad_norm": 0.3734486699104309, "learning_rate": 0.0002, "epoch": 1.9777777777777779, "step": 890}, {"loss": 1.7502, "grad_norm": 0.3603087067604065, "learning_rate": 0.0002, "epoch": 2.0, "step": 900}, {"eval_loss": 1.8250652551651, "eval_runtime": 38.8657, "eval_samples_per_second": 13.251, "eval_steps_per_second": 1.672, "epoch": 2.0, "step": 900}, {"loss": 1.6063, "grad_norm": 0.4014144241809845, "learning_rate": 0.0002, "epoch": 2.022222222222222, "step": 910}, {"loss": 1.6604, "grad_norm": 0.4338063597679138, "learning_rate": 0.0002, "epoch": 2.0444444444444443, "step": 920}, {"loss": 1.6354, "grad_norm": 0.3693605065345764, "learning_rate": 0.0002, "epoch": 2.066666666666667, "step": 930}, {"loss": 1.6186, "grad_norm": 0.4040255844593048, "learning_rate": 0.0002, "epoch": 2.088888888888889, "step": 940}, {"loss": 1.5976, "grad_norm": 0.43481820821762085, "learning_rate": 0.0002, "epoch": 2.111111111111111, "step": 950}, {"loss": 1.6081, "grad_norm": 0.41632869839668274, "learning_rate": 0.0002, "epoch": 2.1333333333333333, "step": 960}, {"loss": 1.6586, "grad_norm": 0.4633755385875702, "learning_rate": 0.0002, "epoch": 2.1555555555555554, "step": 970}, {"loss": 1.7396, "grad_norm": 0.43926581740379333, "learning_rate": 0.0002, "epoch": 2.1777777777777776, "step": 980}, {"loss": 1.5909, "grad_norm": 0.4757233262062073, "learning_rate": 0.0002, "epoch": 2.2, "step": 990}, {"loss": 1.648, "grad_norm": 0.5010586977005005, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 1000}, {"loss": 1.6454, "grad_norm": 0.44900986552238464, "learning_rate": 0.0002, "epoch": 2.2444444444444445, "step": 1010}, {"loss": 1.6493, "grad_norm": 0.41274750232696533, "learning_rate": 0.0002, "epoch": 2.2666666666666666, "step": 1020}, {"loss": 1.6592, "grad_norm": 0.44672393798828125, "learning_rate": 0.0002, "epoch": 2.2888888888888888, "step": 1030}, {"loss": 1.6349, "grad_norm": 0.4826269745826721, "learning_rate": 0.0002, "epoch": 2.311111111111111, "step": 1040}, {"loss": 1.5988, "grad_norm": 0.4650685489177704, "learning_rate": 0.0002, "epoch": 2.3333333333333335, "step": 1050}, {"loss": 1.5859, "grad_norm": 0.42507848143577576, "learning_rate": 0.0002, "epoch": 2.3555555555555556, "step": 1060}, {"loss": 1.5932, "grad_norm": 0.45653030276298523, "learning_rate": 0.0002, "epoch": 2.3777777777777778, "step": 1070}, {"loss": 1.6469, "grad_norm": 0.44534122943878174, "learning_rate": 0.0002, "epoch": 2.4, "step": 1080}, {"loss": 1.6115, "grad_norm": 0.4241289794445038, "learning_rate": 0.0002, "epoch": 2.422222222222222, "step": 1090}, {"loss": 1.6935, "grad_norm": 0.5004808306694031, "learning_rate": 0.0002, "epoch": 2.4444444444444446, "step": 1100}, {"loss": 1.6833, "grad_norm": 0.41425490379333496, "learning_rate": 0.0002, "epoch": 2.466666666666667, "step": 1110}, {"loss": 1.6151, "grad_norm": 0.44362279772758484, "learning_rate": 0.0002, "epoch": 2.488888888888889, "step": 1120}, {"loss": 1.6394, "grad_norm": 0.5530985593795776, "learning_rate": 0.0002, "epoch": 2.511111111111111, "step": 1130}, {"loss": 1.64, "grad_norm": 0.4290637969970703, "learning_rate": 0.0002, "epoch": 2.533333333333333, "step": 1140}, {"loss": 1.76, "grad_norm": 0.4957487881183624, "learning_rate": 0.0002, "epoch": 2.5555555555555554, "step": 1150}, {"loss": 1.613, "grad_norm": 0.5082747340202332, "learning_rate": 0.0002, "epoch": 2.5777777777777775, "step": 1160}, {"loss": 1.6702, "grad_norm": 0.478722482919693, "learning_rate": 0.0002, "epoch": 2.6, "step": 1170}, {"loss": 1.6198, "grad_norm": 0.436454176902771, "learning_rate": 0.0002, "epoch": 2.6222222222222222, "step": 1180}, {"loss": 1.663, "grad_norm": 0.4905032515525818, "learning_rate": 0.0002, "epoch": 2.6444444444444444, "step": 1190}, {"loss": 1.6216, "grad_norm": 0.4815700054168701, "learning_rate": 0.0002, "epoch": 2.6666666666666665, "step": 1200}, {"loss": 1.6994, "grad_norm": 0.3965534269809723, "learning_rate": 0.0002, "epoch": 2.688888888888889, "step": 1210}, {"loss": 1.744, "grad_norm": 0.43282169103622437, "learning_rate": 0.0002, "epoch": 2.7111111111111112, "step": 1220}, {"loss": 1.6455, "grad_norm": 0.45512479543685913, "learning_rate": 0.0002, "epoch": 2.7333333333333334, "step": 1230}, {"loss": 1.6444, "grad_norm": 0.44370076060295105, "learning_rate": 0.0002, "epoch": 2.7555555555555555, "step": 1240}, {"loss": 1.7106, "grad_norm": 0.4750686287879944, "learning_rate": 0.0002, "epoch": 2.7777777777777777, "step": 1250}, {"loss": 1.7122, "grad_norm": 0.41953766345977783, "learning_rate": 0.0002, "epoch": 2.8, "step": 1260}, {"loss": 1.6203, "grad_norm": 0.4887140095233917, "learning_rate": 0.0002, "epoch": 2.822222222222222, "step": 1270}, {"loss": 1.6691, "grad_norm": 0.46718958020210266, "learning_rate": 0.0002, "epoch": 2.8444444444444446, "step": 1280}, {"loss": 1.6257, "grad_norm": 0.48510900139808655, "learning_rate": 0.0002, "epoch": 2.8666666666666667, "step": 1290}, {"loss": 1.6825, "grad_norm": 0.4504084289073944, "learning_rate": 0.0002, "epoch": 2.888888888888889, "step": 1300}, {"loss": 1.7453, "grad_norm": 0.42119622230529785, "learning_rate": 0.0002, "epoch": 2.911111111111111, "step": 1310}, {"loss": 1.6662, "grad_norm": 0.4763694107532501, "learning_rate": 0.0002, "epoch": 2.9333333333333336, "step": 1320}, {"loss": 1.6151, "grad_norm": 0.422810822725296, "learning_rate": 0.0002, "epoch": 2.9555555555555557, "step": 1330}, {"loss": 1.6578, "grad_norm": 0.4768871068954468, "learning_rate": 0.0002, "epoch": 2.977777777777778, "step": 1340}, {"loss": 1.6167, "grad_norm": 0.48259881138801575, "learning_rate": 0.0002, "epoch": 3.0, "step": 1350}, {"eval_loss": 1.8452560901641846, "eval_runtime": 38.8621, "eval_samples_per_second": 13.252, "eval_steps_per_second": 1.673, "epoch": 3.0, "step": 1350}, {"loss": 1.5351, "grad_norm": 0.6933313012123108, "learning_rate": 0.0002, "epoch": 3.022222222222222, "step": 1360}, {"loss": 1.5542, "grad_norm": 0.5870710611343384, "learning_rate": 0.0002, "epoch": 3.0444444444444443, "step": 1370}, {"loss": 1.511, "grad_norm": 0.602210283279419, "learning_rate": 0.0002, "epoch": 3.066666666666667, "step": 1380}, {"loss": 1.5272, "grad_norm": 0.6461787819862366, "learning_rate": 0.0002, "epoch": 3.088888888888889, "step": 1390}, {"loss": 1.4813, "grad_norm": 0.5839587450027466, "learning_rate": 0.0002, "epoch": 3.111111111111111, "step": 1400}, {"loss": 1.505, "grad_norm": 0.5757876038551331, "learning_rate": 0.0002, "epoch": 3.1333333333333333, "step": 1410}, {"loss": 1.4963, "grad_norm": 0.5862616300582886, "learning_rate": 0.0002, "epoch": 3.1555555555555554, "step": 1420}, {"loss": 1.5144, "grad_norm": 0.6103630065917969, "learning_rate": 0.0002, "epoch": 3.1777777777777776, "step": 1430}, {"loss": 1.5406, "grad_norm": 0.9309254884719849, "learning_rate": 0.0002, "epoch": 3.2, "step": 1440}, {"loss": 1.487, "grad_norm": 0.5360018014907837, "learning_rate": 0.0002, "epoch": 3.2222222222222223, "step": 1450}, {"loss": 1.5659, "grad_norm": 0.5448758602142334, "learning_rate": 0.0002, "epoch": 3.2444444444444445, "step": 1460}, {"loss": 1.5595, "grad_norm": 0.5973812341690063, "learning_rate": 0.0002, "epoch": 3.2666666666666666, "step": 1470}, {"loss": 1.5223, "grad_norm": 0.6245622038841248, "learning_rate": 0.0002, "epoch": 3.2888888888888888, "step": 1480}, {"loss": 1.4795, "grad_norm": 0.6533768773078918, "learning_rate": 0.0002, "epoch": 3.311111111111111, "step": 1490}, {"loss": 1.5562, "grad_norm": 0.5765811204910278, "learning_rate": 0.0002, "epoch": 3.3333333333333335, "step": 1500}, {"loss": 1.5405, "grad_norm": 0.591395378112793, "learning_rate": 0.0002, "epoch": 3.3555555555555556, "step": 1510}, {"loss": 1.5658, "grad_norm": 0.5842425227165222, "learning_rate": 0.0002, "epoch": 3.3777777777777778, "step": 1520}, {"loss": 1.5065, "grad_norm": 0.5731365084648132, "learning_rate": 0.0002, "epoch": 3.4, "step": 1530}, {"loss": 1.5438, "grad_norm": 0.5841306447982788, "learning_rate": 0.0002, "epoch": 3.422222222222222, "step": 1540}, {"loss": 1.4922, "grad_norm": 0.6503536701202393, "learning_rate": 0.0002, "epoch": 3.4444444444444446, "step": 1550}, {"loss": 1.5493, "grad_norm": 0.6170967221260071, "learning_rate": 0.0002, "epoch": 3.466666666666667, "step": 1560}, {"loss": 1.5098, "grad_norm": 0.5576487183570862, "learning_rate": 0.0002, "epoch": 3.488888888888889, "step": 1570}, {"loss": 1.472, "grad_norm": 0.7082911133766174, "learning_rate": 0.0002, "epoch": 3.511111111111111, "step": 1580}, {"loss": 1.5594, "grad_norm": 0.6159376502037048, "learning_rate": 0.0002, "epoch": 3.533333333333333, "step": 1590}, {"loss": 1.563, "grad_norm": 0.5972959399223328, "learning_rate": 0.0002, "epoch": 3.5555555555555554, "step": 1600}, {"loss": 1.4876, "grad_norm": 0.5787310004234314, "learning_rate": 0.0002, "epoch": 3.5777777777777775, "step": 1610}, {"loss": 1.4887, "grad_norm": 0.5846341252326965, "learning_rate": 0.0002, "epoch": 3.6, "step": 1620}, {"loss": 1.542, "grad_norm": 0.5906197428703308, "learning_rate": 0.0002, "epoch": 3.6222222222222222, "step": 1630}, {"loss": 1.4941, "grad_norm": 0.6305760145187378, "learning_rate": 0.0002, "epoch": 3.6444444444444444, "step": 1640}, {"loss": 1.4677, "grad_norm": 0.7448979616165161, "learning_rate": 0.0002, "epoch": 3.6666666666666665, "step": 1650}, {"loss": 1.5961, "grad_norm": 0.5906165242195129, "learning_rate": 0.0002, "epoch": 3.688888888888889, "step": 1660}, {"loss": 1.4882, "grad_norm": 0.605032742023468, "learning_rate": 0.0002, "epoch": 3.7111111111111112, "step": 1670}, {"loss": 1.5804, "grad_norm": 0.6117229461669922, "learning_rate": 0.0002, "epoch": 3.7333333333333334, "step": 1680}, {"loss": 1.5131, "grad_norm": 0.613581120967865, "learning_rate": 0.0002, "epoch": 3.7555555555555555, "step": 1690}, {"loss": 1.5074, "grad_norm": 0.6244436502456665, "learning_rate": 0.0002, "epoch": 3.7777777777777777, "step": 1700}, {"loss": 1.5738, "grad_norm": 0.6236702799797058, "learning_rate": 0.0002, "epoch": 3.8, "step": 1710}, {"loss": 1.6542, "grad_norm": 0.639141857624054, "learning_rate": 0.0002, "epoch": 3.822222222222222, "step": 1720}, {"loss": 1.536, "grad_norm": 0.5782344937324524, "learning_rate": 0.0002, "epoch": 3.8444444444444446, "step": 1730}, {"loss": 1.5355, "grad_norm": 0.5952938795089722, "learning_rate": 0.0002, "epoch": 3.8666666666666667, "step": 1740}, {"loss": 1.5205, "grad_norm": 0.5573042035102844, "learning_rate": 0.0002, "epoch": 3.888888888888889, "step": 1750}, {"loss": 1.5066, "grad_norm": 0.6114351749420166, "learning_rate": 0.0002, "epoch": 3.911111111111111, "step": 1760}, {"loss": 1.5706, "grad_norm": 0.5973817110061646, "learning_rate": 0.0002, "epoch": 3.9333333333333336, "step": 1770}, {"loss": 1.5003, "grad_norm": 0.602317750453949, "learning_rate": 0.0002, "epoch": 3.9555555555555557, "step": 1780}, {"loss": 1.5022, "grad_norm": 0.5965437293052673, "learning_rate": 0.0002, "epoch": 3.977777777777778, "step": 1790}, {"loss": 1.5031, "grad_norm": 0.5641552209854126, "learning_rate": 0.0002, "epoch": 4.0, "step": 1800}]} +{"epoch": 5.0, "step": 2250, "epoch_duration": 486.6091854572296, "total_accumulated_duration": 2429.7914884090424, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5874, "grad_norm": 0.5109436511993408, "learning_rate": 0.0002, "epoch": 0.022222222222222223, "step": 10}, {"loss": 2.2952, "grad_norm": 0.4870035946369171, "learning_rate": 0.0002, "epoch": 0.044444444444444446, "step": 20}, {"loss": 2.0554, "grad_norm": 0.535464882850647, "learning_rate": 0.0002, "epoch": 0.06666666666666667, "step": 30}, {"loss": 2.0067, "grad_norm": 0.49077996611595154, "learning_rate": 0.0002, "epoch": 0.08888888888888889, "step": 40}, {"loss": 2.0673, "grad_norm": 0.4671357572078705, "learning_rate": 0.0002, "epoch": 0.1111111111111111, "step": 50}, {"loss": 1.8751, "grad_norm": 0.4970313608646393, "learning_rate": 0.0002, "epoch": 0.13333333333333333, "step": 60}, {"loss": 1.9419, "grad_norm": 0.4438260495662689, "learning_rate": 0.0002, "epoch": 0.15555555555555556, "step": 70}, {"loss": 1.9856, "grad_norm": 0.5089705586433411, "learning_rate": 0.0002, "epoch": 0.17777777777777778, "step": 80}, {"loss": 1.8805, "grad_norm": 0.4645078182220459, "learning_rate": 0.0002, "epoch": 0.2, "step": 90}, {"loss": 1.807, "grad_norm": 0.46095192432403564, "learning_rate": 0.0002, "epoch": 0.2222222222222222, "step": 100}, {"loss": 1.891, "grad_norm": 0.43338075280189514, "learning_rate": 0.0002, "epoch": 0.24444444444444444, "step": 110}, {"loss": 1.84, "grad_norm": 0.4433900713920593, "learning_rate": 0.0002, "epoch": 0.26666666666666666, "step": 120}, {"loss": 1.8895, "grad_norm": 0.7018499970436096, "learning_rate": 0.0002, "epoch": 0.28888888888888886, "step": 130}, {"loss": 1.8273, "grad_norm": 0.37056994438171387, "learning_rate": 0.0002, "epoch": 0.3111111111111111, "step": 140}, {"loss": 1.8059, "grad_norm": 0.40634623169898987, "learning_rate": 0.0002, "epoch": 0.3333333333333333, "step": 150}, {"loss": 1.8393, "grad_norm": 0.41917353868484497, "learning_rate": 0.0002, "epoch": 0.35555555555555557, "step": 160}, {"loss": 1.8658, "grad_norm": 0.42392489314079285, "learning_rate": 0.0002, "epoch": 0.37777777777777777, "step": 170}, {"loss": 1.8617, "grad_norm": 0.4281010627746582, "learning_rate": 0.0002, "epoch": 0.4, "step": 180}, {"loss": 1.8163, "grad_norm": 0.38542497158050537, "learning_rate": 0.0002, "epoch": 0.4222222222222222, "step": 190}, {"loss": 1.8951, "grad_norm": 0.36003032326698303, "learning_rate": 0.0002, "epoch": 0.4444444444444444, "step": 200}, {"loss": 1.8611, "grad_norm": 0.37858229875564575, "learning_rate": 0.0002, "epoch": 0.4666666666666667, "step": 210}, {"loss": 1.8078, "grad_norm": 0.49986031651496887, "learning_rate": 0.0002, "epoch": 0.4888888888888889, "step": 220}, {"loss": 1.7413, "grad_norm": 0.3937094807624817, "learning_rate": 0.0002, "epoch": 0.5111111111111111, "step": 230}, {"loss": 1.8386, "grad_norm": 0.4566134512424469, "learning_rate": 0.0002, "epoch": 0.5333333333333333, "step": 240}, {"loss": 1.8303, "grad_norm": 0.3602476418018341, "learning_rate": 0.0002, "epoch": 0.5555555555555556, "step": 250}, {"loss": 1.7963, "grad_norm": 0.36321184039115906, "learning_rate": 0.0002, "epoch": 0.5777777777777777, "step": 260}, {"loss": 1.9055, "grad_norm": 0.3808199167251587, "learning_rate": 0.0002, "epoch": 0.6, "step": 270}, {"loss": 1.7916, "grad_norm": 0.38910621404647827, "learning_rate": 0.0002, "epoch": 0.6222222222222222, "step": 280}, {"loss": 1.7958, "grad_norm": 0.31913551688194275, "learning_rate": 0.0002, "epoch": 0.6444444444444445, "step": 290}, {"loss": 1.7468, "grad_norm": 0.34734025597572327, "learning_rate": 0.0002, "epoch": 0.6666666666666666, "step": 300}, {"loss": 1.8092, "grad_norm": 0.3517725467681885, "learning_rate": 0.0002, "epoch": 0.6888888888888889, "step": 310}, {"loss": 1.7847, "grad_norm": 0.3804526627063751, "learning_rate": 0.0002, "epoch": 0.7111111111111111, "step": 320}, {"loss": 1.8131, "grad_norm": 0.5592505931854248, "learning_rate": 0.0002, "epoch": 0.7333333333333333, "step": 330}, {"loss": 1.8827, "grad_norm": 0.36154472827911377, "learning_rate": 0.0002, "epoch": 0.7555555555555555, "step": 340}, {"loss": 1.7764, "grad_norm": 0.43970227241516113, "learning_rate": 0.0002, "epoch": 0.7777777777777778, "step": 350}, {"loss": 1.8333, "grad_norm": 0.3525223731994629, "learning_rate": 0.0002, "epoch": 0.8, "step": 360}, {"loss": 1.7802, "grad_norm": 0.3706997036933899, "learning_rate": 0.0002, "epoch": 0.8222222222222222, "step": 370}, {"loss": 1.7937, "grad_norm": 0.34138166904449463, "learning_rate": 0.0002, "epoch": 0.8444444444444444, "step": 380}, {"loss": 1.8251, "grad_norm": 0.4090622365474701, "learning_rate": 0.0002, "epoch": 0.8666666666666667, "step": 390}, {"loss": 1.7685, "grad_norm": 0.3729974031448364, "learning_rate": 0.0002, "epoch": 0.8888888888888888, "step": 400}, {"loss": 1.836, "grad_norm": 0.3742152452468872, "learning_rate": 0.0002, "epoch": 0.9111111111111111, "step": 410}, {"loss": 1.7998, "grad_norm": 0.37685129046440125, "learning_rate": 0.0002, "epoch": 0.9333333333333333, "step": 420}, {"loss": 1.7601, "grad_norm": 0.34339913725852966, "learning_rate": 0.0002, "epoch": 0.9555555555555556, "step": 430}, {"loss": 1.7651, "grad_norm": 0.31139856576919556, "learning_rate": 0.0002, "epoch": 0.9777777777777777, "step": 440}, {"loss": 1.8471, "grad_norm": 0.3577502965927124, "learning_rate": 0.0002, "epoch": 1.0, "step": 450}, {"eval_loss": 1.8310279846191406, "eval_runtime": 38.8374, "eval_samples_per_second": 13.26, "eval_steps_per_second": 1.674, "epoch": 1.0, "step": 450}, {"loss": 1.7741, "grad_norm": 0.3257788419723511, "learning_rate": 0.0002, "epoch": 1.0222222222222221, "step": 460}, {"loss": 1.8031, "grad_norm": 0.3878970146179199, "learning_rate": 0.0002, "epoch": 1.0444444444444445, "step": 470}, {"loss": 1.701, "grad_norm": 0.364427387714386, "learning_rate": 0.0002, "epoch": 1.0666666666666667, "step": 480}, {"loss": 1.7832, "grad_norm": 0.3374682664871216, "learning_rate": 0.0002, "epoch": 1.0888888888888888, "step": 490}, {"loss": 1.7308, "grad_norm": 0.35822123289108276, "learning_rate": 0.0002, "epoch": 1.1111111111111112, "step": 500}, {"loss": 1.7753, "grad_norm": 0.3748345673084259, "learning_rate": 0.0002, "epoch": 1.1333333333333333, "step": 510}, {"loss": 1.7225, "grad_norm": 0.3422437012195587, "learning_rate": 0.0002, "epoch": 1.1555555555555554, "step": 520}, {"loss": 1.7649, "grad_norm": 0.4289326071739197, "learning_rate": 0.0002, "epoch": 1.1777777777777778, "step": 530}, {"loss": 1.7548, "grad_norm": 0.3706769645214081, "learning_rate": 0.0002, "epoch": 1.2, "step": 540}, {"loss": 1.6988, "grad_norm": 0.4024733603000641, "learning_rate": 0.0002, "epoch": 1.2222222222222223, "step": 550}, {"loss": 1.7641, "grad_norm": 0.3960128128528595, "learning_rate": 0.0002, "epoch": 1.2444444444444445, "step": 560}, {"loss": 1.755, "grad_norm": 0.38222864270210266, "learning_rate": 0.0002, "epoch": 1.2666666666666666, "step": 570}, {"loss": 1.735, "grad_norm": 0.4073713421821594, "learning_rate": 0.0002, "epoch": 1.2888888888888888, "step": 580}, {"loss": 1.7183, "grad_norm": 0.3875499963760376, "learning_rate": 0.0002, "epoch": 1.3111111111111111, "step": 590}, {"loss": 1.7492, "grad_norm": 0.39740806818008423, "learning_rate": 0.0002, "epoch": 1.3333333333333333, "step": 600}, {"loss": 1.8478, "grad_norm": 0.38432490825653076, "learning_rate": 0.0002, "epoch": 1.3555555555555556, "step": 610}, {"loss": 1.7327, "grad_norm": 0.402729868888855, "learning_rate": 0.0002, "epoch": 1.3777777777777778, "step": 620}, {"loss": 1.6634, "grad_norm": 0.36683231592178345, "learning_rate": 0.0002, "epoch": 1.4, "step": 630}, {"loss": 1.8059, "grad_norm": 0.3883286714553833, "learning_rate": 0.0002, "epoch": 1.4222222222222223, "step": 640}, {"loss": 1.7953, "grad_norm": 0.4087409973144531, "learning_rate": 0.0002, "epoch": 1.4444444444444444, "step": 650}, {"loss": 1.7491, "grad_norm": 0.4042017459869385, "learning_rate": 0.0002, "epoch": 1.4666666666666668, "step": 660}, {"loss": 1.7466, "grad_norm": 0.40149256587028503, "learning_rate": 0.0002, "epoch": 1.488888888888889, "step": 670}, {"loss": 1.7398, "grad_norm": 0.45146510004997253, "learning_rate": 0.0002, "epoch": 1.511111111111111, "step": 680}, {"loss": 1.6923, "grad_norm": 0.4098089039325714, "learning_rate": 0.0002, "epoch": 1.5333333333333332, "step": 690}, {"loss": 1.6847, "grad_norm": 0.4181336760520935, "learning_rate": 0.0002, "epoch": 1.5555555555555556, "step": 700}, {"loss": 1.7862, "grad_norm": 1.3722974061965942, "learning_rate": 0.0002, "epoch": 1.5777777777777777, "step": 710}, {"loss": 1.7424, "grad_norm": 0.3965230882167816, "learning_rate": 0.0002, "epoch": 1.6, "step": 720}, {"loss": 1.7782, "grad_norm": 0.3842000663280487, "learning_rate": 0.0002, "epoch": 1.6222222222222222, "step": 730}, {"loss": 1.7682, "grad_norm": 0.3603688180446625, "learning_rate": 0.0002, "epoch": 1.6444444444444444, "step": 740}, {"loss": 1.7476, "grad_norm": 0.39973509311676025, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 750}, {"loss": 1.7329, "grad_norm": 0.3687385618686676, "learning_rate": 0.0002, "epoch": 1.6888888888888889, "step": 760}, {"loss": 1.7987, "grad_norm": 0.4267722964286804, "learning_rate": 0.0002, "epoch": 1.7111111111111112, "step": 770}, {"loss": 1.8041, "grad_norm": 0.41301295161247253, "learning_rate": 0.0002, "epoch": 1.7333333333333334, "step": 780}, {"loss": 1.7284, "grad_norm": 0.3945430517196655, "learning_rate": 0.0002, "epoch": 1.7555555555555555, "step": 790}, {"loss": 1.6691, "grad_norm": 0.4037930965423584, "learning_rate": 0.0002, "epoch": 1.7777777777777777, "step": 800}, {"loss": 1.6874, "grad_norm": 0.406893253326416, "learning_rate": 0.0002, "epoch": 1.8, "step": 810}, {"loss": 1.7628, "grad_norm": 0.4600457549095154, "learning_rate": 0.0002, "epoch": 1.8222222222222222, "step": 820}, {"loss": 1.8222, "grad_norm": 0.4195384085178375, "learning_rate": 0.0002, "epoch": 1.8444444444444446, "step": 830}, {"loss": 1.7123, "grad_norm": 0.3854130506515503, "learning_rate": 0.0002, "epoch": 1.8666666666666667, "step": 840}, {"loss": 1.7227, "grad_norm": 0.38279038667678833, "learning_rate": 0.0002, "epoch": 1.8888888888888888, "step": 850}, {"loss": 1.6942, "grad_norm": 0.38249439001083374, "learning_rate": 0.0002, "epoch": 1.911111111111111, "step": 860}, {"loss": 1.8058, "grad_norm": 0.42977792024612427, "learning_rate": 0.0002, "epoch": 1.9333333333333333, "step": 870}, {"loss": 1.713, "grad_norm": 0.4109351933002472, "learning_rate": 0.0002, "epoch": 1.9555555555555557, "step": 880}, {"loss": 1.7035, "grad_norm": 0.3734486699104309, "learning_rate": 0.0002, "epoch": 1.9777777777777779, "step": 890}, {"loss": 1.7502, "grad_norm": 0.3603087067604065, "learning_rate": 0.0002, "epoch": 2.0, "step": 900}, {"eval_loss": 1.8250652551651, "eval_runtime": 38.8657, "eval_samples_per_second": 13.251, "eval_steps_per_second": 1.672, "epoch": 2.0, "step": 900}, {"loss": 1.6063, "grad_norm": 0.4014144241809845, "learning_rate": 0.0002, "epoch": 2.022222222222222, "step": 910}, {"loss": 1.6604, "grad_norm": 0.4338063597679138, "learning_rate": 0.0002, "epoch": 2.0444444444444443, "step": 920}, {"loss": 1.6354, "grad_norm": 0.3693605065345764, "learning_rate": 0.0002, "epoch": 2.066666666666667, "step": 930}, {"loss": 1.6186, "grad_norm": 0.4040255844593048, "learning_rate": 0.0002, "epoch": 2.088888888888889, "step": 940}, {"loss": 1.5976, "grad_norm": 0.43481820821762085, "learning_rate": 0.0002, "epoch": 2.111111111111111, "step": 950}, {"loss": 1.6081, "grad_norm": 0.41632869839668274, "learning_rate": 0.0002, "epoch": 2.1333333333333333, "step": 960}, {"loss": 1.6586, "grad_norm": 0.4633755385875702, "learning_rate": 0.0002, "epoch": 2.1555555555555554, "step": 970}, {"loss": 1.7396, "grad_norm": 0.43926581740379333, "learning_rate": 0.0002, "epoch": 2.1777777777777776, "step": 980}, {"loss": 1.5909, "grad_norm": 0.4757233262062073, "learning_rate": 0.0002, "epoch": 2.2, "step": 990}, {"loss": 1.648, "grad_norm": 0.5010586977005005, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 1000}, {"loss": 1.6454, "grad_norm": 0.44900986552238464, "learning_rate": 0.0002, "epoch": 2.2444444444444445, "step": 1010}, {"loss": 1.6493, "grad_norm": 0.41274750232696533, "learning_rate": 0.0002, "epoch": 2.2666666666666666, "step": 1020}, {"loss": 1.6592, "grad_norm": 0.44672393798828125, "learning_rate": 0.0002, "epoch": 2.2888888888888888, "step": 1030}, {"loss": 1.6349, "grad_norm": 0.4826269745826721, "learning_rate": 0.0002, "epoch": 2.311111111111111, "step": 1040}, {"loss": 1.5988, "grad_norm": 0.4650685489177704, "learning_rate": 0.0002, "epoch": 2.3333333333333335, "step": 1050}, {"loss": 1.5859, "grad_norm": 0.42507848143577576, "learning_rate": 0.0002, "epoch": 2.3555555555555556, "step": 1060}, {"loss": 1.5932, "grad_norm": 0.45653030276298523, "learning_rate": 0.0002, "epoch": 2.3777777777777778, "step": 1070}, {"loss": 1.6469, "grad_norm": 0.44534122943878174, "learning_rate": 0.0002, "epoch": 2.4, "step": 1080}, {"loss": 1.6115, "grad_norm": 0.4241289794445038, "learning_rate": 0.0002, "epoch": 2.422222222222222, "step": 1090}, {"loss": 1.6935, "grad_norm": 0.5004808306694031, "learning_rate": 0.0002, "epoch": 2.4444444444444446, "step": 1100}, {"loss": 1.6833, "grad_norm": 0.41425490379333496, "learning_rate": 0.0002, "epoch": 2.466666666666667, "step": 1110}, {"loss": 1.6151, "grad_norm": 0.44362279772758484, "learning_rate": 0.0002, "epoch": 2.488888888888889, "step": 1120}, {"loss": 1.6394, "grad_norm": 0.5530985593795776, "learning_rate": 0.0002, "epoch": 2.511111111111111, "step": 1130}, {"loss": 1.64, "grad_norm": 0.4290637969970703, "learning_rate": 0.0002, "epoch": 2.533333333333333, "step": 1140}, {"loss": 1.76, "grad_norm": 0.4957487881183624, "learning_rate": 0.0002, "epoch": 2.5555555555555554, "step": 1150}, {"loss": 1.613, "grad_norm": 0.5082747340202332, "learning_rate": 0.0002, "epoch": 2.5777777777777775, "step": 1160}, {"loss": 1.6702, "grad_norm": 0.478722482919693, "learning_rate": 0.0002, "epoch": 2.6, "step": 1170}, {"loss": 1.6198, "grad_norm": 0.436454176902771, "learning_rate": 0.0002, "epoch": 2.6222222222222222, "step": 1180}, {"loss": 1.663, "grad_norm": 0.4905032515525818, "learning_rate": 0.0002, "epoch": 2.6444444444444444, "step": 1190}, {"loss": 1.6216, "grad_norm": 0.4815700054168701, "learning_rate": 0.0002, "epoch": 2.6666666666666665, "step": 1200}, {"loss": 1.6994, "grad_norm": 0.3965534269809723, "learning_rate": 0.0002, "epoch": 2.688888888888889, "step": 1210}, {"loss": 1.744, "grad_norm": 0.43282169103622437, "learning_rate": 0.0002, "epoch": 2.7111111111111112, "step": 1220}, {"loss": 1.6455, "grad_norm": 0.45512479543685913, "learning_rate": 0.0002, "epoch": 2.7333333333333334, "step": 1230}, {"loss": 1.6444, "grad_norm": 0.44370076060295105, "learning_rate": 0.0002, "epoch": 2.7555555555555555, "step": 1240}, {"loss": 1.7106, "grad_norm": 0.4750686287879944, "learning_rate": 0.0002, "epoch": 2.7777777777777777, "step": 1250}, {"loss": 1.7122, "grad_norm": 0.41953766345977783, "learning_rate": 0.0002, "epoch": 2.8, "step": 1260}, {"loss": 1.6203, "grad_norm": 0.4887140095233917, "learning_rate": 0.0002, "epoch": 2.822222222222222, "step": 1270}, {"loss": 1.6691, "grad_norm": 0.46718958020210266, "learning_rate": 0.0002, "epoch": 2.8444444444444446, "step": 1280}, {"loss": 1.6257, "grad_norm": 0.48510900139808655, "learning_rate": 0.0002, "epoch": 2.8666666666666667, "step": 1290}, {"loss": 1.6825, "grad_norm": 0.4504084289073944, "learning_rate": 0.0002, "epoch": 2.888888888888889, "step": 1300}, {"loss": 1.7453, "grad_norm": 0.42119622230529785, "learning_rate": 0.0002, "epoch": 2.911111111111111, "step": 1310}, {"loss": 1.6662, "grad_norm": 0.4763694107532501, "learning_rate": 0.0002, "epoch": 2.9333333333333336, "step": 1320}, {"loss": 1.6151, "grad_norm": 0.422810822725296, "learning_rate": 0.0002, "epoch": 2.9555555555555557, "step": 1330}, {"loss": 1.6578, "grad_norm": 0.4768871068954468, "learning_rate": 0.0002, "epoch": 2.977777777777778, "step": 1340}, {"loss": 1.6167, "grad_norm": 0.48259881138801575, "learning_rate": 0.0002, "epoch": 3.0, "step": 1350}, {"eval_loss": 1.8452560901641846, "eval_runtime": 38.8621, "eval_samples_per_second": 13.252, "eval_steps_per_second": 1.673, "epoch": 3.0, "step": 1350}, {"loss": 1.5351, "grad_norm": 0.6933313012123108, "learning_rate": 0.0002, "epoch": 3.022222222222222, "step": 1360}, {"loss": 1.5542, "grad_norm": 0.5870710611343384, "learning_rate": 0.0002, "epoch": 3.0444444444444443, "step": 1370}, {"loss": 1.511, "grad_norm": 0.602210283279419, "learning_rate": 0.0002, "epoch": 3.066666666666667, "step": 1380}, {"loss": 1.5272, "grad_norm": 0.6461787819862366, "learning_rate": 0.0002, "epoch": 3.088888888888889, "step": 1390}, {"loss": 1.4813, "grad_norm": 0.5839587450027466, "learning_rate": 0.0002, "epoch": 3.111111111111111, "step": 1400}, {"loss": 1.505, "grad_norm": 0.5757876038551331, "learning_rate": 0.0002, "epoch": 3.1333333333333333, "step": 1410}, {"loss": 1.4963, "grad_norm": 0.5862616300582886, "learning_rate": 0.0002, "epoch": 3.1555555555555554, "step": 1420}, {"loss": 1.5144, "grad_norm": 0.6103630065917969, "learning_rate": 0.0002, "epoch": 3.1777777777777776, "step": 1430}, {"loss": 1.5406, "grad_norm": 0.9309254884719849, "learning_rate": 0.0002, "epoch": 3.2, "step": 1440}, {"loss": 1.487, "grad_norm": 0.5360018014907837, "learning_rate": 0.0002, "epoch": 3.2222222222222223, "step": 1450}, {"loss": 1.5659, "grad_norm": 0.5448758602142334, "learning_rate": 0.0002, "epoch": 3.2444444444444445, "step": 1460}, {"loss": 1.5595, "grad_norm": 0.5973812341690063, "learning_rate": 0.0002, "epoch": 3.2666666666666666, "step": 1470}, {"loss": 1.5223, "grad_norm": 0.6245622038841248, "learning_rate": 0.0002, "epoch": 3.2888888888888888, "step": 1480}, {"loss": 1.4795, "grad_norm": 0.6533768773078918, "learning_rate": 0.0002, "epoch": 3.311111111111111, "step": 1490}, {"loss": 1.5562, "grad_norm": 0.5765811204910278, "learning_rate": 0.0002, "epoch": 3.3333333333333335, "step": 1500}, {"loss": 1.5405, "grad_norm": 0.591395378112793, "learning_rate": 0.0002, "epoch": 3.3555555555555556, "step": 1510}, {"loss": 1.5658, "grad_norm": 0.5842425227165222, "learning_rate": 0.0002, "epoch": 3.3777777777777778, "step": 1520}, {"loss": 1.5065, "grad_norm": 0.5731365084648132, "learning_rate": 0.0002, "epoch": 3.4, "step": 1530}, {"loss": 1.5438, "grad_norm": 0.5841306447982788, "learning_rate": 0.0002, "epoch": 3.422222222222222, "step": 1540}, {"loss": 1.4922, "grad_norm": 0.6503536701202393, "learning_rate": 0.0002, "epoch": 3.4444444444444446, "step": 1550}, {"loss": 1.5493, "grad_norm": 0.6170967221260071, "learning_rate": 0.0002, "epoch": 3.466666666666667, "step": 1560}, {"loss": 1.5098, "grad_norm": 0.5576487183570862, "learning_rate": 0.0002, "epoch": 3.488888888888889, "step": 1570}, {"loss": 1.472, "grad_norm": 0.7082911133766174, "learning_rate": 0.0002, "epoch": 3.511111111111111, "step": 1580}, {"loss": 1.5594, "grad_norm": 0.6159376502037048, "learning_rate": 0.0002, "epoch": 3.533333333333333, "step": 1590}, {"loss": 1.563, "grad_norm": 0.5972959399223328, "learning_rate": 0.0002, "epoch": 3.5555555555555554, "step": 1600}, {"loss": 1.4876, "grad_norm": 0.5787310004234314, "learning_rate": 0.0002, "epoch": 3.5777777777777775, "step": 1610}, {"loss": 1.4887, "grad_norm": 0.5846341252326965, "learning_rate": 0.0002, "epoch": 3.6, "step": 1620}, {"loss": 1.542, "grad_norm": 0.5906197428703308, "learning_rate": 0.0002, "epoch": 3.6222222222222222, "step": 1630}, {"loss": 1.4941, "grad_norm": 0.6305760145187378, "learning_rate": 0.0002, "epoch": 3.6444444444444444, "step": 1640}, {"loss": 1.4677, "grad_norm": 0.7448979616165161, "learning_rate": 0.0002, "epoch": 3.6666666666666665, "step": 1650}, {"loss": 1.5961, "grad_norm": 0.5906165242195129, "learning_rate": 0.0002, "epoch": 3.688888888888889, "step": 1660}, {"loss": 1.4882, "grad_norm": 0.605032742023468, "learning_rate": 0.0002, "epoch": 3.7111111111111112, "step": 1670}, {"loss": 1.5804, "grad_norm": 0.6117229461669922, "learning_rate": 0.0002, "epoch": 3.7333333333333334, "step": 1680}, {"loss": 1.5131, "grad_norm": 0.613581120967865, "learning_rate": 0.0002, "epoch": 3.7555555555555555, "step": 1690}, {"loss": 1.5074, "grad_norm": 0.6244436502456665, "learning_rate": 0.0002, "epoch": 3.7777777777777777, "step": 1700}, {"loss": 1.5738, "grad_norm": 0.6236702799797058, "learning_rate": 0.0002, "epoch": 3.8, "step": 1710}, {"loss": 1.6542, "grad_norm": 0.639141857624054, "learning_rate": 0.0002, "epoch": 3.822222222222222, "step": 1720}, {"loss": 1.536, "grad_norm": 0.5782344937324524, "learning_rate": 0.0002, "epoch": 3.8444444444444446, "step": 1730}, {"loss": 1.5355, "grad_norm": 0.5952938795089722, "learning_rate": 0.0002, "epoch": 3.8666666666666667, "step": 1740}, {"loss": 1.5205, "grad_norm": 0.5573042035102844, "learning_rate": 0.0002, "epoch": 3.888888888888889, "step": 1750}, {"loss": 1.5066, "grad_norm": 0.6114351749420166, "learning_rate": 0.0002, "epoch": 3.911111111111111, "step": 1760}, {"loss": 1.5706, "grad_norm": 0.5973817110061646, "learning_rate": 0.0002, "epoch": 3.9333333333333336, "step": 1770}, {"loss": 1.5003, "grad_norm": 0.602317750453949, "learning_rate": 0.0002, "epoch": 3.9555555555555557, "step": 1780}, {"loss": 1.5022, "grad_norm": 0.5965437293052673, "learning_rate": 0.0002, "epoch": 3.977777777777778, "step": 1790}, {"loss": 1.5031, "grad_norm": 0.5641552209854126, "learning_rate": 0.0002, "epoch": 4.0, "step": 1800}, {"eval_loss": 1.892098069190979, "eval_runtime": 38.8755, "eval_samples_per_second": 13.247, "eval_steps_per_second": 1.672, "epoch": 4.0, "step": 1800}, {"loss": 1.3894, "grad_norm": 0.8302594423294067, "learning_rate": 0.0002, "epoch": 4.022222222222222, "step": 1810}, {"loss": 1.3727, "grad_norm": 0.6695230603218079, "learning_rate": 0.0002, "epoch": 4.044444444444444, "step": 1820}, {"loss": 1.3064, "grad_norm": 0.7911471128463745, "learning_rate": 0.0002, "epoch": 4.066666666666666, "step": 1830}, {"loss": 1.4574, "grad_norm": 0.7044888138771057, "learning_rate": 0.0002, "epoch": 4.088888888888889, "step": 1840}, {"loss": 1.3941, "grad_norm": 0.7057249546051025, "learning_rate": 0.0002, "epoch": 4.111111111111111, "step": 1850}, {"loss": 1.4052, "grad_norm": 0.8762815594673157, "learning_rate": 0.0002, "epoch": 4.133333333333334, "step": 1860}, {"loss": 1.3784, "grad_norm": 0.7619158029556274, "learning_rate": 0.0002, "epoch": 4.155555555555556, "step": 1870}, {"loss": 1.3581, "grad_norm": 0.7711658477783203, "learning_rate": 0.0002, "epoch": 4.177777777777778, "step": 1880}, {"loss": 1.3995, "grad_norm": 0.9732598662376404, "learning_rate": 0.0002, "epoch": 4.2, "step": 1890}, {"loss": 1.3353, "grad_norm": 0.9070265889167786, "learning_rate": 0.0002, "epoch": 4.222222222222222, "step": 1900}, {"loss": 1.3947, "grad_norm": 0.8274767994880676, "learning_rate": 0.0002, "epoch": 4.2444444444444445, "step": 1910}, {"loss": 1.3392, "grad_norm": 0.8514227271080017, "learning_rate": 0.0002, "epoch": 4.266666666666667, "step": 1920}, {"loss": 1.3492, "grad_norm": 0.7356534600257874, "learning_rate": 0.0002, "epoch": 4.288888888888889, "step": 1930}, {"loss": 1.3708, "grad_norm": 0.8226608037948608, "learning_rate": 0.0002, "epoch": 4.311111111111111, "step": 1940}, {"loss": 1.3652, "grad_norm": 0.8347907066345215, "learning_rate": 0.0002, "epoch": 4.333333333333333, "step": 1950}, {"loss": 1.3415, "grad_norm": 0.8509323000907898, "learning_rate": 0.0002, "epoch": 4.355555555555555, "step": 1960}, {"loss": 1.3796, "grad_norm": 0.8776063323020935, "learning_rate": 0.0002, "epoch": 4.377777777777778, "step": 1970}, {"loss": 1.438, "grad_norm": 0.8022271990776062, "learning_rate": 0.0002, "epoch": 4.4, "step": 1980}, {"loss": 1.3671, "grad_norm": 0.7984752058982849, "learning_rate": 0.0002, "epoch": 4.4222222222222225, "step": 1990}, {"loss": 1.4214, "grad_norm": 0.7349720001220703, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 2000}, {"loss": 1.4174, "grad_norm": 0.7778817415237427, "learning_rate": 0.0002, "epoch": 4.466666666666667, "step": 2010}, {"loss": 1.3365, "grad_norm": 0.9361467361450195, "learning_rate": 0.0002, "epoch": 4.488888888888889, "step": 2020}, {"loss": 1.4129, "grad_norm": 0.7839348912239075, "learning_rate": 0.0002, "epoch": 4.511111111111111, "step": 2030}, {"loss": 1.3761, "grad_norm": 0.8361981511116028, "learning_rate": 0.0002, "epoch": 4.533333333333333, "step": 2040}, {"loss": 1.4085, "grad_norm": 1.9877147674560547, "learning_rate": 0.0002, "epoch": 4.555555555555555, "step": 2050}, {"loss": 1.329, "grad_norm": 0.7506140470504761, "learning_rate": 0.0002, "epoch": 4.5777777777777775, "step": 2060}, {"loss": 1.3557, "grad_norm": 0.9493570327758789, "learning_rate": 0.0002, "epoch": 4.6, "step": 2070}, {"loss": 1.438, "grad_norm": 0.7198925018310547, "learning_rate": 0.0002, "epoch": 4.622222222222222, "step": 2080}, {"loss": 1.3892, "grad_norm": 0.7521472573280334, "learning_rate": 0.0002, "epoch": 4.644444444444445, "step": 2090}, {"loss": 1.3833, "grad_norm": 0.766718327999115, "learning_rate": 0.0002, "epoch": 4.666666666666667, "step": 2100}, {"loss": 1.3541, "grad_norm": 0.9162390232086182, "learning_rate": 0.0002, "epoch": 4.688888888888889, "step": 2110}, {"loss": 1.4603, "grad_norm": 0.8980328440666199, "learning_rate": 0.0002, "epoch": 4.711111111111111, "step": 2120}, {"loss": 1.4043, "grad_norm": 0.8109711408615112, "learning_rate": 0.0002, "epoch": 4.733333333333333, "step": 2130}, {"loss": 1.373, "grad_norm": 0.7372606992721558, "learning_rate": 0.0002, "epoch": 4.7555555555555555, "step": 2140}, {"loss": 1.4439, "grad_norm": 0.7527457475662231, "learning_rate": 0.0002, "epoch": 4.777777777777778, "step": 2150}, {"loss": 1.2999, "grad_norm": 1.0380001068115234, "learning_rate": 0.0002, "epoch": 4.8, "step": 2160}, {"loss": 1.3562, "grad_norm": 0.7166368365287781, "learning_rate": 0.0002, "epoch": 4.822222222222222, "step": 2170}, {"loss": 1.3917, "grad_norm": 0.784548282623291, "learning_rate": 0.0002, "epoch": 4.844444444444444, "step": 2180}, {"loss": 1.3376, "grad_norm": 0.7771317958831787, "learning_rate": 0.0002, "epoch": 4.866666666666667, "step": 2190}, {"loss": 1.3315, "grad_norm": 0.7710300087928772, "learning_rate": 0.0002, "epoch": 4.888888888888889, "step": 2200}, {"loss": 1.3676, "grad_norm": 0.7715084552764893, "learning_rate": 0.0002, "epoch": 4.911111111111111, "step": 2210}, {"loss": 1.5352, "grad_norm": 0.7888006567955017, "learning_rate": 0.0002, "epoch": 4.933333333333334, "step": 2220}, {"loss": 1.4139, "grad_norm": 0.800684928894043, "learning_rate": 0.0002, "epoch": 4.955555555555556, "step": 2230}, {"loss": 1.4343, "grad_norm": 0.7710039019584656, "learning_rate": 0.0002, "epoch": 4.977777777777778, "step": 2240}, {"loss": 1.3501, "grad_norm": 0.8617033958435059, "learning_rate": 0.0002, "epoch": 5.0, "step": 2250}]} +{"epoch": 6.0, "step": 2700, "epoch_duration": 489.8840777873993, "total_accumulated_duration": 2919.6755661964417, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5874, "grad_norm": 0.5109436511993408, "learning_rate": 0.0002, "epoch": 0.022222222222222223, "step": 10}, {"loss": 2.2952, "grad_norm": 0.4870035946369171, "learning_rate": 0.0002, "epoch": 0.044444444444444446, "step": 20}, {"loss": 2.0554, "grad_norm": 0.535464882850647, "learning_rate": 0.0002, "epoch": 0.06666666666666667, "step": 30}, {"loss": 2.0067, "grad_norm": 0.49077996611595154, "learning_rate": 0.0002, "epoch": 0.08888888888888889, "step": 40}, {"loss": 2.0673, "grad_norm": 0.4671357572078705, "learning_rate": 0.0002, "epoch": 0.1111111111111111, "step": 50}, {"loss": 1.8751, "grad_norm": 0.4970313608646393, "learning_rate": 0.0002, "epoch": 0.13333333333333333, "step": 60}, {"loss": 1.9419, "grad_norm": 0.4438260495662689, "learning_rate": 0.0002, "epoch": 0.15555555555555556, "step": 70}, {"loss": 1.9856, "grad_norm": 0.5089705586433411, "learning_rate": 0.0002, "epoch": 0.17777777777777778, "step": 80}, {"loss": 1.8805, "grad_norm": 0.4645078182220459, "learning_rate": 0.0002, "epoch": 0.2, "step": 90}, {"loss": 1.807, "grad_norm": 0.46095192432403564, "learning_rate": 0.0002, "epoch": 0.2222222222222222, "step": 100}, {"loss": 1.891, "grad_norm": 0.43338075280189514, "learning_rate": 0.0002, "epoch": 0.24444444444444444, "step": 110}, {"loss": 1.84, "grad_norm": 0.4433900713920593, "learning_rate": 0.0002, "epoch": 0.26666666666666666, "step": 120}, {"loss": 1.8895, "grad_norm": 0.7018499970436096, "learning_rate": 0.0002, "epoch": 0.28888888888888886, "step": 130}, {"loss": 1.8273, "grad_norm": 0.37056994438171387, "learning_rate": 0.0002, "epoch": 0.3111111111111111, "step": 140}, {"loss": 1.8059, "grad_norm": 0.40634623169898987, "learning_rate": 0.0002, "epoch": 0.3333333333333333, "step": 150}, {"loss": 1.8393, "grad_norm": 0.41917353868484497, "learning_rate": 0.0002, "epoch": 0.35555555555555557, "step": 160}, {"loss": 1.8658, "grad_norm": 0.42392489314079285, "learning_rate": 0.0002, "epoch": 0.37777777777777777, "step": 170}, {"loss": 1.8617, "grad_norm": 0.4281010627746582, "learning_rate": 0.0002, "epoch": 0.4, "step": 180}, {"loss": 1.8163, "grad_norm": 0.38542497158050537, "learning_rate": 0.0002, "epoch": 0.4222222222222222, "step": 190}, {"loss": 1.8951, "grad_norm": 0.36003032326698303, "learning_rate": 0.0002, "epoch": 0.4444444444444444, "step": 200}, {"loss": 1.8611, "grad_norm": 0.37858229875564575, "learning_rate": 0.0002, "epoch": 0.4666666666666667, "step": 210}, {"loss": 1.8078, "grad_norm": 0.49986031651496887, "learning_rate": 0.0002, "epoch": 0.4888888888888889, "step": 220}, {"loss": 1.7413, "grad_norm": 0.3937094807624817, "learning_rate": 0.0002, "epoch": 0.5111111111111111, "step": 230}, {"loss": 1.8386, "grad_norm": 0.4566134512424469, "learning_rate": 0.0002, "epoch": 0.5333333333333333, "step": 240}, {"loss": 1.8303, "grad_norm": 0.3602476418018341, "learning_rate": 0.0002, "epoch": 0.5555555555555556, "step": 250}, {"loss": 1.7963, "grad_norm": 0.36321184039115906, "learning_rate": 0.0002, "epoch": 0.5777777777777777, "step": 260}, {"loss": 1.9055, "grad_norm": 0.3808199167251587, "learning_rate": 0.0002, "epoch": 0.6, "step": 270}, {"loss": 1.7916, "grad_norm": 0.38910621404647827, "learning_rate": 0.0002, "epoch": 0.6222222222222222, "step": 280}, {"loss": 1.7958, "grad_norm": 0.31913551688194275, "learning_rate": 0.0002, "epoch": 0.6444444444444445, "step": 290}, {"loss": 1.7468, "grad_norm": 0.34734025597572327, "learning_rate": 0.0002, "epoch": 0.6666666666666666, "step": 300}, {"loss": 1.8092, "grad_norm": 0.3517725467681885, "learning_rate": 0.0002, "epoch": 0.6888888888888889, "step": 310}, {"loss": 1.7847, "grad_norm": 0.3804526627063751, "learning_rate": 0.0002, "epoch": 0.7111111111111111, "step": 320}, {"loss": 1.8131, "grad_norm": 0.5592505931854248, "learning_rate": 0.0002, "epoch": 0.7333333333333333, "step": 330}, {"loss": 1.8827, "grad_norm": 0.36154472827911377, "learning_rate": 0.0002, "epoch": 0.7555555555555555, "step": 340}, {"loss": 1.7764, "grad_norm": 0.43970227241516113, "learning_rate": 0.0002, "epoch": 0.7777777777777778, "step": 350}, {"loss": 1.8333, "grad_norm": 0.3525223731994629, "learning_rate": 0.0002, "epoch": 0.8, "step": 360}, {"loss": 1.7802, "grad_norm": 0.3706997036933899, "learning_rate": 0.0002, "epoch": 0.8222222222222222, "step": 370}, {"loss": 1.7937, "grad_norm": 0.34138166904449463, "learning_rate": 0.0002, "epoch": 0.8444444444444444, "step": 380}, {"loss": 1.8251, "grad_norm": 0.4090622365474701, "learning_rate": 0.0002, "epoch": 0.8666666666666667, "step": 390}, {"loss": 1.7685, "grad_norm": 0.3729974031448364, "learning_rate": 0.0002, "epoch": 0.8888888888888888, "step": 400}, {"loss": 1.836, "grad_norm": 0.3742152452468872, "learning_rate": 0.0002, "epoch": 0.9111111111111111, "step": 410}, {"loss": 1.7998, "grad_norm": 0.37685129046440125, "learning_rate": 0.0002, "epoch": 0.9333333333333333, "step": 420}, {"loss": 1.7601, "grad_norm": 0.34339913725852966, "learning_rate": 0.0002, "epoch": 0.9555555555555556, "step": 430}, {"loss": 1.7651, "grad_norm": 0.31139856576919556, "learning_rate": 0.0002, "epoch": 0.9777777777777777, "step": 440}, {"loss": 1.8471, "grad_norm": 0.3577502965927124, "learning_rate": 0.0002, "epoch": 1.0, "step": 450}, {"eval_loss": 1.8310279846191406, "eval_runtime": 38.8374, "eval_samples_per_second": 13.26, "eval_steps_per_second": 1.674, "epoch": 1.0, "step": 450}, {"loss": 1.7741, "grad_norm": 0.3257788419723511, "learning_rate": 0.0002, "epoch": 1.0222222222222221, "step": 460}, {"loss": 1.8031, "grad_norm": 0.3878970146179199, "learning_rate": 0.0002, "epoch": 1.0444444444444445, "step": 470}, {"loss": 1.701, "grad_norm": 0.364427387714386, "learning_rate": 0.0002, "epoch": 1.0666666666666667, "step": 480}, {"loss": 1.7832, "grad_norm": 0.3374682664871216, "learning_rate": 0.0002, "epoch": 1.0888888888888888, "step": 490}, {"loss": 1.7308, "grad_norm": 0.35822123289108276, "learning_rate": 0.0002, "epoch": 1.1111111111111112, "step": 500}, {"loss": 1.7753, "grad_norm": 0.3748345673084259, "learning_rate": 0.0002, "epoch": 1.1333333333333333, "step": 510}, {"loss": 1.7225, "grad_norm": 0.3422437012195587, "learning_rate": 0.0002, "epoch": 1.1555555555555554, "step": 520}, {"loss": 1.7649, "grad_norm": 0.4289326071739197, "learning_rate": 0.0002, "epoch": 1.1777777777777778, "step": 530}, {"loss": 1.7548, "grad_norm": 0.3706769645214081, "learning_rate": 0.0002, "epoch": 1.2, "step": 540}, {"loss": 1.6988, "grad_norm": 0.4024733603000641, "learning_rate": 0.0002, "epoch": 1.2222222222222223, "step": 550}, {"loss": 1.7641, "grad_norm": 0.3960128128528595, "learning_rate": 0.0002, "epoch": 1.2444444444444445, "step": 560}, {"loss": 1.755, "grad_norm": 0.38222864270210266, "learning_rate": 0.0002, "epoch": 1.2666666666666666, "step": 570}, {"loss": 1.735, "grad_norm": 0.4073713421821594, "learning_rate": 0.0002, "epoch": 1.2888888888888888, "step": 580}, {"loss": 1.7183, "grad_norm": 0.3875499963760376, "learning_rate": 0.0002, "epoch": 1.3111111111111111, "step": 590}, {"loss": 1.7492, "grad_norm": 0.39740806818008423, "learning_rate": 0.0002, "epoch": 1.3333333333333333, "step": 600}, {"loss": 1.8478, "grad_norm": 0.38432490825653076, "learning_rate": 0.0002, "epoch": 1.3555555555555556, "step": 610}, {"loss": 1.7327, "grad_norm": 0.402729868888855, "learning_rate": 0.0002, "epoch": 1.3777777777777778, "step": 620}, {"loss": 1.6634, "grad_norm": 0.36683231592178345, "learning_rate": 0.0002, "epoch": 1.4, "step": 630}, {"loss": 1.8059, "grad_norm": 0.3883286714553833, "learning_rate": 0.0002, "epoch": 1.4222222222222223, "step": 640}, {"loss": 1.7953, "grad_norm": 0.4087409973144531, "learning_rate": 0.0002, "epoch": 1.4444444444444444, "step": 650}, {"loss": 1.7491, "grad_norm": 0.4042017459869385, "learning_rate": 0.0002, "epoch": 1.4666666666666668, "step": 660}, {"loss": 1.7466, "grad_norm": 0.40149256587028503, "learning_rate": 0.0002, "epoch": 1.488888888888889, "step": 670}, {"loss": 1.7398, "grad_norm": 0.45146510004997253, "learning_rate": 0.0002, "epoch": 1.511111111111111, "step": 680}, {"loss": 1.6923, "grad_norm": 0.4098089039325714, "learning_rate": 0.0002, "epoch": 1.5333333333333332, "step": 690}, {"loss": 1.6847, "grad_norm": 0.4181336760520935, "learning_rate": 0.0002, "epoch": 1.5555555555555556, "step": 700}, {"loss": 1.7862, "grad_norm": 1.3722974061965942, "learning_rate": 0.0002, "epoch": 1.5777777777777777, "step": 710}, {"loss": 1.7424, "grad_norm": 0.3965230882167816, "learning_rate": 0.0002, "epoch": 1.6, "step": 720}, {"loss": 1.7782, "grad_norm": 0.3842000663280487, "learning_rate": 0.0002, "epoch": 1.6222222222222222, "step": 730}, {"loss": 1.7682, "grad_norm": 0.3603688180446625, "learning_rate": 0.0002, "epoch": 1.6444444444444444, "step": 740}, {"loss": 1.7476, "grad_norm": 0.39973509311676025, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 750}, {"loss": 1.7329, "grad_norm": 0.3687385618686676, "learning_rate": 0.0002, "epoch": 1.6888888888888889, "step": 760}, {"loss": 1.7987, "grad_norm": 0.4267722964286804, "learning_rate": 0.0002, "epoch": 1.7111111111111112, "step": 770}, {"loss": 1.8041, "grad_norm": 0.41301295161247253, "learning_rate": 0.0002, "epoch": 1.7333333333333334, "step": 780}, {"loss": 1.7284, "grad_norm": 0.3945430517196655, "learning_rate": 0.0002, "epoch": 1.7555555555555555, "step": 790}, {"loss": 1.6691, "grad_norm": 0.4037930965423584, "learning_rate": 0.0002, "epoch": 1.7777777777777777, "step": 800}, {"loss": 1.6874, "grad_norm": 0.406893253326416, "learning_rate": 0.0002, "epoch": 1.8, "step": 810}, {"loss": 1.7628, "grad_norm": 0.4600457549095154, "learning_rate": 0.0002, "epoch": 1.8222222222222222, "step": 820}, {"loss": 1.8222, "grad_norm": 0.4195384085178375, "learning_rate": 0.0002, "epoch": 1.8444444444444446, "step": 830}, {"loss": 1.7123, "grad_norm": 0.3854130506515503, "learning_rate": 0.0002, "epoch": 1.8666666666666667, "step": 840}, {"loss": 1.7227, "grad_norm": 0.38279038667678833, "learning_rate": 0.0002, "epoch": 1.8888888888888888, "step": 850}, {"loss": 1.6942, "grad_norm": 0.38249439001083374, "learning_rate": 0.0002, "epoch": 1.911111111111111, "step": 860}, {"loss": 1.8058, "grad_norm": 0.42977792024612427, "learning_rate": 0.0002, "epoch": 1.9333333333333333, "step": 870}, {"loss": 1.713, "grad_norm": 0.4109351933002472, "learning_rate": 0.0002, "epoch": 1.9555555555555557, "step": 880}, {"loss": 1.7035, "grad_norm": 0.3734486699104309, "learning_rate": 0.0002, "epoch": 1.9777777777777779, "step": 890}, {"loss": 1.7502, "grad_norm": 0.3603087067604065, "learning_rate": 0.0002, "epoch": 2.0, "step": 900}, {"eval_loss": 1.8250652551651, "eval_runtime": 38.8657, "eval_samples_per_second": 13.251, "eval_steps_per_second": 1.672, "epoch": 2.0, "step": 900}, {"loss": 1.6063, "grad_norm": 0.4014144241809845, "learning_rate": 0.0002, "epoch": 2.022222222222222, "step": 910}, {"loss": 1.6604, "grad_norm": 0.4338063597679138, "learning_rate": 0.0002, "epoch": 2.0444444444444443, "step": 920}, {"loss": 1.6354, "grad_norm": 0.3693605065345764, "learning_rate": 0.0002, "epoch": 2.066666666666667, "step": 930}, {"loss": 1.6186, "grad_norm": 0.4040255844593048, "learning_rate": 0.0002, "epoch": 2.088888888888889, "step": 940}, {"loss": 1.5976, "grad_norm": 0.43481820821762085, "learning_rate": 0.0002, "epoch": 2.111111111111111, "step": 950}, {"loss": 1.6081, "grad_norm": 0.41632869839668274, "learning_rate": 0.0002, "epoch": 2.1333333333333333, "step": 960}, {"loss": 1.6586, "grad_norm": 0.4633755385875702, "learning_rate": 0.0002, "epoch": 2.1555555555555554, "step": 970}, {"loss": 1.7396, "grad_norm": 0.43926581740379333, "learning_rate": 0.0002, "epoch": 2.1777777777777776, "step": 980}, {"loss": 1.5909, "grad_norm": 0.4757233262062073, "learning_rate": 0.0002, "epoch": 2.2, "step": 990}, {"loss": 1.648, "grad_norm": 0.5010586977005005, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 1000}, {"loss": 1.6454, "grad_norm": 0.44900986552238464, "learning_rate": 0.0002, "epoch": 2.2444444444444445, "step": 1010}, {"loss": 1.6493, "grad_norm": 0.41274750232696533, "learning_rate": 0.0002, "epoch": 2.2666666666666666, "step": 1020}, {"loss": 1.6592, "grad_norm": 0.44672393798828125, "learning_rate": 0.0002, "epoch": 2.2888888888888888, "step": 1030}, {"loss": 1.6349, "grad_norm": 0.4826269745826721, "learning_rate": 0.0002, "epoch": 2.311111111111111, "step": 1040}, {"loss": 1.5988, "grad_norm": 0.4650685489177704, "learning_rate": 0.0002, "epoch": 2.3333333333333335, "step": 1050}, {"loss": 1.5859, "grad_norm": 0.42507848143577576, "learning_rate": 0.0002, "epoch": 2.3555555555555556, "step": 1060}, {"loss": 1.5932, "grad_norm": 0.45653030276298523, "learning_rate": 0.0002, "epoch": 2.3777777777777778, "step": 1070}, {"loss": 1.6469, "grad_norm": 0.44534122943878174, "learning_rate": 0.0002, "epoch": 2.4, "step": 1080}, {"loss": 1.6115, "grad_norm": 0.4241289794445038, "learning_rate": 0.0002, "epoch": 2.422222222222222, "step": 1090}, {"loss": 1.6935, "grad_norm": 0.5004808306694031, "learning_rate": 0.0002, "epoch": 2.4444444444444446, "step": 1100}, {"loss": 1.6833, "grad_norm": 0.41425490379333496, "learning_rate": 0.0002, "epoch": 2.466666666666667, "step": 1110}, {"loss": 1.6151, "grad_norm": 0.44362279772758484, "learning_rate": 0.0002, "epoch": 2.488888888888889, "step": 1120}, {"loss": 1.6394, "grad_norm": 0.5530985593795776, "learning_rate": 0.0002, "epoch": 2.511111111111111, "step": 1130}, {"loss": 1.64, "grad_norm": 0.4290637969970703, "learning_rate": 0.0002, "epoch": 2.533333333333333, "step": 1140}, {"loss": 1.76, "grad_norm": 0.4957487881183624, "learning_rate": 0.0002, "epoch": 2.5555555555555554, "step": 1150}, {"loss": 1.613, "grad_norm": 0.5082747340202332, "learning_rate": 0.0002, "epoch": 2.5777777777777775, "step": 1160}, {"loss": 1.6702, "grad_norm": 0.478722482919693, "learning_rate": 0.0002, "epoch": 2.6, "step": 1170}, {"loss": 1.6198, "grad_norm": 0.436454176902771, "learning_rate": 0.0002, "epoch": 2.6222222222222222, "step": 1180}, {"loss": 1.663, "grad_norm": 0.4905032515525818, "learning_rate": 0.0002, "epoch": 2.6444444444444444, "step": 1190}, {"loss": 1.6216, "grad_norm": 0.4815700054168701, "learning_rate": 0.0002, "epoch": 2.6666666666666665, "step": 1200}, {"loss": 1.6994, "grad_norm": 0.3965534269809723, "learning_rate": 0.0002, "epoch": 2.688888888888889, "step": 1210}, {"loss": 1.744, "grad_norm": 0.43282169103622437, "learning_rate": 0.0002, "epoch": 2.7111111111111112, "step": 1220}, {"loss": 1.6455, "grad_norm": 0.45512479543685913, "learning_rate": 0.0002, "epoch": 2.7333333333333334, "step": 1230}, {"loss": 1.6444, "grad_norm": 0.44370076060295105, "learning_rate": 0.0002, "epoch": 2.7555555555555555, "step": 1240}, {"loss": 1.7106, "grad_norm": 0.4750686287879944, "learning_rate": 0.0002, "epoch": 2.7777777777777777, "step": 1250}, {"loss": 1.7122, "grad_norm": 0.41953766345977783, "learning_rate": 0.0002, "epoch": 2.8, "step": 1260}, {"loss": 1.6203, "grad_norm": 0.4887140095233917, "learning_rate": 0.0002, "epoch": 2.822222222222222, "step": 1270}, {"loss": 1.6691, "grad_norm": 0.46718958020210266, "learning_rate": 0.0002, "epoch": 2.8444444444444446, "step": 1280}, {"loss": 1.6257, "grad_norm": 0.48510900139808655, "learning_rate": 0.0002, "epoch": 2.8666666666666667, "step": 1290}, {"loss": 1.6825, "grad_norm": 0.4504084289073944, "learning_rate": 0.0002, "epoch": 2.888888888888889, "step": 1300}, {"loss": 1.7453, "grad_norm": 0.42119622230529785, "learning_rate": 0.0002, "epoch": 2.911111111111111, "step": 1310}, {"loss": 1.6662, "grad_norm": 0.4763694107532501, "learning_rate": 0.0002, "epoch": 2.9333333333333336, "step": 1320}, {"loss": 1.6151, "grad_norm": 0.422810822725296, "learning_rate": 0.0002, "epoch": 2.9555555555555557, "step": 1330}, {"loss": 1.6578, "grad_norm": 0.4768871068954468, "learning_rate": 0.0002, "epoch": 2.977777777777778, "step": 1340}, {"loss": 1.6167, "grad_norm": 0.48259881138801575, "learning_rate": 0.0002, "epoch": 3.0, "step": 1350}, {"eval_loss": 1.8452560901641846, "eval_runtime": 38.8621, "eval_samples_per_second": 13.252, "eval_steps_per_second": 1.673, "epoch": 3.0, "step": 1350}, {"loss": 1.5351, "grad_norm": 0.6933313012123108, "learning_rate": 0.0002, "epoch": 3.022222222222222, "step": 1360}, {"loss": 1.5542, "grad_norm": 0.5870710611343384, "learning_rate": 0.0002, "epoch": 3.0444444444444443, "step": 1370}, {"loss": 1.511, "grad_norm": 0.602210283279419, "learning_rate": 0.0002, "epoch": 3.066666666666667, "step": 1380}, {"loss": 1.5272, "grad_norm": 0.6461787819862366, "learning_rate": 0.0002, "epoch": 3.088888888888889, "step": 1390}, {"loss": 1.4813, "grad_norm": 0.5839587450027466, "learning_rate": 0.0002, "epoch": 3.111111111111111, "step": 1400}, {"loss": 1.505, "grad_norm": 0.5757876038551331, "learning_rate": 0.0002, "epoch": 3.1333333333333333, "step": 1410}, {"loss": 1.4963, "grad_norm": 0.5862616300582886, "learning_rate": 0.0002, "epoch": 3.1555555555555554, "step": 1420}, {"loss": 1.5144, "grad_norm": 0.6103630065917969, "learning_rate": 0.0002, "epoch": 3.1777777777777776, "step": 1430}, {"loss": 1.5406, "grad_norm": 0.9309254884719849, "learning_rate": 0.0002, "epoch": 3.2, "step": 1440}, {"loss": 1.487, "grad_norm": 0.5360018014907837, "learning_rate": 0.0002, "epoch": 3.2222222222222223, "step": 1450}, {"loss": 1.5659, "grad_norm": 0.5448758602142334, "learning_rate": 0.0002, "epoch": 3.2444444444444445, "step": 1460}, {"loss": 1.5595, "grad_norm": 0.5973812341690063, "learning_rate": 0.0002, "epoch": 3.2666666666666666, "step": 1470}, {"loss": 1.5223, "grad_norm": 0.6245622038841248, "learning_rate": 0.0002, "epoch": 3.2888888888888888, "step": 1480}, {"loss": 1.4795, "grad_norm": 0.6533768773078918, "learning_rate": 0.0002, "epoch": 3.311111111111111, "step": 1490}, {"loss": 1.5562, "grad_norm": 0.5765811204910278, "learning_rate": 0.0002, "epoch": 3.3333333333333335, "step": 1500}, {"loss": 1.5405, "grad_norm": 0.591395378112793, "learning_rate": 0.0002, "epoch": 3.3555555555555556, "step": 1510}, {"loss": 1.5658, "grad_norm": 0.5842425227165222, "learning_rate": 0.0002, "epoch": 3.3777777777777778, "step": 1520}, {"loss": 1.5065, "grad_norm": 0.5731365084648132, "learning_rate": 0.0002, "epoch": 3.4, "step": 1530}, {"loss": 1.5438, "grad_norm": 0.5841306447982788, "learning_rate": 0.0002, "epoch": 3.422222222222222, "step": 1540}, {"loss": 1.4922, "grad_norm": 0.6503536701202393, "learning_rate": 0.0002, "epoch": 3.4444444444444446, "step": 1550}, {"loss": 1.5493, "grad_norm": 0.6170967221260071, "learning_rate": 0.0002, "epoch": 3.466666666666667, "step": 1560}, {"loss": 1.5098, "grad_norm": 0.5576487183570862, "learning_rate": 0.0002, "epoch": 3.488888888888889, "step": 1570}, {"loss": 1.472, "grad_norm": 0.7082911133766174, "learning_rate": 0.0002, "epoch": 3.511111111111111, "step": 1580}, {"loss": 1.5594, "grad_norm": 0.6159376502037048, "learning_rate": 0.0002, "epoch": 3.533333333333333, "step": 1590}, {"loss": 1.563, "grad_norm": 0.5972959399223328, "learning_rate": 0.0002, "epoch": 3.5555555555555554, "step": 1600}, {"loss": 1.4876, "grad_norm": 0.5787310004234314, "learning_rate": 0.0002, "epoch": 3.5777777777777775, "step": 1610}, {"loss": 1.4887, "grad_norm": 0.5846341252326965, "learning_rate": 0.0002, "epoch": 3.6, "step": 1620}, {"loss": 1.542, "grad_norm": 0.5906197428703308, "learning_rate": 0.0002, "epoch": 3.6222222222222222, "step": 1630}, {"loss": 1.4941, "grad_norm": 0.6305760145187378, "learning_rate": 0.0002, "epoch": 3.6444444444444444, "step": 1640}, {"loss": 1.4677, "grad_norm": 0.7448979616165161, "learning_rate": 0.0002, "epoch": 3.6666666666666665, "step": 1650}, {"loss": 1.5961, "grad_norm": 0.5906165242195129, "learning_rate": 0.0002, "epoch": 3.688888888888889, "step": 1660}, {"loss": 1.4882, "grad_norm": 0.605032742023468, "learning_rate": 0.0002, "epoch": 3.7111111111111112, "step": 1670}, {"loss": 1.5804, "grad_norm": 0.6117229461669922, "learning_rate": 0.0002, "epoch": 3.7333333333333334, "step": 1680}, {"loss": 1.5131, "grad_norm": 0.613581120967865, "learning_rate": 0.0002, "epoch": 3.7555555555555555, "step": 1690}, {"loss": 1.5074, "grad_norm": 0.6244436502456665, "learning_rate": 0.0002, "epoch": 3.7777777777777777, "step": 1700}, {"loss": 1.5738, "grad_norm": 0.6236702799797058, "learning_rate": 0.0002, "epoch": 3.8, "step": 1710}, {"loss": 1.6542, "grad_norm": 0.639141857624054, "learning_rate": 0.0002, "epoch": 3.822222222222222, "step": 1720}, {"loss": 1.536, "grad_norm": 0.5782344937324524, "learning_rate": 0.0002, "epoch": 3.8444444444444446, "step": 1730}, {"loss": 1.5355, "grad_norm": 0.5952938795089722, "learning_rate": 0.0002, "epoch": 3.8666666666666667, "step": 1740}, {"loss": 1.5205, "grad_norm": 0.5573042035102844, "learning_rate": 0.0002, "epoch": 3.888888888888889, "step": 1750}, {"loss": 1.5066, "grad_norm": 0.6114351749420166, "learning_rate": 0.0002, "epoch": 3.911111111111111, "step": 1760}, {"loss": 1.5706, "grad_norm": 0.5973817110061646, "learning_rate": 0.0002, "epoch": 3.9333333333333336, "step": 1770}, {"loss": 1.5003, "grad_norm": 0.602317750453949, "learning_rate": 0.0002, "epoch": 3.9555555555555557, "step": 1780}, {"loss": 1.5022, "grad_norm": 0.5965437293052673, "learning_rate": 0.0002, "epoch": 3.977777777777778, "step": 1790}, {"loss": 1.5031, "grad_norm": 0.5641552209854126, "learning_rate": 0.0002, "epoch": 4.0, "step": 1800}, {"eval_loss": 1.892098069190979, "eval_runtime": 38.8755, "eval_samples_per_second": 13.247, "eval_steps_per_second": 1.672, "epoch": 4.0, "step": 1800}, {"loss": 1.3894, "grad_norm": 0.8302594423294067, "learning_rate": 0.0002, "epoch": 4.022222222222222, "step": 1810}, {"loss": 1.3727, "grad_norm": 0.6695230603218079, "learning_rate": 0.0002, "epoch": 4.044444444444444, "step": 1820}, {"loss": 1.3064, "grad_norm": 0.7911471128463745, "learning_rate": 0.0002, "epoch": 4.066666666666666, "step": 1830}, {"loss": 1.4574, "grad_norm": 0.7044888138771057, "learning_rate": 0.0002, "epoch": 4.088888888888889, "step": 1840}, {"loss": 1.3941, "grad_norm": 0.7057249546051025, "learning_rate": 0.0002, "epoch": 4.111111111111111, "step": 1850}, {"loss": 1.4052, "grad_norm": 0.8762815594673157, "learning_rate": 0.0002, "epoch": 4.133333333333334, "step": 1860}, {"loss": 1.3784, "grad_norm": 0.7619158029556274, "learning_rate": 0.0002, "epoch": 4.155555555555556, "step": 1870}, {"loss": 1.3581, "grad_norm": 0.7711658477783203, "learning_rate": 0.0002, "epoch": 4.177777777777778, "step": 1880}, {"loss": 1.3995, "grad_norm": 0.9732598662376404, "learning_rate": 0.0002, "epoch": 4.2, "step": 1890}, {"loss": 1.3353, "grad_norm": 0.9070265889167786, "learning_rate": 0.0002, "epoch": 4.222222222222222, "step": 1900}, {"loss": 1.3947, "grad_norm": 0.8274767994880676, "learning_rate": 0.0002, "epoch": 4.2444444444444445, "step": 1910}, {"loss": 1.3392, "grad_norm": 0.8514227271080017, "learning_rate": 0.0002, "epoch": 4.266666666666667, "step": 1920}, {"loss": 1.3492, "grad_norm": 0.7356534600257874, "learning_rate": 0.0002, "epoch": 4.288888888888889, "step": 1930}, {"loss": 1.3708, "grad_norm": 0.8226608037948608, "learning_rate": 0.0002, "epoch": 4.311111111111111, "step": 1940}, {"loss": 1.3652, "grad_norm": 0.8347907066345215, "learning_rate": 0.0002, "epoch": 4.333333333333333, "step": 1950}, {"loss": 1.3415, "grad_norm": 0.8509323000907898, "learning_rate": 0.0002, "epoch": 4.355555555555555, "step": 1960}, {"loss": 1.3796, "grad_norm": 0.8776063323020935, "learning_rate": 0.0002, "epoch": 4.377777777777778, "step": 1970}, {"loss": 1.438, "grad_norm": 0.8022271990776062, "learning_rate": 0.0002, "epoch": 4.4, "step": 1980}, {"loss": 1.3671, "grad_norm": 0.7984752058982849, "learning_rate": 0.0002, "epoch": 4.4222222222222225, "step": 1990}, {"loss": 1.4214, "grad_norm": 0.7349720001220703, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 2000}, {"loss": 1.4174, "grad_norm": 0.7778817415237427, "learning_rate": 0.0002, "epoch": 4.466666666666667, "step": 2010}, {"loss": 1.3365, "grad_norm": 0.9361467361450195, "learning_rate": 0.0002, "epoch": 4.488888888888889, "step": 2020}, {"loss": 1.4129, "grad_norm": 0.7839348912239075, "learning_rate": 0.0002, "epoch": 4.511111111111111, "step": 2030}, {"loss": 1.3761, "grad_norm": 0.8361981511116028, "learning_rate": 0.0002, "epoch": 4.533333333333333, "step": 2040}, {"loss": 1.4085, "grad_norm": 1.9877147674560547, "learning_rate": 0.0002, "epoch": 4.555555555555555, "step": 2050}, {"loss": 1.329, "grad_norm": 0.7506140470504761, "learning_rate": 0.0002, "epoch": 4.5777777777777775, "step": 2060}, {"loss": 1.3557, "grad_norm": 0.9493570327758789, "learning_rate": 0.0002, "epoch": 4.6, "step": 2070}, {"loss": 1.438, "grad_norm": 0.7198925018310547, "learning_rate": 0.0002, "epoch": 4.622222222222222, "step": 2080}, {"loss": 1.3892, "grad_norm": 0.7521472573280334, "learning_rate": 0.0002, "epoch": 4.644444444444445, "step": 2090}, {"loss": 1.3833, "grad_norm": 0.766718327999115, "learning_rate": 0.0002, "epoch": 4.666666666666667, "step": 2100}, {"loss": 1.3541, "grad_norm": 0.9162390232086182, "learning_rate": 0.0002, "epoch": 4.688888888888889, "step": 2110}, {"loss": 1.4603, "grad_norm": 0.8980328440666199, "learning_rate": 0.0002, "epoch": 4.711111111111111, "step": 2120}, {"loss": 1.4043, "grad_norm": 0.8109711408615112, "learning_rate": 0.0002, "epoch": 4.733333333333333, "step": 2130}, {"loss": 1.373, "grad_norm": 0.7372606992721558, "learning_rate": 0.0002, "epoch": 4.7555555555555555, "step": 2140}, {"loss": 1.4439, "grad_norm": 0.7527457475662231, "learning_rate": 0.0002, "epoch": 4.777777777777778, "step": 2150}, {"loss": 1.2999, "grad_norm": 1.0380001068115234, "learning_rate": 0.0002, "epoch": 4.8, "step": 2160}, {"loss": 1.3562, "grad_norm": 0.7166368365287781, "learning_rate": 0.0002, "epoch": 4.822222222222222, "step": 2170}, {"loss": 1.3917, "grad_norm": 0.784548282623291, "learning_rate": 0.0002, "epoch": 4.844444444444444, "step": 2180}, {"loss": 1.3376, "grad_norm": 0.7771317958831787, "learning_rate": 0.0002, "epoch": 4.866666666666667, "step": 2190}, {"loss": 1.3315, "grad_norm": 0.7710300087928772, "learning_rate": 0.0002, "epoch": 4.888888888888889, "step": 2200}, {"loss": 1.3676, "grad_norm": 0.7715084552764893, "learning_rate": 0.0002, "epoch": 4.911111111111111, "step": 2210}, {"loss": 1.5352, "grad_norm": 0.7888006567955017, "learning_rate": 0.0002, "epoch": 4.933333333333334, "step": 2220}, {"loss": 1.4139, "grad_norm": 0.800684928894043, "learning_rate": 0.0002, "epoch": 4.955555555555556, "step": 2230}, {"loss": 1.4343, "grad_norm": 0.7710039019584656, "learning_rate": 0.0002, "epoch": 4.977777777777778, "step": 2240}, {"loss": 1.3501, "grad_norm": 0.8617033958435059, "learning_rate": 0.0002, "epoch": 5.0, "step": 2250}, {"eval_loss": 1.9718151092529297, "eval_runtime": 38.8999, "eval_samples_per_second": 13.239, "eval_steps_per_second": 1.671, "epoch": 5.0, "step": 2250}, {"loss": 1.19, "grad_norm": 1.07399582862854, "learning_rate": 0.0002, "epoch": 5.022222222222222, "step": 2260}, {"loss": 1.2299, "grad_norm": 0.6598460674285889, "learning_rate": 0.0002, "epoch": 5.044444444444444, "step": 2270}, {"loss": 1.2333, "grad_norm": 1.1039506196975708, "learning_rate": 0.0002, "epoch": 5.066666666666666, "step": 2280}, {"loss": 1.2412, "grad_norm": 1.0624054670333862, "learning_rate": 0.0002, "epoch": 5.088888888888889, "step": 2290}, {"loss": 1.184, "grad_norm": 0.849583625793457, "learning_rate": 0.0002, "epoch": 5.111111111111111, "step": 2300}, {"loss": 1.1884, "grad_norm": 1.0143699645996094, "learning_rate": 0.0002, "epoch": 5.133333333333334, "step": 2310}, {"loss": 1.2133, "grad_norm": 0.8990702629089355, "learning_rate": 0.0002, "epoch": 5.155555555555556, "step": 2320}, {"loss": 1.2091, "grad_norm": 0.9822764992713928, "learning_rate": 0.0002, "epoch": 5.177777777777778, "step": 2330}, {"loss": 1.1775, "grad_norm": 0.9632459282875061, "learning_rate": 0.0002, "epoch": 5.2, "step": 2340}, {"loss": 1.1821, "grad_norm": 1.0897903442382812, "learning_rate": 0.0002, "epoch": 5.222222222222222, "step": 2350}, {"loss": 1.2976, "grad_norm": 1.155950665473938, "learning_rate": 0.0002, "epoch": 5.2444444444444445, "step": 2360}, {"loss": 1.1662, "grad_norm": 1.0566821098327637, "learning_rate": 0.0002, "epoch": 5.266666666666667, "step": 2370}, {"loss": 1.2809, "grad_norm": 1.191604733467102, "learning_rate": 0.0002, "epoch": 5.288888888888889, "step": 2380}, {"loss": 1.2431, "grad_norm": 0.852453887462616, "learning_rate": 0.0002, "epoch": 5.311111111111111, "step": 2390}, {"loss": 1.2106, "grad_norm": 0.9649669528007507, "learning_rate": 0.0002, "epoch": 5.333333333333333, "step": 2400}, {"loss": 1.2433, "grad_norm": 1.0731003284454346, "learning_rate": 0.0002, "epoch": 5.355555555555555, "step": 2410}, {"loss": 1.1737, "grad_norm": 0.9628495573997498, "learning_rate": 0.0002, "epoch": 5.377777777777778, "step": 2420}, {"loss": 1.3166, "grad_norm": 0.9268819093704224, "learning_rate": 0.0002, "epoch": 5.4, "step": 2430}, {"loss": 1.2114, "grad_norm": 1.1104000806808472, "learning_rate": 0.0002, "epoch": 5.4222222222222225, "step": 2440}, {"loss": 1.2151, "grad_norm": 1.0439373254776, "learning_rate": 0.0002, "epoch": 5.444444444444445, "step": 2450}, {"loss": 1.2458, "grad_norm": 1.0366657972335815, "learning_rate": 0.0002, "epoch": 5.466666666666667, "step": 2460}, {"loss": 1.2021, "grad_norm": 1.0604808330535889, "learning_rate": 0.0002, "epoch": 5.488888888888889, "step": 2470}, {"loss": 1.2188, "grad_norm": 0.8845253586769104, "learning_rate": 0.0002, "epoch": 5.511111111111111, "step": 2480}, {"loss": 1.2296, "grad_norm": 0.8200256824493408, "learning_rate": 0.0002, "epoch": 5.533333333333333, "step": 2490}, {"loss": 1.2632, "grad_norm": 0.9628723859786987, "learning_rate": 0.0002, "epoch": 5.555555555555555, "step": 2500}, {"loss": 1.2723, "grad_norm": 1.0758650302886963, "learning_rate": 0.0002, "epoch": 5.5777777777777775, "step": 2510}, {"loss": 1.2298, "grad_norm": 1.0113487243652344, "learning_rate": 0.0002, "epoch": 5.6, "step": 2520}, {"loss": 1.2226, "grad_norm": 1.260536551475525, "learning_rate": 0.0002, "epoch": 5.622222222222222, "step": 2530}, {"loss": 1.227, "grad_norm": 0.9229527115821838, "learning_rate": 0.0002, "epoch": 5.644444444444445, "step": 2540}, {"loss": 1.2223, "grad_norm": 0.9378697276115417, "learning_rate": 0.0002, "epoch": 5.666666666666667, "step": 2550}, {"loss": 1.2759, "grad_norm": 1.0404350757598877, "learning_rate": 0.0002, "epoch": 5.688888888888889, "step": 2560}, {"loss": 1.2132, "grad_norm": 1.1879961490631104, "learning_rate": 0.0002, "epoch": 5.711111111111111, "step": 2570}, {"loss": 1.2181, "grad_norm": 0.8881482481956482, "learning_rate": 0.0002, "epoch": 5.733333333333333, "step": 2580}, {"loss": 1.2419, "grad_norm": 1.1428065299987793, "learning_rate": 0.0002, "epoch": 5.7555555555555555, "step": 2590}, {"loss": 1.2682, "grad_norm": 0.8970609903335571, "learning_rate": 0.0002, "epoch": 5.777777777777778, "step": 2600}, {"loss": 1.2285, "grad_norm": 1.2084497213363647, "learning_rate": 0.0002, "epoch": 5.8, "step": 2610}, {"loss": 1.2004, "grad_norm": 1.04214608669281, "learning_rate": 0.0002, "epoch": 5.822222222222222, "step": 2620}, {"loss": 1.2388, "grad_norm": 1.0671849250793457, "learning_rate": 0.0002, "epoch": 5.844444444444444, "step": 2630}, {"loss": 1.1714, "grad_norm": 1.009602427482605, "learning_rate": 0.0002, "epoch": 5.866666666666667, "step": 2640}, {"loss": 1.2292, "grad_norm": 0.9787904024124146, "learning_rate": 0.0002, "epoch": 5.888888888888889, "step": 2650}, {"loss": 1.2404, "grad_norm": 1.0043761730194092, "learning_rate": 0.0002, "epoch": 5.911111111111111, "step": 2660}, {"loss": 1.2712, "grad_norm": 0.9855443239212036, "learning_rate": 0.0002, "epoch": 5.933333333333334, "step": 2670}, {"loss": 1.3112, "grad_norm": 1.1488507986068726, "learning_rate": 0.0002, "epoch": 5.955555555555556, "step": 2680}, {"loss": 1.2576, "grad_norm": 0.9939966797828674, "learning_rate": 0.0002, "epoch": 5.977777777777778, "step": 2690}, {"loss": 1.2847, "grad_norm": 1.0444952249526978, "learning_rate": 0.0002, "epoch": 6.0, "step": 2700}]} +{"epoch": 7.0, "step": 3150, "epoch_duration": 867.1775352954865, "total_accumulated_duration": 3786.853101491928, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5874, "grad_norm": 0.5109436511993408, "learning_rate": 0.0002, "epoch": 0.022222222222222223, "step": 10}, {"loss": 2.2952, "grad_norm": 0.4870035946369171, "learning_rate": 0.0002, "epoch": 0.044444444444444446, "step": 20}, {"loss": 2.0554, "grad_norm": 0.535464882850647, "learning_rate": 0.0002, "epoch": 0.06666666666666667, "step": 30}, {"loss": 2.0067, "grad_norm": 0.49077996611595154, "learning_rate": 0.0002, "epoch": 0.08888888888888889, "step": 40}, {"loss": 2.0673, "grad_norm": 0.4671357572078705, "learning_rate": 0.0002, "epoch": 0.1111111111111111, "step": 50}, {"loss": 1.8751, "grad_norm": 0.4970313608646393, "learning_rate": 0.0002, "epoch": 0.13333333333333333, "step": 60}, {"loss": 1.9419, "grad_norm": 0.4438260495662689, "learning_rate": 0.0002, "epoch": 0.15555555555555556, "step": 70}, {"loss": 1.9856, "grad_norm": 0.5089705586433411, "learning_rate": 0.0002, "epoch": 0.17777777777777778, "step": 80}, {"loss": 1.8805, "grad_norm": 0.4645078182220459, "learning_rate": 0.0002, "epoch": 0.2, "step": 90}, {"loss": 1.807, "grad_norm": 0.46095192432403564, "learning_rate": 0.0002, "epoch": 0.2222222222222222, "step": 100}, {"loss": 1.891, "grad_norm": 0.43338075280189514, "learning_rate": 0.0002, "epoch": 0.24444444444444444, "step": 110}, {"loss": 1.84, "grad_norm": 0.4433900713920593, "learning_rate": 0.0002, "epoch": 0.26666666666666666, "step": 120}, {"loss": 1.8895, "grad_norm": 0.7018499970436096, "learning_rate": 0.0002, "epoch": 0.28888888888888886, "step": 130}, {"loss": 1.8273, "grad_norm": 0.37056994438171387, "learning_rate": 0.0002, "epoch": 0.3111111111111111, "step": 140}, {"loss": 1.8059, "grad_norm": 0.40634623169898987, "learning_rate": 0.0002, "epoch": 0.3333333333333333, "step": 150}, {"loss": 1.8393, "grad_norm": 0.41917353868484497, "learning_rate": 0.0002, "epoch": 0.35555555555555557, "step": 160}, {"loss": 1.8658, "grad_norm": 0.42392489314079285, "learning_rate": 0.0002, "epoch": 0.37777777777777777, "step": 170}, {"loss": 1.8617, "grad_norm": 0.4281010627746582, "learning_rate": 0.0002, "epoch": 0.4, "step": 180}, {"loss": 1.8163, "grad_norm": 0.38542497158050537, "learning_rate": 0.0002, "epoch": 0.4222222222222222, "step": 190}, {"loss": 1.8951, "grad_norm": 0.36003032326698303, "learning_rate": 0.0002, "epoch": 0.4444444444444444, "step": 200}, {"loss": 1.8611, "grad_norm": 0.37858229875564575, "learning_rate": 0.0002, "epoch": 0.4666666666666667, "step": 210}, {"loss": 1.8078, "grad_norm": 0.49986031651496887, "learning_rate": 0.0002, "epoch": 0.4888888888888889, "step": 220}, {"loss": 1.7413, "grad_norm": 0.3937094807624817, "learning_rate": 0.0002, "epoch": 0.5111111111111111, "step": 230}, {"loss": 1.8386, "grad_norm": 0.4566134512424469, "learning_rate": 0.0002, "epoch": 0.5333333333333333, "step": 240}, {"loss": 1.8303, "grad_norm": 0.3602476418018341, "learning_rate": 0.0002, "epoch": 0.5555555555555556, "step": 250}, {"loss": 1.7963, "grad_norm": 0.36321184039115906, "learning_rate": 0.0002, "epoch": 0.5777777777777777, "step": 260}, {"loss": 1.9055, "grad_norm": 0.3808199167251587, "learning_rate": 0.0002, "epoch": 0.6, "step": 270}, {"loss": 1.7916, "grad_norm": 0.38910621404647827, "learning_rate": 0.0002, "epoch": 0.6222222222222222, "step": 280}, {"loss": 1.7958, "grad_norm": 0.31913551688194275, "learning_rate": 0.0002, "epoch": 0.6444444444444445, "step": 290}, {"loss": 1.7468, "grad_norm": 0.34734025597572327, "learning_rate": 0.0002, "epoch": 0.6666666666666666, "step": 300}, {"loss": 1.8092, "grad_norm": 0.3517725467681885, "learning_rate": 0.0002, "epoch": 0.6888888888888889, "step": 310}, {"loss": 1.7847, "grad_norm": 0.3804526627063751, "learning_rate": 0.0002, "epoch": 0.7111111111111111, "step": 320}, {"loss": 1.8131, "grad_norm": 0.5592505931854248, "learning_rate": 0.0002, "epoch": 0.7333333333333333, "step": 330}, {"loss": 1.8827, "grad_norm": 0.36154472827911377, "learning_rate": 0.0002, "epoch": 0.7555555555555555, "step": 340}, {"loss": 1.7764, "grad_norm": 0.43970227241516113, "learning_rate": 0.0002, "epoch": 0.7777777777777778, "step": 350}, {"loss": 1.8333, "grad_norm": 0.3525223731994629, "learning_rate": 0.0002, "epoch": 0.8, "step": 360}, {"loss": 1.7802, "grad_norm": 0.3706997036933899, "learning_rate": 0.0002, "epoch": 0.8222222222222222, "step": 370}, {"loss": 1.7937, "grad_norm": 0.34138166904449463, "learning_rate": 0.0002, "epoch": 0.8444444444444444, "step": 380}, {"loss": 1.8251, "grad_norm": 0.4090622365474701, "learning_rate": 0.0002, "epoch": 0.8666666666666667, "step": 390}, {"loss": 1.7685, "grad_norm": 0.3729974031448364, "learning_rate": 0.0002, "epoch": 0.8888888888888888, "step": 400}, {"loss": 1.836, "grad_norm": 0.3742152452468872, "learning_rate": 0.0002, "epoch": 0.9111111111111111, "step": 410}, {"loss": 1.7998, "grad_norm": 0.37685129046440125, "learning_rate": 0.0002, "epoch": 0.9333333333333333, "step": 420}, {"loss": 1.7601, "grad_norm": 0.34339913725852966, "learning_rate": 0.0002, "epoch": 0.9555555555555556, "step": 430}, {"loss": 1.7651, "grad_norm": 0.31139856576919556, "learning_rate": 0.0002, "epoch": 0.9777777777777777, "step": 440}, {"loss": 1.8471, "grad_norm": 0.3577502965927124, "learning_rate": 0.0002, "epoch": 1.0, "step": 450}, {"eval_loss": 1.8310279846191406, "eval_runtime": 38.8374, "eval_samples_per_second": 13.26, "eval_steps_per_second": 1.674, "epoch": 1.0, "step": 450}, {"loss": 1.7741, "grad_norm": 0.3257788419723511, "learning_rate": 0.0002, "epoch": 1.0222222222222221, "step": 460}, {"loss": 1.8031, "grad_norm": 0.3878970146179199, "learning_rate": 0.0002, "epoch": 1.0444444444444445, "step": 470}, {"loss": 1.701, "grad_norm": 0.364427387714386, "learning_rate": 0.0002, "epoch": 1.0666666666666667, "step": 480}, {"loss": 1.7832, "grad_norm": 0.3374682664871216, "learning_rate": 0.0002, "epoch": 1.0888888888888888, "step": 490}, {"loss": 1.7308, "grad_norm": 0.35822123289108276, "learning_rate": 0.0002, "epoch": 1.1111111111111112, "step": 500}, {"loss": 1.7753, "grad_norm": 0.3748345673084259, "learning_rate": 0.0002, "epoch": 1.1333333333333333, "step": 510}, {"loss": 1.7225, "grad_norm": 0.3422437012195587, "learning_rate": 0.0002, "epoch": 1.1555555555555554, "step": 520}, {"loss": 1.7649, "grad_norm": 0.4289326071739197, "learning_rate": 0.0002, "epoch": 1.1777777777777778, "step": 530}, {"loss": 1.7548, "grad_norm": 0.3706769645214081, "learning_rate": 0.0002, "epoch": 1.2, "step": 540}, {"loss": 1.6988, "grad_norm": 0.4024733603000641, "learning_rate": 0.0002, "epoch": 1.2222222222222223, "step": 550}, {"loss": 1.7641, "grad_norm": 0.3960128128528595, "learning_rate": 0.0002, "epoch": 1.2444444444444445, "step": 560}, {"loss": 1.755, "grad_norm": 0.38222864270210266, "learning_rate": 0.0002, "epoch": 1.2666666666666666, "step": 570}, {"loss": 1.735, "grad_norm": 0.4073713421821594, "learning_rate": 0.0002, "epoch": 1.2888888888888888, "step": 580}, {"loss": 1.7183, "grad_norm": 0.3875499963760376, "learning_rate": 0.0002, "epoch": 1.3111111111111111, "step": 590}, {"loss": 1.7492, "grad_norm": 0.39740806818008423, "learning_rate": 0.0002, "epoch": 1.3333333333333333, "step": 600}, {"loss": 1.8478, "grad_norm": 0.38432490825653076, "learning_rate": 0.0002, "epoch": 1.3555555555555556, "step": 610}, {"loss": 1.7327, "grad_norm": 0.402729868888855, "learning_rate": 0.0002, "epoch": 1.3777777777777778, "step": 620}, {"loss": 1.6634, "grad_norm": 0.36683231592178345, "learning_rate": 0.0002, "epoch": 1.4, "step": 630}, {"loss": 1.8059, "grad_norm": 0.3883286714553833, "learning_rate": 0.0002, "epoch": 1.4222222222222223, "step": 640}, {"loss": 1.7953, "grad_norm": 0.4087409973144531, "learning_rate": 0.0002, "epoch": 1.4444444444444444, "step": 650}, {"loss": 1.7491, "grad_norm": 0.4042017459869385, "learning_rate": 0.0002, "epoch": 1.4666666666666668, "step": 660}, {"loss": 1.7466, "grad_norm": 0.40149256587028503, "learning_rate": 0.0002, "epoch": 1.488888888888889, "step": 670}, {"loss": 1.7398, "grad_norm": 0.45146510004997253, "learning_rate": 0.0002, "epoch": 1.511111111111111, "step": 680}, {"loss": 1.6923, "grad_norm": 0.4098089039325714, "learning_rate": 0.0002, "epoch": 1.5333333333333332, "step": 690}, {"loss": 1.6847, "grad_norm": 0.4181336760520935, "learning_rate": 0.0002, "epoch": 1.5555555555555556, "step": 700}, {"loss": 1.7862, "grad_norm": 1.3722974061965942, "learning_rate": 0.0002, "epoch": 1.5777777777777777, "step": 710}, {"loss": 1.7424, "grad_norm": 0.3965230882167816, "learning_rate": 0.0002, "epoch": 1.6, "step": 720}, {"loss": 1.7782, "grad_norm": 0.3842000663280487, "learning_rate": 0.0002, "epoch": 1.6222222222222222, "step": 730}, {"loss": 1.7682, "grad_norm": 0.3603688180446625, "learning_rate": 0.0002, "epoch": 1.6444444444444444, "step": 740}, {"loss": 1.7476, "grad_norm": 0.39973509311676025, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 750}, {"loss": 1.7329, "grad_norm": 0.3687385618686676, "learning_rate": 0.0002, "epoch": 1.6888888888888889, "step": 760}, {"loss": 1.7987, "grad_norm": 0.4267722964286804, "learning_rate": 0.0002, "epoch": 1.7111111111111112, "step": 770}, {"loss": 1.8041, "grad_norm": 0.41301295161247253, "learning_rate": 0.0002, "epoch": 1.7333333333333334, "step": 780}, {"loss": 1.7284, "grad_norm": 0.3945430517196655, "learning_rate": 0.0002, "epoch": 1.7555555555555555, "step": 790}, {"loss": 1.6691, "grad_norm": 0.4037930965423584, "learning_rate": 0.0002, "epoch": 1.7777777777777777, "step": 800}, {"loss": 1.6874, "grad_norm": 0.406893253326416, "learning_rate": 0.0002, "epoch": 1.8, "step": 810}, {"loss": 1.7628, "grad_norm": 0.4600457549095154, "learning_rate": 0.0002, "epoch": 1.8222222222222222, "step": 820}, {"loss": 1.8222, "grad_norm": 0.4195384085178375, "learning_rate": 0.0002, "epoch": 1.8444444444444446, "step": 830}, {"loss": 1.7123, "grad_norm": 0.3854130506515503, "learning_rate": 0.0002, "epoch": 1.8666666666666667, "step": 840}, {"loss": 1.7227, "grad_norm": 0.38279038667678833, "learning_rate": 0.0002, "epoch": 1.8888888888888888, "step": 850}, {"loss": 1.6942, "grad_norm": 0.38249439001083374, "learning_rate": 0.0002, "epoch": 1.911111111111111, "step": 860}, {"loss": 1.8058, "grad_norm": 0.42977792024612427, "learning_rate": 0.0002, "epoch": 1.9333333333333333, "step": 870}, {"loss": 1.713, "grad_norm": 0.4109351933002472, "learning_rate": 0.0002, "epoch": 1.9555555555555557, "step": 880}, {"loss": 1.7035, "grad_norm": 0.3734486699104309, "learning_rate": 0.0002, "epoch": 1.9777777777777779, "step": 890}, {"loss": 1.7502, "grad_norm": 0.3603087067604065, "learning_rate": 0.0002, "epoch": 2.0, "step": 900}, {"eval_loss": 1.8250652551651, "eval_runtime": 38.8657, "eval_samples_per_second": 13.251, "eval_steps_per_second": 1.672, "epoch": 2.0, "step": 900}, {"loss": 1.6063, "grad_norm": 0.4014144241809845, "learning_rate": 0.0002, "epoch": 2.022222222222222, "step": 910}, {"loss": 1.6604, "grad_norm": 0.4338063597679138, "learning_rate": 0.0002, "epoch": 2.0444444444444443, "step": 920}, {"loss": 1.6354, "grad_norm": 0.3693605065345764, "learning_rate": 0.0002, "epoch": 2.066666666666667, "step": 930}, {"loss": 1.6186, "grad_norm": 0.4040255844593048, "learning_rate": 0.0002, "epoch": 2.088888888888889, "step": 940}, {"loss": 1.5976, "grad_norm": 0.43481820821762085, "learning_rate": 0.0002, "epoch": 2.111111111111111, "step": 950}, {"loss": 1.6081, "grad_norm": 0.41632869839668274, "learning_rate": 0.0002, "epoch": 2.1333333333333333, "step": 960}, {"loss": 1.6586, "grad_norm": 0.4633755385875702, "learning_rate": 0.0002, "epoch": 2.1555555555555554, "step": 970}, {"loss": 1.7396, "grad_norm": 0.43926581740379333, "learning_rate": 0.0002, "epoch": 2.1777777777777776, "step": 980}, {"loss": 1.5909, "grad_norm": 0.4757233262062073, "learning_rate": 0.0002, "epoch": 2.2, "step": 990}, {"loss": 1.648, "grad_norm": 0.5010586977005005, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 1000}, {"loss": 1.6454, "grad_norm": 0.44900986552238464, "learning_rate": 0.0002, "epoch": 2.2444444444444445, "step": 1010}, {"loss": 1.6493, "grad_norm": 0.41274750232696533, "learning_rate": 0.0002, "epoch": 2.2666666666666666, "step": 1020}, {"loss": 1.6592, "grad_norm": 0.44672393798828125, "learning_rate": 0.0002, "epoch": 2.2888888888888888, "step": 1030}, {"loss": 1.6349, "grad_norm": 0.4826269745826721, "learning_rate": 0.0002, "epoch": 2.311111111111111, "step": 1040}, {"loss": 1.5988, "grad_norm": 0.4650685489177704, "learning_rate": 0.0002, "epoch": 2.3333333333333335, "step": 1050}, {"loss": 1.5859, "grad_norm": 0.42507848143577576, "learning_rate": 0.0002, "epoch": 2.3555555555555556, "step": 1060}, {"loss": 1.5932, "grad_norm": 0.45653030276298523, "learning_rate": 0.0002, "epoch": 2.3777777777777778, "step": 1070}, {"loss": 1.6469, "grad_norm": 0.44534122943878174, "learning_rate": 0.0002, "epoch": 2.4, "step": 1080}, {"loss": 1.6115, "grad_norm": 0.4241289794445038, "learning_rate": 0.0002, "epoch": 2.422222222222222, "step": 1090}, {"loss": 1.6935, "grad_norm": 0.5004808306694031, "learning_rate": 0.0002, "epoch": 2.4444444444444446, "step": 1100}, {"loss": 1.6833, "grad_norm": 0.41425490379333496, "learning_rate": 0.0002, "epoch": 2.466666666666667, "step": 1110}, {"loss": 1.6151, "grad_norm": 0.44362279772758484, "learning_rate": 0.0002, "epoch": 2.488888888888889, "step": 1120}, {"loss": 1.6394, "grad_norm": 0.5530985593795776, "learning_rate": 0.0002, "epoch": 2.511111111111111, "step": 1130}, {"loss": 1.64, "grad_norm": 0.4290637969970703, "learning_rate": 0.0002, "epoch": 2.533333333333333, "step": 1140}, {"loss": 1.76, "grad_norm": 0.4957487881183624, "learning_rate": 0.0002, "epoch": 2.5555555555555554, "step": 1150}, {"loss": 1.613, "grad_norm": 0.5082747340202332, "learning_rate": 0.0002, "epoch": 2.5777777777777775, "step": 1160}, {"loss": 1.6702, "grad_norm": 0.478722482919693, "learning_rate": 0.0002, "epoch": 2.6, "step": 1170}, {"loss": 1.6198, "grad_norm": 0.436454176902771, "learning_rate": 0.0002, "epoch": 2.6222222222222222, "step": 1180}, {"loss": 1.663, "grad_norm": 0.4905032515525818, "learning_rate": 0.0002, "epoch": 2.6444444444444444, "step": 1190}, {"loss": 1.6216, "grad_norm": 0.4815700054168701, "learning_rate": 0.0002, "epoch": 2.6666666666666665, "step": 1200}, {"loss": 1.6994, "grad_norm": 0.3965534269809723, "learning_rate": 0.0002, "epoch": 2.688888888888889, "step": 1210}, {"loss": 1.744, "grad_norm": 0.43282169103622437, "learning_rate": 0.0002, "epoch": 2.7111111111111112, "step": 1220}, {"loss": 1.6455, "grad_norm": 0.45512479543685913, "learning_rate": 0.0002, "epoch": 2.7333333333333334, "step": 1230}, {"loss": 1.6444, "grad_norm": 0.44370076060295105, "learning_rate": 0.0002, "epoch": 2.7555555555555555, "step": 1240}, {"loss": 1.7106, "grad_norm": 0.4750686287879944, "learning_rate": 0.0002, "epoch": 2.7777777777777777, "step": 1250}, {"loss": 1.7122, "grad_norm": 0.41953766345977783, "learning_rate": 0.0002, "epoch": 2.8, "step": 1260}, {"loss": 1.6203, "grad_norm": 0.4887140095233917, "learning_rate": 0.0002, "epoch": 2.822222222222222, "step": 1270}, {"loss": 1.6691, "grad_norm": 0.46718958020210266, "learning_rate": 0.0002, "epoch": 2.8444444444444446, "step": 1280}, {"loss": 1.6257, "grad_norm": 0.48510900139808655, "learning_rate": 0.0002, "epoch": 2.8666666666666667, "step": 1290}, {"loss": 1.6825, "grad_norm": 0.4504084289073944, "learning_rate": 0.0002, "epoch": 2.888888888888889, "step": 1300}, {"loss": 1.7453, "grad_norm": 0.42119622230529785, "learning_rate": 0.0002, "epoch": 2.911111111111111, "step": 1310}, {"loss": 1.6662, "grad_norm": 0.4763694107532501, "learning_rate": 0.0002, "epoch": 2.9333333333333336, "step": 1320}, {"loss": 1.6151, "grad_norm": 0.422810822725296, "learning_rate": 0.0002, "epoch": 2.9555555555555557, "step": 1330}, {"loss": 1.6578, "grad_norm": 0.4768871068954468, "learning_rate": 0.0002, "epoch": 2.977777777777778, "step": 1340}, {"loss": 1.6167, "grad_norm": 0.48259881138801575, "learning_rate": 0.0002, "epoch": 3.0, "step": 1350}, {"eval_loss": 1.8452560901641846, "eval_runtime": 38.8621, "eval_samples_per_second": 13.252, "eval_steps_per_second": 1.673, "epoch": 3.0, "step": 1350}, {"loss": 1.5351, "grad_norm": 0.6933313012123108, "learning_rate": 0.0002, "epoch": 3.022222222222222, "step": 1360}, {"loss": 1.5542, "grad_norm": 0.5870710611343384, "learning_rate": 0.0002, "epoch": 3.0444444444444443, "step": 1370}, {"loss": 1.511, "grad_norm": 0.602210283279419, "learning_rate": 0.0002, "epoch": 3.066666666666667, "step": 1380}, {"loss": 1.5272, "grad_norm": 0.6461787819862366, "learning_rate": 0.0002, "epoch": 3.088888888888889, "step": 1390}, {"loss": 1.4813, "grad_norm": 0.5839587450027466, "learning_rate": 0.0002, "epoch": 3.111111111111111, "step": 1400}, {"loss": 1.505, "grad_norm": 0.5757876038551331, "learning_rate": 0.0002, "epoch": 3.1333333333333333, "step": 1410}, {"loss": 1.4963, "grad_norm": 0.5862616300582886, "learning_rate": 0.0002, "epoch": 3.1555555555555554, "step": 1420}, {"loss": 1.5144, "grad_norm": 0.6103630065917969, "learning_rate": 0.0002, "epoch": 3.1777777777777776, "step": 1430}, {"loss": 1.5406, "grad_norm": 0.9309254884719849, "learning_rate": 0.0002, "epoch": 3.2, "step": 1440}, {"loss": 1.487, "grad_norm": 0.5360018014907837, "learning_rate": 0.0002, "epoch": 3.2222222222222223, "step": 1450}, {"loss": 1.5659, "grad_norm": 0.5448758602142334, "learning_rate": 0.0002, "epoch": 3.2444444444444445, "step": 1460}, {"loss": 1.5595, "grad_norm": 0.5973812341690063, "learning_rate": 0.0002, "epoch": 3.2666666666666666, "step": 1470}, {"loss": 1.5223, "grad_norm": 0.6245622038841248, "learning_rate": 0.0002, "epoch": 3.2888888888888888, "step": 1480}, {"loss": 1.4795, "grad_norm": 0.6533768773078918, "learning_rate": 0.0002, "epoch": 3.311111111111111, "step": 1490}, {"loss": 1.5562, "grad_norm": 0.5765811204910278, "learning_rate": 0.0002, "epoch": 3.3333333333333335, "step": 1500}, {"loss": 1.5405, "grad_norm": 0.591395378112793, "learning_rate": 0.0002, "epoch": 3.3555555555555556, "step": 1510}, {"loss": 1.5658, "grad_norm": 0.5842425227165222, "learning_rate": 0.0002, "epoch": 3.3777777777777778, "step": 1520}, {"loss": 1.5065, "grad_norm": 0.5731365084648132, "learning_rate": 0.0002, "epoch": 3.4, "step": 1530}, {"loss": 1.5438, "grad_norm": 0.5841306447982788, "learning_rate": 0.0002, "epoch": 3.422222222222222, "step": 1540}, {"loss": 1.4922, "grad_norm": 0.6503536701202393, "learning_rate": 0.0002, "epoch": 3.4444444444444446, "step": 1550}, {"loss": 1.5493, "grad_norm": 0.6170967221260071, "learning_rate": 0.0002, "epoch": 3.466666666666667, "step": 1560}, {"loss": 1.5098, "grad_norm": 0.5576487183570862, "learning_rate": 0.0002, "epoch": 3.488888888888889, "step": 1570}, {"loss": 1.472, "grad_norm": 0.7082911133766174, "learning_rate": 0.0002, "epoch": 3.511111111111111, "step": 1580}, {"loss": 1.5594, "grad_norm": 0.6159376502037048, "learning_rate": 0.0002, "epoch": 3.533333333333333, "step": 1590}, {"loss": 1.563, "grad_norm": 0.5972959399223328, "learning_rate": 0.0002, "epoch": 3.5555555555555554, "step": 1600}, {"loss": 1.4876, "grad_norm": 0.5787310004234314, "learning_rate": 0.0002, "epoch": 3.5777777777777775, "step": 1610}, {"loss": 1.4887, "grad_norm": 0.5846341252326965, "learning_rate": 0.0002, "epoch": 3.6, "step": 1620}, {"loss": 1.542, "grad_norm": 0.5906197428703308, "learning_rate": 0.0002, "epoch": 3.6222222222222222, "step": 1630}, {"loss": 1.4941, "grad_norm": 0.6305760145187378, "learning_rate": 0.0002, "epoch": 3.6444444444444444, "step": 1640}, {"loss": 1.4677, "grad_norm": 0.7448979616165161, "learning_rate": 0.0002, "epoch": 3.6666666666666665, "step": 1650}, {"loss": 1.5961, "grad_norm": 0.5906165242195129, "learning_rate": 0.0002, "epoch": 3.688888888888889, "step": 1660}, {"loss": 1.4882, "grad_norm": 0.605032742023468, "learning_rate": 0.0002, "epoch": 3.7111111111111112, "step": 1670}, {"loss": 1.5804, "grad_norm": 0.6117229461669922, "learning_rate": 0.0002, "epoch": 3.7333333333333334, "step": 1680}, {"loss": 1.5131, "grad_norm": 0.613581120967865, "learning_rate": 0.0002, "epoch": 3.7555555555555555, "step": 1690}, {"loss": 1.5074, "grad_norm": 0.6244436502456665, "learning_rate": 0.0002, "epoch": 3.7777777777777777, "step": 1700}, {"loss": 1.5738, "grad_norm": 0.6236702799797058, "learning_rate": 0.0002, "epoch": 3.8, "step": 1710}, {"loss": 1.6542, "grad_norm": 0.639141857624054, "learning_rate": 0.0002, "epoch": 3.822222222222222, "step": 1720}, {"loss": 1.536, "grad_norm": 0.5782344937324524, "learning_rate": 0.0002, "epoch": 3.8444444444444446, "step": 1730}, {"loss": 1.5355, "grad_norm": 0.5952938795089722, "learning_rate": 0.0002, "epoch": 3.8666666666666667, "step": 1740}, {"loss": 1.5205, "grad_norm": 0.5573042035102844, "learning_rate": 0.0002, "epoch": 3.888888888888889, "step": 1750}, {"loss": 1.5066, "grad_norm": 0.6114351749420166, "learning_rate": 0.0002, "epoch": 3.911111111111111, "step": 1760}, {"loss": 1.5706, "grad_norm": 0.5973817110061646, "learning_rate": 0.0002, "epoch": 3.9333333333333336, "step": 1770}, {"loss": 1.5003, "grad_norm": 0.602317750453949, "learning_rate": 0.0002, "epoch": 3.9555555555555557, "step": 1780}, {"loss": 1.5022, "grad_norm": 0.5965437293052673, "learning_rate": 0.0002, "epoch": 3.977777777777778, "step": 1790}, {"loss": 1.5031, "grad_norm": 0.5641552209854126, "learning_rate": 0.0002, "epoch": 4.0, "step": 1800}, {"eval_loss": 1.892098069190979, "eval_runtime": 38.8755, "eval_samples_per_second": 13.247, "eval_steps_per_second": 1.672, "epoch": 4.0, "step": 1800}, {"loss": 1.3894, "grad_norm": 0.8302594423294067, "learning_rate": 0.0002, "epoch": 4.022222222222222, "step": 1810}, {"loss": 1.3727, "grad_norm": 0.6695230603218079, "learning_rate": 0.0002, "epoch": 4.044444444444444, "step": 1820}, {"loss": 1.3064, "grad_norm": 0.7911471128463745, "learning_rate": 0.0002, "epoch": 4.066666666666666, "step": 1830}, {"loss": 1.4574, "grad_norm": 0.7044888138771057, "learning_rate": 0.0002, "epoch": 4.088888888888889, "step": 1840}, {"loss": 1.3941, "grad_norm": 0.7057249546051025, "learning_rate": 0.0002, "epoch": 4.111111111111111, "step": 1850}, {"loss": 1.4052, "grad_norm": 0.8762815594673157, "learning_rate": 0.0002, "epoch": 4.133333333333334, "step": 1860}, {"loss": 1.3784, "grad_norm": 0.7619158029556274, "learning_rate": 0.0002, "epoch": 4.155555555555556, "step": 1870}, {"loss": 1.3581, "grad_norm": 0.7711658477783203, "learning_rate": 0.0002, "epoch": 4.177777777777778, "step": 1880}, {"loss": 1.3995, "grad_norm": 0.9732598662376404, "learning_rate": 0.0002, "epoch": 4.2, "step": 1890}, {"loss": 1.3353, "grad_norm": 0.9070265889167786, "learning_rate": 0.0002, "epoch": 4.222222222222222, "step": 1900}, {"loss": 1.3947, "grad_norm": 0.8274767994880676, "learning_rate": 0.0002, "epoch": 4.2444444444444445, "step": 1910}, {"loss": 1.3392, "grad_norm": 0.8514227271080017, "learning_rate": 0.0002, "epoch": 4.266666666666667, "step": 1920}, {"loss": 1.3492, "grad_norm": 0.7356534600257874, "learning_rate": 0.0002, "epoch": 4.288888888888889, "step": 1930}, {"loss": 1.3708, "grad_norm": 0.8226608037948608, "learning_rate": 0.0002, "epoch": 4.311111111111111, "step": 1940}, {"loss": 1.3652, "grad_norm": 0.8347907066345215, "learning_rate": 0.0002, "epoch": 4.333333333333333, "step": 1950}, {"loss": 1.3415, "grad_norm": 0.8509323000907898, "learning_rate": 0.0002, "epoch": 4.355555555555555, "step": 1960}, {"loss": 1.3796, "grad_norm": 0.8776063323020935, "learning_rate": 0.0002, "epoch": 4.377777777777778, "step": 1970}, {"loss": 1.438, "grad_norm": 0.8022271990776062, "learning_rate": 0.0002, "epoch": 4.4, "step": 1980}, {"loss": 1.3671, "grad_norm": 0.7984752058982849, "learning_rate": 0.0002, "epoch": 4.4222222222222225, "step": 1990}, {"loss": 1.4214, "grad_norm": 0.7349720001220703, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 2000}, {"loss": 1.4174, "grad_norm": 0.7778817415237427, "learning_rate": 0.0002, "epoch": 4.466666666666667, "step": 2010}, {"loss": 1.3365, "grad_norm": 0.9361467361450195, "learning_rate": 0.0002, "epoch": 4.488888888888889, "step": 2020}, {"loss": 1.4129, "grad_norm": 0.7839348912239075, "learning_rate": 0.0002, "epoch": 4.511111111111111, "step": 2030}, {"loss": 1.3761, "grad_norm": 0.8361981511116028, "learning_rate": 0.0002, "epoch": 4.533333333333333, "step": 2040}, {"loss": 1.4085, "grad_norm": 1.9877147674560547, "learning_rate": 0.0002, "epoch": 4.555555555555555, "step": 2050}, {"loss": 1.329, "grad_norm": 0.7506140470504761, "learning_rate": 0.0002, "epoch": 4.5777777777777775, "step": 2060}, {"loss": 1.3557, "grad_norm": 0.9493570327758789, "learning_rate": 0.0002, "epoch": 4.6, "step": 2070}, {"loss": 1.438, "grad_norm": 0.7198925018310547, "learning_rate": 0.0002, "epoch": 4.622222222222222, "step": 2080}, {"loss": 1.3892, "grad_norm": 0.7521472573280334, "learning_rate": 0.0002, "epoch": 4.644444444444445, "step": 2090}, {"loss": 1.3833, "grad_norm": 0.766718327999115, "learning_rate": 0.0002, "epoch": 4.666666666666667, "step": 2100}, {"loss": 1.3541, "grad_norm": 0.9162390232086182, "learning_rate": 0.0002, "epoch": 4.688888888888889, "step": 2110}, {"loss": 1.4603, "grad_norm": 0.8980328440666199, "learning_rate": 0.0002, "epoch": 4.711111111111111, "step": 2120}, {"loss": 1.4043, "grad_norm": 0.8109711408615112, "learning_rate": 0.0002, "epoch": 4.733333333333333, "step": 2130}, {"loss": 1.373, "grad_norm": 0.7372606992721558, "learning_rate": 0.0002, "epoch": 4.7555555555555555, "step": 2140}, {"loss": 1.4439, "grad_norm": 0.7527457475662231, "learning_rate": 0.0002, "epoch": 4.777777777777778, "step": 2150}, {"loss": 1.2999, "grad_norm": 1.0380001068115234, "learning_rate": 0.0002, "epoch": 4.8, "step": 2160}, {"loss": 1.3562, "grad_norm": 0.7166368365287781, "learning_rate": 0.0002, "epoch": 4.822222222222222, "step": 2170}, {"loss": 1.3917, "grad_norm": 0.784548282623291, "learning_rate": 0.0002, "epoch": 4.844444444444444, "step": 2180}, {"loss": 1.3376, "grad_norm": 0.7771317958831787, "learning_rate": 0.0002, "epoch": 4.866666666666667, "step": 2190}, {"loss": 1.3315, "grad_norm": 0.7710300087928772, "learning_rate": 0.0002, "epoch": 4.888888888888889, "step": 2200}, {"loss": 1.3676, "grad_norm": 0.7715084552764893, "learning_rate": 0.0002, "epoch": 4.911111111111111, "step": 2210}, {"loss": 1.5352, "grad_norm": 0.7888006567955017, "learning_rate": 0.0002, "epoch": 4.933333333333334, "step": 2220}, {"loss": 1.4139, "grad_norm": 0.800684928894043, "learning_rate": 0.0002, "epoch": 4.955555555555556, "step": 2230}, {"loss": 1.4343, "grad_norm": 0.7710039019584656, "learning_rate": 0.0002, "epoch": 4.977777777777778, "step": 2240}, {"loss": 1.3501, "grad_norm": 0.8617033958435059, "learning_rate": 0.0002, "epoch": 5.0, "step": 2250}, {"eval_loss": 1.9718151092529297, "eval_runtime": 38.8999, "eval_samples_per_second": 13.239, "eval_steps_per_second": 1.671, "epoch": 5.0, "step": 2250}, {"loss": 1.19, "grad_norm": 1.07399582862854, "learning_rate": 0.0002, "epoch": 5.022222222222222, "step": 2260}, {"loss": 1.2299, "grad_norm": 0.6598460674285889, "learning_rate": 0.0002, "epoch": 5.044444444444444, "step": 2270}, {"loss": 1.2333, "grad_norm": 1.1039506196975708, "learning_rate": 0.0002, "epoch": 5.066666666666666, "step": 2280}, {"loss": 1.2412, "grad_norm": 1.0624054670333862, "learning_rate": 0.0002, "epoch": 5.088888888888889, "step": 2290}, {"loss": 1.184, "grad_norm": 0.849583625793457, "learning_rate": 0.0002, "epoch": 5.111111111111111, "step": 2300}, {"loss": 1.1884, "grad_norm": 1.0143699645996094, "learning_rate": 0.0002, "epoch": 5.133333333333334, "step": 2310}, {"loss": 1.2133, "grad_norm": 0.8990702629089355, "learning_rate": 0.0002, "epoch": 5.155555555555556, "step": 2320}, {"loss": 1.2091, "grad_norm": 0.9822764992713928, "learning_rate": 0.0002, "epoch": 5.177777777777778, "step": 2330}, {"loss": 1.1775, "grad_norm": 0.9632459282875061, "learning_rate": 0.0002, "epoch": 5.2, "step": 2340}, {"loss": 1.1821, "grad_norm": 1.0897903442382812, "learning_rate": 0.0002, "epoch": 5.222222222222222, "step": 2350}, {"loss": 1.2976, "grad_norm": 1.155950665473938, "learning_rate": 0.0002, "epoch": 5.2444444444444445, "step": 2360}, {"loss": 1.1662, "grad_norm": 1.0566821098327637, "learning_rate": 0.0002, "epoch": 5.266666666666667, "step": 2370}, {"loss": 1.2809, "grad_norm": 1.191604733467102, "learning_rate": 0.0002, "epoch": 5.288888888888889, "step": 2380}, {"loss": 1.2431, "grad_norm": 0.852453887462616, "learning_rate": 0.0002, "epoch": 5.311111111111111, "step": 2390}, {"loss": 1.2106, "grad_norm": 0.9649669528007507, "learning_rate": 0.0002, "epoch": 5.333333333333333, "step": 2400}, {"loss": 1.2433, "grad_norm": 1.0731003284454346, "learning_rate": 0.0002, "epoch": 5.355555555555555, "step": 2410}, {"loss": 1.1737, "grad_norm": 0.9628495573997498, "learning_rate": 0.0002, "epoch": 5.377777777777778, "step": 2420}, {"loss": 1.3166, "grad_norm": 0.9268819093704224, "learning_rate": 0.0002, "epoch": 5.4, "step": 2430}, {"loss": 1.2114, "grad_norm": 1.1104000806808472, "learning_rate": 0.0002, "epoch": 5.4222222222222225, "step": 2440}, {"loss": 1.2151, "grad_norm": 1.0439373254776, "learning_rate": 0.0002, "epoch": 5.444444444444445, "step": 2450}, {"loss": 1.2458, "grad_norm": 1.0366657972335815, "learning_rate": 0.0002, "epoch": 5.466666666666667, "step": 2460}, {"loss": 1.2021, "grad_norm": 1.0604808330535889, "learning_rate": 0.0002, "epoch": 5.488888888888889, "step": 2470}, {"loss": 1.2188, "grad_norm": 0.8845253586769104, "learning_rate": 0.0002, "epoch": 5.511111111111111, "step": 2480}, {"loss": 1.2296, "grad_norm": 0.8200256824493408, "learning_rate": 0.0002, "epoch": 5.533333333333333, "step": 2490}, {"loss": 1.2632, "grad_norm": 0.9628723859786987, "learning_rate": 0.0002, "epoch": 5.555555555555555, "step": 2500}, {"loss": 1.2723, "grad_norm": 1.0758650302886963, "learning_rate": 0.0002, "epoch": 5.5777777777777775, "step": 2510}, {"loss": 1.2298, "grad_norm": 1.0113487243652344, "learning_rate": 0.0002, "epoch": 5.6, "step": 2520}, {"loss": 1.2226, "grad_norm": 1.260536551475525, "learning_rate": 0.0002, "epoch": 5.622222222222222, "step": 2530}, {"loss": 1.227, "grad_norm": 0.9229527115821838, "learning_rate": 0.0002, "epoch": 5.644444444444445, "step": 2540}, {"loss": 1.2223, "grad_norm": 0.9378697276115417, "learning_rate": 0.0002, "epoch": 5.666666666666667, "step": 2550}, {"loss": 1.2759, "grad_norm": 1.0404350757598877, "learning_rate": 0.0002, "epoch": 5.688888888888889, "step": 2560}, {"loss": 1.2132, "grad_norm": 1.1879961490631104, "learning_rate": 0.0002, "epoch": 5.711111111111111, "step": 2570}, {"loss": 1.2181, "grad_norm": 0.8881482481956482, "learning_rate": 0.0002, "epoch": 5.733333333333333, "step": 2580}, {"loss": 1.2419, "grad_norm": 1.1428065299987793, "learning_rate": 0.0002, "epoch": 5.7555555555555555, "step": 2590}, {"loss": 1.2682, "grad_norm": 0.8970609903335571, "learning_rate": 0.0002, "epoch": 5.777777777777778, "step": 2600}, {"loss": 1.2285, "grad_norm": 1.2084497213363647, "learning_rate": 0.0002, "epoch": 5.8, "step": 2610}, {"loss": 1.2004, "grad_norm": 1.04214608669281, "learning_rate": 0.0002, "epoch": 5.822222222222222, "step": 2620}, {"loss": 1.2388, "grad_norm": 1.0671849250793457, "learning_rate": 0.0002, "epoch": 5.844444444444444, "step": 2630}, {"loss": 1.1714, "grad_norm": 1.009602427482605, "learning_rate": 0.0002, "epoch": 5.866666666666667, "step": 2640}, {"loss": 1.2292, "grad_norm": 0.9787904024124146, "learning_rate": 0.0002, "epoch": 5.888888888888889, "step": 2650}, {"loss": 1.2404, "grad_norm": 1.0043761730194092, "learning_rate": 0.0002, "epoch": 5.911111111111111, "step": 2660}, {"loss": 1.2712, "grad_norm": 0.9855443239212036, "learning_rate": 0.0002, "epoch": 5.933333333333334, "step": 2670}, {"loss": 1.3112, "grad_norm": 1.1488507986068726, "learning_rate": 0.0002, "epoch": 5.955555555555556, "step": 2680}, {"loss": 1.2576, "grad_norm": 0.9939966797828674, "learning_rate": 0.0002, "epoch": 5.977777777777778, "step": 2690}, {"loss": 1.2847, "grad_norm": 1.0444952249526978, "learning_rate": 0.0002, "epoch": 6.0, "step": 2700}, {"eval_loss": 2.0881619453430176, "eval_runtime": 39.6891, "eval_samples_per_second": 12.976, "eval_steps_per_second": 1.638, "epoch": 6.0, "step": 2700}, {"loss": 1.0764, "grad_norm": 1.3728636503219604, "learning_rate": 0.0002, "epoch": 6.022222222222222, "step": 2710}, {"loss": 1.0778, "grad_norm": 1.06633460521698, "learning_rate": 0.0002, "epoch": 6.044444444444444, "step": 2720}, {"loss": 1.0181, "grad_norm": 1.2068440914154053, "learning_rate": 0.0002, "epoch": 6.066666666666666, "step": 2730}, {"loss": 1.0225, "grad_norm": 1.248744010925293, "learning_rate": 0.0002, "epoch": 6.088888888888889, "step": 2740}, {"loss": 1.0885, "grad_norm": 1.1814687252044678, "learning_rate": 0.0002, "epoch": 6.111111111111111, "step": 2750}, {"loss": 0.973, "grad_norm": 1.2335790395736694, "learning_rate": 0.0002, "epoch": 6.133333333333334, "step": 2760}, {"loss": 1.0193, "grad_norm": 1.0661171674728394, "learning_rate": 0.0002, "epoch": 6.155555555555556, "step": 2770}, {"loss": 1.0496, "grad_norm": 1.345876932144165, "learning_rate": 0.0002, "epoch": 6.177777777777778, "step": 2780}, {"loss": 1.0252, "grad_norm": 1.2426252365112305, "learning_rate": 0.0002, "epoch": 6.2, "step": 2790}, {"loss": 1.0075, "grad_norm": 1.1970592737197876, "learning_rate": 0.0002, "epoch": 6.222222222222222, "step": 2800}, {"loss": 1.1016, "grad_norm": 1.2484612464904785, "learning_rate": 0.0002, "epoch": 6.2444444444444445, "step": 2810}, {"loss": 1.0032, "grad_norm": 1.2115106582641602, "learning_rate": 0.0002, "epoch": 6.266666666666667, "step": 2820}, {"loss": 1.0721, "grad_norm": 1.0024933815002441, "learning_rate": 0.0002, "epoch": 6.288888888888889, "step": 2830}, {"loss": 1.0705, "grad_norm": 1.1508114337921143, "learning_rate": 0.0002, "epoch": 6.311111111111111, "step": 2840}, {"loss": 1.0632, "grad_norm": 1.1686254739761353, "learning_rate": 0.0002, "epoch": 6.333333333333333, "step": 2850}, {"loss": 1.1031, "grad_norm": 1.2702640295028687, "learning_rate": 0.0002, "epoch": 6.355555555555555, "step": 2860}, {"loss": 1.1033, "grad_norm": 1.3344615697860718, "learning_rate": 0.0002, "epoch": 6.377777777777778, "step": 2870}, {"loss": 1.1105, "grad_norm": 1.27545964717865, "learning_rate": 0.0002, "epoch": 6.4, "step": 2880}, {"loss": 1.0353, "grad_norm": 1.2365739345550537, "learning_rate": 0.0002, "epoch": 6.4222222222222225, "step": 2890}, {"loss": 1.046, "grad_norm": 1.3821545839309692, "learning_rate": 0.0002, "epoch": 6.444444444444445, "step": 2900}, {"loss": 1.0643, "grad_norm": 1.1889359951019287, "learning_rate": 0.0002, "epoch": 6.466666666666667, "step": 2910}, {"loss": 1.0173, "grad_norm": 1.1324981451034546, "learning_rate": 0.0002, "epoch": 6.488888888888889, "step": 2920}, {"loss": 1.0474, "grad_norm": 1.154468297958374, "learning_rate": 0.0002, "epoch": 6.511111111111111, "step": 2930}, {"loss": 1.1323, "grad_norm": 1.211300253868103, "learning_rate": 0.0002, "epoch": 6.533333333333333, "step": 2940}, {"loss": 1.0901, "grad_norm": 1.3322433233261108, "learning_rate": 0.0002, "epoch": 6.555555555555555, "step": 2950}, {"loss": 1.0636, "grad_norm": 1.2570568323135376, "learning_rate": 0.0002, "epoch": 6.5777777777777775, "step": 2960}, {"loss": 1.1093, "grad_norm": 1.2037729024887085, "learning_rate": 0.0002, "epoch": 6.6, "step": 2970}, {"loss": 1.0355, "grad_norm": 1.2894154787063599, "learning_rate": 0.0002, "epoch": 6.622222222222222, "step": 2980}, {"loss": 0.9846, "grad_norm": 1.1682062149047852, "learning_rate": 0.0002, "epoch": 6.644444444444445, "step": 2990}, {"loss": 1.1292, "grad_norm": 1.6112759113311768, "learning_rate": 0.0002, "epoch": 6.666666666666667, "step": 3000}, {"loss": 1.1831, "grad_norm": 1.227586269378662, "learning_rate": 0.0002, "epoch": 6.688888888888889, "step": 3010}, {"loss": 1.1656, "grad_norm": 1.2558735609054565, "learning_rate": 0.0002, "epoch": 6.711111111111111, "step": 3020}, {"loss": 1.1151, "grad_norm": 1.2739307880401611, "learning_rate": 0.0002, "epoch": 6.733333333333333, "step": 3030}, {"loss": 1.0957, "grad_norm": 1.2761014699935913, "learning_rate": 0.0002, "epoch": 6.7555555555555555, "step": 3040}, {"loss": 1.0863, "grad_norm": 1.308904767036438, "learning_rate": 0.0002, "epoch": 6.777777777777778, "step": 3050}, {"loss": 1.1072, "grad_norm": 1.6273704767227173, "learning_rate": 0.0002, "epoch": 6.8, "step": 3060}, {"loss": 1.0982, "grad_norm": 1.3006200790405273, "learning_rate": 0.0002, "epoch": 6.822222222222222, "step": 3070}, {"loss": 1.091, "grad_norm": 1.2942757606506348, "learning_rate": 0.0002, "epoch": 6.844444444444444, "step": 3080}, {"loss": 1.0371, "grad_norm": 1.3074650764465332, "learning_rate": 0.0002, "epoch": 6.866666666666667, "step": 3090}, {"loss": 1.0782, "grad_norm": 1.321811556816101, "learning_rate": 0.0002, "epoch": 6.888888888888889, "step": 3100}, {"loss": 1.1375, "grad_norm": 1.0926110744476318, "learning_rate": 0.0002, "epoch": 6.911111111111111, "step": 3110}, {"loss": 1.0966, "grad_norm": 1.3839191198349, "learning_rate": 0.0002, "epoch": 6.933333333333334, "step": 3120}, {"loss": 1.111, "grad_norm": 1.084396481513977, "learning_rate": 0.0002, "epoch": 6.955555555555556, "step": 3130}, {"loss": 1.0947, "grad_norm": 1.262983798980713, "learning_rate": 0.0002, "epoch": 6.977777777777778, "step": 3140}, {"loss": 1.099, "grad_norm": 1.1751209497451782, "learning_rate": 0.0002, "epoch": 7.0, "step": 3150}]} +{"epoch": 8.0, "step": 3600, "epoch_duration": 1178.3437876701355, "total_accumulated_duration": 4965.196889162064, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-0/checkpoint-900", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5874, "grad_norm": 0.5109436511993408, "learning_rate": 0.0002, "epoch": 0.022222222222222223, "step": 10}, {"loss": 2.2952, "grad_norm": 0.4870035946369171, "learning_rate": 0.0002, "epoch": 0.044444444444444446, "step": 20}, {"loss": 2.0554, "grad_norm": 0.535464882850647, "learning_rate": 0.0002, "epoch": 0.06666666666666667, "step": 30}, {"loss": 2.0067, "grad_norm": 0.49077996611595154, "learning_rate": 0.0002, "epoch": 0.08888888888888889, "step": 40}, {"loss": 2.0673, "grad_norm": 0.4671357572078705, "learning_rate": 0.0002, "epoch": 0.1111111111111111, "step": 50}, {"loss": 1.8751, "grad_norm": 0.4970313608646393, "learning_rate": 0.0002, "epoch": 0.13333333333333333, "step": 60}, {"loss": 1.9419, "grad_norm": 0.4438260495662689, "learning_rate": 0.0002, "epoch": 0.15555555555555556, "step": 70}, {"loss": 1.9856, "grad_norm": 0.5089705586433411, "learning_rate": 0.0002, "epoch": 0.17777777777777778, "step": 80}, {"loss": 1.8805, "grad_norm": 0.4645078182220459, "learning_rate": 0.0002, "epoch": 0.2, "step": 90}, {"loss": 1.807, "grad_norm": 0.46095192432403564, "learning_rate": 0.0002, "epoch": 0.2222222222222222, "step": 100}, {"loss": 1.891, "grad_norm": 0.43338075280189514, "learning_rate": 0.0002, "epoch": 0.24444444444444444, "step": 110}, {"loss": 1.84, "grad_norm": 0.4433900713920593, "learning_rate": 0.0002, "epoch": 0.26666666666666666, "step": 120}, {"loss": 1.8895, "grad_norm": 0.7018499970436096, "learning_rate": 0.0002, "epoch": 0.28888888888888886, "step": 130}, {"loss": 1.8273, "grad_norm": 0.37056994438171387, "learning_rate": 0.0002, "epoch": 0.3111111111111111, "step": 140}, {"loss": 1.8059, "grad_norm": 0.40634623169898987, "learning_rate": 0.0002, "epoch": 0.3333333333333333, "step": 150}, {"loss": 1.8393, "grad_norm": 0.41917353868484497, "learning_rate": 0.0002, "epoch": 0.35555555555555557, "step": 160}, {"loss": 1.8658, "grad_norm": 0.42392489314079285, "learning_rate": 0.0002, "epoch": 0.37777777777777777, "step": 170}, {"loss": 1.8617, "grad_norm": 0.4281010627746582, "learning_rate": 0.0002, "epoch": 0.4, "step": 180}, {"loss": 1.8163, "grad_norm": 0.38542497158050537, "learning_rate": 0.0002, "epoch": 0.4222222222222222, "step": 190}, {"loss": 1.8951, "grad_norm": 0.36003032326698303, "learning_rate": 0.0002, "epoch": 0.4444444444444444, "step": 200}, {"loss": 1.8611, "grad_norm": 0.37858229875564575, "learning_rate": 0.0002, "epoch": 0.4666666666666667, "step": 210}, {"loss": 1.8078, "grad_norm": 0.49986031651496887, "learning_rate": 0.0002, "epoch": 0.4888888888888889, "step": 220}, {"loss": 1.7413, "grad_norm": 0.3937094807624817, "learning_rate": 0.0002, "epoch": 0.5111111111111111, "step": 230}, {"loss": 1.8386, "grad_norm": 0.4566134512424469, "learning_rate": 0.0002, "epoch": 0.5333333333333333, "step": 240}, {"loss": 1.8303, "grad_norm": 0.3602476418018341, "learning_rate": 0.0002, "epoch": 0.5555555555555556, "step": 250}, {"loss": 1.7963, "grad_norm": 0.36321184039115906, "learning_rate": 0.0002, "epoch": 0.5777777777777777, "step": 260}, {"loss": 1.9055, "grad_norm": 0.3808199167251587, "learning_rate": 0.0002, "epoch": 0.6, "step": 270}, {"loss": 1.7916, "grad_norm": 0.38910621404647827, "learning_rate": 0.0002, "epoch": 0.6222222222222222, "step": 280}, {"loss": 1.7958, "grad_norm": 0.31913551688194275, "learning_rate": 0.0002, "epoch": 0.6444444444444445, "step": 290}, {"loss": 1.7468, "grad_norm": 0.34734025597572327, "learning_rate": 0.0002, "epoch": 0.6666666666666666, "step": 300}, {"loss": 1.8092, "grad_norm": 0.3517725467681885, "learning_rate": 0.0002, "epoch": 0.6888888888888889, "step": 310}, {"loss": 1.7847, "grad_norm": 0.3804526627063751, "learning_rate": 0.0002, "epoch": 0.7111111111111111, "step": 320}, {"loss": 1.8131, "grad_norm": 0.5592505931854248, "learning_rate": 0.0002, "epoch": 0.7333333333333333, "step": 330}, {"loss": 1.8827, "grad_norm": 0.36154472827911377, "learning_rate": 0.0002, "epoch": 0.7555555555555555, "step": 340}, {"loss": 1.7764, "grad_norm": 0.43970227241516113, "learning_rate": 0.0002, "epoch": 0.7777777777777778, "step": 350}, {"loss": 1.8333, "grad_norm": 0.3525223731994629, "learning_rate": 0.0002, "epoch": 0.8, "step": 360}, {"loss": 1.7802, "grad_norm": 0.3706997036933899, "learning_rate": 0.0002, "epoch": 0.8222222222222222, "step": 370}, {"loss": 1.7937, "grad_norm": 0.34138166904449463, "learning_rate": 0.0002, "epoch": 0.8444444444444444, "step": 380}, {"loss": 1.8251, "grad_norm": 0.4090622365474701, "learning_rate": 0.0002, "epoch": 0.8666666666666667, "step": 390}, {"loss": 1.7685, "grad_norm": 0.3729974031448364, "learning_rate": 0.0002, "epoch": 0.8888888888888888, "step": 400}, {"loss": 1.836, "grad_norm": 0.3742152452468872, "learning_rate": 0.0002, "epoch": 0.9111111111111111, "step": 410}, {"loss": 1.7998, "grad_norm": 0.37685129046440125, "learning_rate": 0.0002, "epoch": 0.9333333333333333, "step": 420}, {"loss": 1.7601, "grad_norm": 0.34339913725852966, "learning_rate": 0.0002, "epoch": 0.9555555555555556, "step": 430}, {"loss": 1.7651, "grad_norm": 0.31139856576919556, "learning_rate": 0.0002, "epoch": 0.9777777777777777, "step": 440}, {"loss": 1.8471, "grad_norm": 0.3577502965927124, "learning_rate": 0.0002, "epoch": 1.0, "step": 450}, {"eval_loss": 1.8310279846191406, "eval_runtime": 38.8374, "eval_samples_per_second": 13.26, "eval_steps_per_second": 1.674, "epoch": 1.0, "step": 450}, {"loss": 1.7741, "grad_norm": 0.3257788419723511, "learning_rate": 0.0002, "epoch": 1.0222222222222221, "step": 460}, {"loss": 1.8031, "grad_norm": 0.3878970146179199, "learning_rate": 0.0002, "epoch": 1.0444444444444445, "step": 470}, {"loss": 1.701, "grad_norm": 0.364427387714386, "learning_rate": 0.0002, "epoch": 1.0666666666666667, "step": 480}, {"loss": 1.7832, "grad_norm": 0.3374682664871216, "learning_rate": 0.0002, "epoch": 1.0888888888888888, "step": 490}, {"loss": 1.7308, "grad_norm": 0.35822123289108276, "learning_rate": 0.0002, "epoch": 1.1111111111111112, "step": 500}, {"loss": 1.7753, "grad_norm": 0.3748345673084259, "learning_rate": 0.0002, "epoch": 1.1333333333333333, "step": 510}, {"loss": 1.7225, "grad_norm": 0.3422437012195587, "learning_rate": 0.0002, "epoch": 1.1555555555555554, "step": 520}, {"loss": 1.7649, "grad_norm": 0.4289326071739197, "learning_rate": 0.0002, "epoch": 1.1777777777777778, "step": 530}, {"loss": 1.7548, "grad_norm": 0.3706769645214081, "learning_rate": 0.0002, "epoch": 1.2, "step": 540}, {"loss": 1.6988, "grad_norm": 0.4024733603000641, "learning_rate": 0.0002, "epoch": 1.2222222222222223, "step": 550}, {"loss": 1.7641, "grad_norm": 0.3960128128528595, "learning_rate": 0.0002, "epoch": 1.2444444444444445, "step": 560}, {"loss": 1.755, "grad_norm": 0.38222864270210266, "learning_rate": 0.0002, "epoch": 1.2666666666666666, "step": 570}, {"loss": 1.735, "grad_norm": 0.4073713421821594, "learning_rate": 0.0002, "epoch": 1.2888888888888888, "step": 580}, {"loss": 1.7183, "grad_norm": 0.3875499963760376, "learning_rate": 0.0002, "epoch": 1.3111111111111111, "step": 590}, {"loss": 1.7492, "grad_norm": 0.39740806818008423, "learning_rate": 0.0002, "epoch": 1.3333333333333333, "step": 600}, {"loss": 1.8478, "grad_norm": 0.38432490825653076, "learning_rate": 0.0002, "epoch": 1.3555555555555556, "step": 610}, {"loss": 1.7327, "grad_norm": 0.402729868888855, "learning_rate": 0.0002, "epoch": 1.3777777777777778, "step": 620}, {"loss": 1.6634, "grad_norm": 0.36683231592178345, "learning_rate": 0.0002, "epoch": 1.4, "step": 630}, {"loss": 1.8059, "grad_norm": 0.3883286714553833, "learning_rate": 0.0002, "epoch": 1.4222222222222223, "step": 640}, {"loss": 1.7953, "grad_norm": 0.4087409973144531, "learning_rate": 0.0002, "epoch": 1.4444444444444444, "step": 650}, {"loss": 1.7491, "grad_norm": 0.4042017459869385, "learning_rate": 0.0002, "epoch": 1.4666666666666668, "step": 660}, {"loss": 1.7466, "grad_norm": 0.40149256587028503, "learning_rate": 0.0002, "epoch": 1.488888888888889, "step": 670}, {"loss": 1.7398, "grad_norm": 0.45146510004997253, "learning_rate": 0.0002, "epoch": 1.511111111111111, "step": 680}, {"loss": 1.6923, "grad_norm": 0.4098089039325714, "learning_rate": 0.0002, "epoch": 1.5333333333333332, "step": 690}, {"loss": 1.6847, "grad_norm": 0.4181336760520935, "learning_rate": 0.0002, "epoch": 1.5555555555555556, "step": 700}, {"loss": 1.7862, "grad_norm": 1.3722974061965942, "learning_rate": 0.0002, "epoch": 1.5777777777777777, "step": 710}, {"loss": 1.7424, "grad_norm": 0.3965230882167816, "learning_rate": 0.0002, "epoch": 1.6, "step": 720}, {"loss": 1.7782, "grad_norm": 0.3842000663280487, "learning_rate": 0.0002, "epoch": 1.6222222222222222, "step": 730}, {"loss": 1.7682, "grad_norm": 0.3603688180446625, "learning_rate": 0.0002, "epoch": 1.6444444444444444, "step": 740}, {"loss": 1.7476, "grad_norm": 0.39973509311676025, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 750}, {"loss": 1.7329, "grad_norm": 0.3687385618686676, "learning_rate": 0.0002, "epoch": 1.6888888888888889, "step": 760}, {"loss": 1.7987, "grad_norm": 0.4267722964286804, "learning_rate": 0.0002, "epoch": 1.7111111111111112, "step": 770}, {"loss": 1.8041, "grad_norm": 0.41301295161247253, "learning_rate": 0.0002, "epoch": 1.7333333333333334, "step": 780}, {"loss": 1.7284, "grad_norm": 0.3945430517196655, "learning_rate": 0.0002, "epoch": 1.7555555555555555, "step": 790}, {"loss": 1.6691, "grad_norm": 0.4037930965423584, "learning_rate": 0.0002, "epoch": 1.7777777777777777, "step": 800}, {"loss": 1.6874, "grad_norm": 0.406893253326416, "learning_rate": 0.0002, "epoch": 1.8, "step": 810}, {"loss": 1.7628, "grad_norm": 0.4600457549095154, "learning_rate": 0.0002, "epoch": 1.8222222222222222, "step": 820}, {"loss": 1.8222, "grad_norm": 0.4195384085178375, "learning_rate": 0.0002, "epoch": 1.8444444444444446, "step": 830}, {"loss": 1.7123, "grad_norm": 0.3854130506515503, "learning_rate": 0.0002, "epoch": 1.8666666666666667, "step": 840}, {"loss": 1.7227, "grad_norm": 0.38279038667678833, "learning_rate": 0.0002, "epoch": 1.8888888888888888, "step": 850}, {"loss": 1.6942, "grad_norm": 0.38249439001083374, "learning_rate": 0.0002, "epoch": 1.911111111111111, "step": 860}, {"loss": 1.8058, "grad_norm": 0.42977792024612427, "learning_rate": 0.0002, "epoch": 1.9333333333333333, "step": 870}, {"loss": 1.713, "grad_norm": 0.4109351933002472, "learning_rate": 0.0002, "epoch": 1.9555555555555557, "step": 880}, {"loss": 1.7035, "grad_norm": 0.3734486699104309, "learning_rate": 0.0002, "epoch": 1.9777777777777779, "step": 890}, {"loss": 1.7502, "grad_norm": 0.3603087067604065, "learning_rate": 0.0002, "epoch": 2.0, "step": 900}, {"eval_loss": 1.8250652551651, "eval_runtime": 38.8657, "eval_samples_per_second": 13.251, "eval_steps_per_second": 1.672, "epoch": 2.0, "step": 900}, {"loss": 1.6063, "grad_norm": 0.4014144241809845, "learning_rate": 0.0002, "epoch": 2.022222222222222, "step": 910}, {"loss": 1.6604, "grad_norm": 0.4338063597679138, "learning_rate": 0.0002, "epoch": 2.0444444444444443, "step": 920}, {"loss": 1.6354, "grad_norm": 0.3693605065345764, "learning_rate": 0.0002, "epoch": 2.066666666666667, "step": 930}, {"loss": 1.6186, "grad_norm": 0.4040255844593048, "learning_rate": 0.0002, "epoch": 2.088888888888889, "step": 940}, {"loss": 1.5976, "grad_norm": 0.43481820821762085, "learning_rate": 0.0002, "epoch": 2.111111111111111, "step": 950}, {"loss": 1.6081, "grad_norm": 0.41632869839668274, "learning_rate": 0.0002, "epoch": 2.1333333333333333, "step": 960}, {"loss": 1.6586, "grad_norm": 0.4633755385875702, "learning_rate": 0.0002, "epoch": 2.1555555555555554, "step": 970}, {"loss": 1.7396, "grad_norm": 0.43926581740379333, "learning_rate": 0.0002, "epoch": 2.1777777777777776, "step": 980}, {"loss": 1.5909, "grad_norm": 0.4757233262062073, "learning_rate": 0.0002, "epoch": 2.2, "step": 990}, {"loss": 1.648, "grad_norm": 0.5010586977005005, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 1000}, {"loss": 1.6454, "grad_norm": 0.44900986552238464, "learning_rate": 0.0002, "epoch": 2.2444444444444445, "step": 1010}, {"loss": 1.6493, "grad_norm": 0.41274750232696533, "learning_rate": 0.0002, "epoch": 2.2666666666666666, "step": 1020}, {"loss": 1.6592, "grad_norm": 0.44672393798828125, "learning_rate": 0.0002, "epoch": 2.2888888888888888, "step": 1030}, {"loss": 1.6349, "grad_norm": 0.4826269745826721, "learning_rate": 0.0002, "epoch": 2.311111111111111, "step": 1040}, {"loss": 1.5988, "grad_norm": 0.4650685489177704, "learning_rate": 0.0002, "epoch": 2.3333333333333335, "step": 1050}, {"loss": 1.5859, "grad_norm": 0.42507848143577576, "learning_rate": 0.0002, "epoch": 2.3555555555555556, "step": 1060}, {"loss": 1.5932, "grad_norm": 0.45653030276298523, "learning_rate": 0.0002, "epoch": 2.3777777777777778, "step": 1070}, {"loss": 1.6469, "grad_norm": 0.44534122943878174, "learning_rate": 0.0002, "epoch": 2.4, "step": 1080}, {"loss": 1.6115, "grad_norm": 0.4241289794445038, "learning_rate": 0.0002, "epoch": 2.422222222222222, "step": 1090}, {"loss": 1.6935, "grad_norm": 0.5004808306694031, "learning_rate": 0.0002, "epoch": 2.4444444444444446, "step": 1100}, {"loss": 1.6833, "grad_norm": 0.41425490379333496, "learning_rate": 0.0002, "epoch": 2.466666666666667, "step": 1110}, {"loss": 1.6151, "grad_norm": 0.44362279772758484, "learning_rate": 0.0002, "epoch": 2.488888888888889, "step": 1120}, {"loss": 1.6394, "grad_norm": 0.5530985593795776, "learning_rate": 0.0002, "epoch": 2.511111111111111, "step": 1130}, {"loss": 1.64, "grad_norm": 0.4290637969970703, "learning_rate": 0.0002, "epoch": 2.533333333333333, "step": 1140}, {"loss": 1.76, "grad_norm": 0.4957487881183624, "learning_rate": 0.0002, "epoch": 2.5555555555555554, "step": 1150}, {"loss": 1.613, "grad_norm": 0.5082747340202332, "learning_rate": 0.0002, "epoch": 2.5777777777777775, "step": 1160}, {"loss": 1.6702, "grad_norm": 0.478722482919693, "learning_rate": 0.0002, "epoch": 2.6, "step": 1170}, {"loss": 1.6198, "grad_norm": 0.436454176902771, "learning_rate": 0.0002, "epoch": 2.6222222222222222, "step": 1180}, {"loss": 1.663, "grad_norm": 0.4905032515525818, "learning_rate": 0.0002, "epoch": 2.6444444444444444, "step": 1190}, {"loss": 1.6216, "grad_norm": 0.4815700054168701, "learning_rate": 0.0002, "epoch": 2.6666666666666665, "step": 1200}, {"loss": 1.6994, "grad_norm": 0.3965534269809723, "learning_rate": 0.0002, "epoch": 2.688888888888889, "step": 1210}, {"loss": 1.744, "grad_norm": 0.43282169103622437, "learning_rate": 0.0002, "epoch": 2.7111111111111112, "step": 1220}, {"loss": 1.6455, "grad_norm": 0.45512479543685913, "learning_rate": 0.0002, "epoch": 2.7333333333333334, "step": 1230}, {"loss": 1.6444, "grad_norm": 0.44370076060295105, "learning_rate": 0.0002, "epoch": 2.7555555555555555, "step": 1240}, {"loss": 1.7106, "grad_norm": 0.4750686287879944, "learning_rate": 0.0002, "epoch": 2.7777777777777777, "step": 1250}, {"loss": 1.7122, "grad_norm": 0.41953766345977783, "learning_rate": 0.0002, "epoch": 2.8, "step": 1260}, {"loss": 1.6203, "grad_norm": 0.4887140095233917, "learning_rate": 0.0002, "epoch": 2.822222222222222, "step": 1270}, {"loss": 1.6691, "grad_norm": 0.46718958020210266, "learning_rate": 0.0002, "epoch": 2.8444444444444446, "step": 1280}, {"loss": 1.6257, "grad_norm": 0.48510900139808655, "learning_rate": 0.0002, "epoch": 2.8666666666666667, "step": 1290}, {"loss": 1.6825, "grad_norm": 0.4504084289073944, "learning_rate": 0.0002, "epoch": 2.888888888888889, "step": 1300}, {"loss": 1.7453, "grad_norm": 0.42119622230529785, "learning_rate": 0.0002, "epoch": 2.911111111111111, "step": 1310}, {"loss": 1.6662, "grad_norm": 0.4763694107532501, "learning_rate": 0.0002, "epoch": 2.9333333333333336, "step": 1320}, {"loss": 1.6151, "grad_norm": 0.422810822725296, "learning_rate": 0.0002, "epoch": 2.9555555555555557, "step": 1330}, {"loss": 1.6578, "grad_norm": 0.4768871068954468, "learning_rate": 0.0002, "epoch": 2.977777777777778, "step": 1340}, {"loss": 1.6167, "grad_norm": 0.48259881138801575, "learning_rate": 0.0002, "epoch": 3.0, "step": 1350}, {"eval_loss": 1.8452560901641846, "eval_runtime": 38.8621, "eval_samples_per_second": 13.252, "eval_steps_per_second": 1.673, "epoch": 3.0, "step": 1350}, {"loss": 1.5351, "grad_norm": 0.6933313012123108, "learning_rate": 0.0002, "epoch": 3.022222222222222, "step": 1360}, {"loss": 1.5542, "grad_norm": 0.5870710611343384, "learning_rate": 0.0002, "epoch": 3.0444444444444443, "step": 1370}, {"loss": 1.511, "grad_norm": 0.602210283279419, "learning_rate": 0.0002, "epoch": 3.066666666666667, "step": 1380}, {"loss": 1.5272, "grad_norm": 0.6461787819862366, "learning_rate": 0.0002, "epoch": 3.088888888888889, "step": 1390}, {"loss": 1.4813, "grad_norm": 0.5839587450027466, "learning_rate": 0.0002, "epoch": 3.111111111111111, "step": 1400}, {"loss": 1.505, "grad_norm": 0.5757876038551331, "learning_rate": 0.0002, "epoch": 3.1333333333333333, "step": 1410}, {"loss": 1.4963, "grad_norm": 0.5862616300582886, "learning_rate": 0.0002, "epoch": 3.1555555555555554, "step": 1420}, {"loss": 1.5144, "grad_norm": 0.6103630065917969, "learning_rate": 0.0002, "epoch": 3.1777777777777776, "step": 1430}, {"loss": 1.5406, "grad_norm": 0.9309254884719849, "learning_rate": 0.0002, "epoch": 3.2, "step": 1440}, {"loss": 1.487, "grad_norm": 0.5360018014907837, "learning_rate": 0.0002, "epoch": 3.2222222222222223, "step": 1450}, {"loss": 1.5659, "grad_norm": 0.5448758602142334, "learning_rate": 0.0002, "epoch": 3.2444444444444445, "step": 1460}, {"loss": 1.5595, "grad_norm": 0.5973812341690063, "learning_rate": 0.0002, "epoch": 3.2666666666666666, "step": 1470}, {"loss": 1.5223, "grad_norm": 0.6245622038841248, "learning_rate": 0.0002, "epoch": 3.2888888888888888, "step": 1480}, {"loss": 1.4795, "grad_norm": 0.6533768773078918, "learning_rate": 0.0002, "epoch": 3.311111111111111, "step": 1490}, {"loss": 1.5562, "grad_norm": 0.5765811204910278, "learning_rate": 0.0002, "epoch": 3.3333333333333335, "step": 1500}, {"loss": 1.5405, "grad_norm": 0.591395378112793, "learning_rate": 0.0002, "epoch": 3.3555555555555556, "step": 1510}, {"loss": 1.5658, "grad_norm": 0.5842425227165222, "learning_rate": 0.0002, "epoch": 3.3777777777777778, "step": 1520}, {"loss": 1.5065, "grad_norm": 0.5731365084648132, "learning_rate": 0.0002, "epoch": 3.4, "step": 1530}, {"loss": 1.5438, "grad_norm": 0.5841306447982788, "learning_rate": 0.0002, "epoch": 3.422222222222222, "step": 1540}, {"loss": 1.4922, "grad_norm": 0.6503536701202393, "learning_rate": 0.0002, "epoch": 3.4444444444444446, "step": 1550}, {"loss": 1.5493, "grad_norm": 0.6170967221260071, "learning_rate": 0.0002, "epoch": 3.466666666666667, "step": 1560}, {"loss": 1.5098, "grad_norm": 0.5576487183570862, "learning_rate": 0.0002, "epoch": 3.488888888888889, "step": 1570}, {"loss": 1.472, "grad_norm": 0.7082911133766174, "learning_rate": 0.0002, "epoch": 3.511111111111111, "step": 1580}, {"loss": 1.5594, "grad_norm": 0.6159376502037048, "learning_rate": 0.0002, "epoch": 3.533333333333333, "step": 1590}, {"loss": 1.563, "grad_norm": 0.5972959399223328, "learning_rate": 0.0002, "epoch": 3.5555555555555554, "step": 1600}, {"loss": 1.4876, "grad_norm": 0.5787310004234314, "learning_rate": 0.0002, "epoch": 3.5777777777777775, "step": 1610}, {"loss": 1.4887, "grad_norm": 0.5846341252326965, "learning_rate": 0.0002, "epoch": 3.6, "step": 1620}, {"loss": 1.542, "grad_norm": 0.5906197428703308, "learning_rate": 0.0002, "epoch": 3.6222222222222222, "step": 1630}, {"loss": 1.4941, "grad_norm": 0.6305760145187378, "learning_rate": 0.0002, "epoch": 3.6444444444444444, "step": 1640}, {"loss": 1.4677, "grad_norm": 0.7448979616165161, "learning_rate": 0.0002, "epoch": 3.6666666666666665, "step": 1650}, {"loss": 1.5961, "grad_norm": 0.5906165242195129, "learning_rate": 0.0002, "epoch": 3.688888888888889, "step": 1660}, {"loss": 1.4882, "grad_norm": 0.605032742023468, "learning_rate": 0.0002, "epoch": 3.7111111111111112, "step": 1670}, {"loss": 1.5804, "grad_norm": 0.6117229461669922, "learning_rate": 0.0002, "epoch": 3.7333333333333334, "step": 1680}, {"loss": 1.5131, "grad_norm": 0.613581120967865, "learning_rate": 0.0002, "epoch": 3.7555555555555555, "step": 1690}, {"loss": 1.5074, "grad_norm": 0.6244436502456665, "learning_rate": 0.0002, "epoch": 3.7777777777777777, "step": 1700}, {"loss": 1.5738, "grad_norm": 0.6236702799797058, "learning_rate": 0.0002, "epoch": 3.8, "step": 1710}, {"loss": 1.6542, "grad_norm": 0.639141857624054, "learning_rate": 0.0002, "epoch": 3.822222222222222, "step": 1720}, {"loss": 1.536, "grad_norm": 0.5782344937324524, "learning_rate": 0.0002, "epoch": 3.8444444444444446, "step": 1730}, {"loss": 1.5355, "grad_norm": 0.5952938795089722, "learning_rate": 0.0002, "epoch": 3.8666666666666667, "step": 1740}, {"loss": 1.5205, "grad_norm": 0.5573042035102844, "learning_rate": 0.0002, "epoch": 3.888888888888889, "step": 1750}, {"loss": 1.5066, "grad_norm": 0.6114351749420166, "learning_rate": 0.0002, "epoch": 3.911111111111111, "step": 1760}, {"loss": 1.5706, "grad_norm": 0.5973817110061646, "learning_rate": 0.0002, "epoch": 3.9333333333333336, "step": 1770}, {"loss": 1.5003, "grad_norm": 0.602317750453949, "learning_rate": 0.0002, "epoch": 3.9555555555555557, "step": 1780}, {"loss": 1.5022, "grad_norm": 0.5965437293052673, "learning_rate": 0.0002, "epoch": 3.977777777777778, "step": 1790}, {"loss": 1.5031, "grad_norm": 0.5641552209854126, "learning_rate": 0.0002, "epoch": 4.0, "step": 1800}, {"eval_loss": 1.892098069190979, "eval_runtime": 38.8755, "eval_samples_per_second": 13.247, "eval_steps_per_second": 1.672, "epoch": 4.0, "step": 1800}, {"loss": 1.3894, "grad_norm": 0.8302594423294067, "learning_rate": 0.0002, "epoch": 4.022222222222222, "step": 1810}, {"loss": 1.3727, "grad_norm": 0.6695230603218079, "learning_rate": 0.0002, "epoch": 4.044444444444444, "step": 1820}, {"loss": 1.3064, "grad_norm": 0.7911471128463745, "learning_rate": 0.0002, "epoch": 4.066666666666666, "step": 1830}, {"loss": 1.4574, "grad_norm": 0.7044888138771057, "learning_rate": 0.0002, "epoch": 4.088888888888889, "step": 1840}, {"loss": 1.3941, "grad_norm": 0.7057249546051025, "learning_rate": 0.0002, "epoch": 4.111111111111111, "step": 1850}, {"loss": 1.4052, "grad_norm": 0.8762815594673157, "learning_rate": 0.0002, "epoch": 4.133333333333334, "step": 1860}, {"loss": 1.3784, "grad_norm": 0.7619158029556274, "learning_rate": 0.0002, "epoch": 4.155555555555556, "step": 1870}, {"loss": 1.3581, "grad_norm": 0.7711658477783203, "learning_rate": 0.0002, "epoch": 4.177777777777778, "step": 1880}, {"loss": 1.3995, "grad_norm": 0.9732598662376404, "learning_rate": 0.0002, "epoch": 4.2, "step": 1890}, {"loss": 1.3353, "grad_norm": 0.9070265889167786, "learning_rate": 0.0002, "epoch": 4.222222222222222, "step": 1900}, {"loss": 1.3947, "grad_norm": 0.8274767994880676, "learning_rate": 0.0002, "epoch": 4.2444444444444445, "step": 1910}, {"loss": 1.3392, "grad_norm": 0.8514227271080017, "learning_rate": 0.0002, "epoch": 4.266666666666667, "step": 1920}, {"loss": 1.3492, "grad_norm": 0.7356534600257874, "learning_rate": 0.0002, "epoch": 4.288888888888889, "step": 1930}, {"loss": 1.3708, "grad_norm": 0.8226608037948608, "learning_rate": 0.0002, "epoch": 4.311111111111111, "step": 1940}, {"loss": 1.3652, "grad_norm": 0.8347907066345215, "learning_rate": 0.0002, "epoch": 4.333333333333333, "step": 1950}, {"loss": 1.3415, "grad_norm": 0.8509323000907898, "learning_rate": 0.0002, "epoch": 4.355555555555555, "step": 1960}, {"loss": 1.3796, "grad_norm": 0.8776063323020935, "learning_rate": 0.0002, "epoch": 4.377777777777778, "step": 1970}, {"loss": 1.438, "grad_norm": 0.8022271990776062, "learning_rate": 0.0002, "epoch": 4.4, "step": 1980}, {"loss": 1.3671, "grad_norm": 0.7984752058982849, "learning_rate": 0.0002, "epoch": 4.4222222222222225, "step": 1990}, {"loss": 1.4214, "grad_norm": 0.7349720001220703, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 2000}, {"loss": 1.4174, "grad_norm": 0.7778817415237427, "learning_rate": 0.0002, "epoch": 4.466666666666667, "step": 2010}, {"loss": 1.3365, "grad_norm": 0.9361467361450195, "learning_rate": 0.0002, "epoch": 4.488888888888889, "step": 2020}, {"loss": 1.4129, "grad_norm": 0.7839348912239075, "learning_rate": 0.0002, "epoch": 4.511111111111111, "step": 2030}, {"loss": 1.3761, "grad_norm": 0.8361981511116028, "learning_rate": 0.0002, "epoch": 4.533333333333333, "step": 2040}, {"loss": 1.4085, "grad_norm": 1.9877147674560547, "learning_rate": 0.0002, "epoch": 4.555555555555555, "step": 2050}, {"loss": 1.329, "grad_norm": 0.7506140470504761, "learning_rate": 0.0002, "epoch": 4.5777777777777775, "step": 2060}, {"loss": 1.3557, "grad_norm": 0.9493570327758789, "learning_rate": 0.0002, "epoch": 4.6, "step": 2070}, {"loss": 1.438, "grad_norm": 0.7198925018310547, "learning_rate": 0.0002, "epoch": 4.622222222222222, "step": 2080}, {"loss": 1.3892, "grad_norm": 0.7521472573280334, "learning_rate": 0.0002, "epoch": 4.644444444444445, "step": 2090}, {"loss": 1.3833, "grad_norm": 0.766718327999115, "learning_rate": 0.0002, "epoch": 4.666666666666667, "step": 2100}, {"loss": 1.3541, "grad_norm": 0.9162390232086182, "learning_rate": 0.0002, "epoch": 4.688888888888889, "step": 2110}, {"loss": 1.4603, "grad_norm": 0.8980328440666199, "learning_rate": 0.0002, "epoch": 4.711111111111111, "step": 2120}, {"loss": 1.4043, "grad_norm": 0.8109711408615112, "learning_rate": 0.0002, "epoch": 4.733333333333333, "step": 2130}, {"loss": 1.373, "grad_norm": 0.7372606992721558, "learning_rate": 0.0002, "epoch": 4.7555555555555555, "step": 2140}, {"loss": 1.4439, "grad_norm": 0.7527457475662231, "learning_rate": 0.0002, "epoch": 4.777777777777778, "step": 2150}, {"loss": 1.2999, "grad_norm": 1.0380001068115234, "learning_rate": 0.0002, "epoch": 4.8, "step": 2160}, {"loss": 1.3562, "grad_norm": 0.7166368365287781, "learning_rate": 0.0002, "epoch": 4.822222222222222, "step": 2170}, {"loss": 1.3917, "grad_norm": 0.784548282623291, "learning_rate": 0.0002, "epoch": 4.844444444444444, "step": 2180}, {"loss": 1.3376, "grad_norm": 0.7771317958831787, "learning_rate": 0.0002, "epoch": 4.866666666666667, "step": 2190}, {"loss": 1.3315, "grad_norm": 0.7710300087928772, "learning_rate": 0.0002, "epoch": 4.888888888888889, "step": 2200}, {"loss": 1.3676, "grad_norm": 0.7715084552764893, "learning_rate": 0.0002, "epoch": 4.911111111111111, "step": 2210}, {"loss": 1.5352, "grad_norm": 0.7888006567955017, "learning_rate": 0.0002, "epoch": 4.933333333333334, "step": 2220}, {"loss": 1.4139, "grad_norm": 0.800684928894043, "learning_rate": 0.0002, "epoch": 4.955555555555556, "step": 2230}, {"loss": 1.4343, "grad_norm": 0.7710039019584656, "learning_rate": 0.0002, "epoch": 4.977777777777778, "step": 2240}, {"loss": 1.3501, "grad_norm": 0.8617033958435059, "learning_rate": 0.0002, "epoch": 5.0, "step": 2250}, {"eval_loss": 1.9718151092529297, "eval_runtime": 38.8999, "eval_samples_per_second": 13.239, "eval_steps_per_second": 1.671, "epoch": 5.0, "step": 2250}, {"loss": 1.19, "grad_norm": 1.07399582862854, "learning_rate": 0.0002, "epoch": 5.022222222222222, "step": 2260}, {"loss": 1.2299, "grad_norm": 0.6598460674285889, "learning_rate": 0.0002, "epoch": 5.044444444444444, "step": 2270}, {"loss": 1.2333, "grad_norm": 1.1039506196975708, "learning_rate": 0.0002, "epoch": 5.066666666666666, "step": 2280}, {"loss": 1.2412, "grad_norm": 1.0624054670333862, "learning_rate": 0.0002, "epoch": 5.088888888888889, "step": 2290}, {"loss": 1.184, "grad_norm": 0.849583625793457, "learning_rate": 0.0002, "epoch": 5.111111111111111, "step": 2300}, {"loss": 1.1884, "grad_norm": 1.0143699645996094, "learning_rate": 0.0002, "epoch": 5.133333333333334, "step": 2310}, {"loss": 1.2133, "grad_norm": 0.8990702629089355, "learning_rate": 0.0002, "epoch": 5.155555555555556, "step": 2320}, {"loss": 1.2091, "grad_norm": 0.9822764992713928, "learning_rate": 0.0002, "epoch": 5.177777777777778, "step": 2330}, {"loss": 1.1775, "grad_norm": 0.9632459282875061, "learning_rate": 0.0002, "epoch": 5.2, "step": 2340}, {"loss": 1.1821, "grad_norm": 1.0897903442382812, "learning_rate": 0.0002, "epoch": 5.222222222222222, "step": 2350}, {"loss": 1.2976, "grad_norm": 1.155950665473938, "learning_rate": 0.0002, "epoch": 5.2444444444444445, "step": 2360}, {"loss": 1.1662, "grad_norm": 1.0566821098327637, "learning_rate": 0.0002, "epoch": 5.266666666666667, "step": 2370}, {"loss": 1.2809, "grad_norm": 1.191604733467102, "learning_rate": 0.0002, "epoch": 5.288888888888889, "step": 2380}, {"loss": 1.2431, "grad_norm": 0.852453887462616, "learning_rate": 0.0002, "epoch": 5.311111111111111, "step": 2390}, {"loss": 1.2106, "grad_norm": 0.9649669528007507, "learning_rate": 0.0002, "epoch": 5.333333333333333, "step": 2400}, {"loss": 1.2433, "grad_norm": 1.0731003284454346, "learning_rate": 0.0002, "epoch": 5.355555555555555, "step": 2410}, {"loss": 1.1737, "grad_norm": 0.9628495573997498, "learning_rate": 0.0002, "epoch": 5.377777777777778, "step": 2420}, {"loss": 1.3166, "grad_norm": 0.9268819093704224, "learning_rate": 0.0002, "epoch": 5.4, "step": 2430}, {"loss": 1.2114, "grad_norm": 1.1104000806808472, "learning_rate": 0.0002, "epoch": 5.4222222222222225, "step": 2440}, {"loss": 1.2151, "grad_norm": 1.0439373254776, "learning_rate": 0.0002, "epoch": 5.444444444444445, "step": 2450}, {"loss": 1.2458, "grad_norm": 1.0366657972335815, "learning_rate": 0.0002, "epoch": 5.466666666666667, "step": 2460}, {"loss": 1.2021, "grad_norm": 1.0604808330535889, "learning_rate": 0.0002, "epoch": 5.488888888888889, "step": 2470}, {"loss": 1.2188, "grad_norm": 0.8845253586769104, "learning_rate": 0.0002, "epoch": 5.511111111111111, "step": 2480}, {"loss": 1.2296, "grad_norm": 0.8200256824493408, "learning_rate": 0.0002, "epoch": 5.533333333333333, "step": 2490}, {"loss": 1.2632, "grad_norm": 0.9628723859786987, "learning_rate": 0.0002, "epoch": 5.555555555555555, "step": 2500}, {"loss": 1.2723, "grad_norm": 1.0758650302886963, "learning_rate": 0.0002, "epoch": 5.5777777777777775, "step": 2510}, {"loss": 1.2298, "grad_norm": 1.0113487243652344, "learning_rate": 0.0002, "epoch": 5.6, "step": 2520}, {"loss": 1.2226, "grad_norm": 1.260536551475525, "learning_rate": 0.0002, "epoch": 5.622222222222222, "step": 2530}, {"loss": 1.227, "grad_norm": 0.9229527115821838, "learning_rate": 0.0002, "epoch": 5.644444444444445, "step": 2540}, {"loss": 1.2223, "grad_norm": 0.9378697276115417, "learning_rate": 0.0002, "epoch": 5.666666666666667, "step": 2550}, {"loss": 1.2759, "grad_norm": 1.0404350757598877, "learning_rate": 0.0002, "epoch": 5.688888888888889, "step": 2560}, {"loss": 1.2132, "grad_norm": 1.1879961490631104, "learning_rate": 0.0002, "epoch": 5.711111111111111, "step": 2570}, {"loss": 1.2181, "grad_norm": 0.8881482481956482, "learning_rate": 0.0002, "epoch": 5.733333333333333, "step": 2580}, {"loss": 1.2419, "grad_norm": 1.1428065299987793, "learning_rate": 0.0002, "epoch": 5.7555555555555555, "step": 2590}, {"loss": 1.2682, "grad_norm": 0.8970609903335571, "learning_rate": 0.0002, "epoch": 5.777777777777778, "step": 2600}, {"loss": 1.2285, "grad_norm": 1.2084497213363647, "learning_rate": 0.0002, "epoch": 5.8, "step": 2610}, {"loss": 1.2004, "grad_norm": 1.04214608669281, "learning_rate": 0.0002, "epoch": 5.822222222222222, "step": 2620}, {"loss": 1.2388, "grad_norm": 1.0671849250793457, "learning_rate": 0.0002, "epoch": 5.844444444444444, "step": 2630}, {"loss": 1.1714, "grad_norm": 1.009602427482605, "learning_rate": 0.0002, "epoch": 5.866666666666667, "step": 2640}, {"loss": 1.2292, "grad_norm": 0.9787904024124146, "learning_rate": 0.0002, "epoch": 5.888888888888889, "step": 2650}, {"loss": 1.2404, "grad_norm": 1.0043761730194092, "learning_rate": 0.0002, "epoch": 5.911111111111111, "step": 2660}, {"loss": 1.2712, "grad_norm": 0.9855443239212036, "learning_rate": 0.0002, "epoch": 5.933333333333334, "step": 2670}, {"loss": 1.3112, "grad_norm": 1.1488507986068726, "learning_rate": 0.0002, "epoch": 5.955555555555556, "step": 2680}, {"loss": 1.2576, "grad_norm": 0.9939966797828674, "learning_rate": 0.0002, "epoch": 5.977777777777778, "step": 2690}, {"loss": 1.2847, "grad_norm": 1.0444952249526978, "learning_rate": 0.0002, "epoch": 6.0, "step": 2700}, {"eval_loss": 2.0881619453430176, "eval_runtime": 39.6891, "eval_samples_per_second": 12.976, "eval_steps_per_second": 1.638, "epoch": 6.0, "step": 2700}, {"loss": 1.0764, "grad_norm": 1.3728636503219604, "learning_rate": 0.0002, "epoch": 6.022222222222222, "step": 2710}, {"loss": 1.0778, "grad_norm": 1.06633460521698, "learning_rate": 0.0002, "epoch": 6.044444444444444, "step": 2720}, {"loss": 1.0181, "grad_norm": 1.2068440914154053, "learning_rate": 0.0002, "epoch": 6.066666666666666, "step": 2730}, {"loss": 1.0225, "grad_norm": 1.248744010925293, "learning_rate": 0.0002, "epoch": 6.088888888888889, "step": 2740}, {"loss": 1.0885, "grad_norm": 1.1814687252044678, "learning_rate": 0.0002, "epoch": 6.111111111111111, "step": 2750}, {"loss": 0.973, "grad_norm": 1.2335790395736694, "learning_rate": 0.0002, "epoch": 6.133333333333334, "step": 2760}, {"loss": 1.0193, "grad_norm": 1.0661171674728394, "learning_rate": 0.0002, "epoch": 6.155555555555556, "step": 2770}, {"loss": 1.0496, "grad_norm": 1.345876932144165, "learning_rate": 0.0002, "epoch": 6.177777777777778, "step": 2780}, {"loss": 1.0252, "grad_norm": 1.2426252365112305, "learning_rate": 0.0002, "epoch": 6.2, "step": 2790}, {"loss": 1.0075, "grad_norm": 1.1970592737197876, "learning_rate": 0.0002, "epoch": 6.222222222222222, "step": 2800}, {"loss": 1.1016, "grad_norm": 1.2484612464904785, "learning_rate": 0.0002, "epoch": 6.2444444444444445, "step": 2810}, {"loss": 1.0032, "grad_norm": 1.2115106582641602, "learning_rate": 0.0002, "epoch": 6.266666666666667, "step": 2820}, {"loss": 1.0721, "grad_norm": 1.0024933815002441, "learning_rate": 0.0002, "epoch": 6.288888888888889, "step": 2830}, {"loss": 1.0705, "grad_norm": 1.1508114337921143, "learning_rate": 0.0002, "epoch": 6.311111111111111, "step": 2840}, {"loss": 1.0632, "grad_norm": 1.1686254739761353, "learning_rate": 0.0002, "epoch": 6.333333333333333, "step": 2850}, {"loss": 1.1031, "grad_norm": 1.2702640295028687, "learning_rate": 0.0002, "epoch": 6.355555555555555, "step": 2860}, {"loss": 1.1033, "grad_norm": 1.3344615697860718, "learning_rate": 0.0002, "epoch": 6.377777777777778, "step": 2870}, {"loss": 1.1105, "grad_norm": 1.27545964717865, "learning_rate": 0.0002, "epoch": 6.4, "step": 2880}, {"loss": 1.0353, "grad_norm": 1.2365739345550537, "learning_rate": 0.0002, "epoch": 6.4222222222222225, "step": 2890}, {"loss": 1.046, "grad_norm": 1.3821545839309692, "learning_rate": 0.0002, "epoch": 6.444444444444445, "step": 2900}, {"loss": 1.0643, "grad_norm": 1.1889359951019287, "learning_rate": 0.0002, "epoch": 6.466666666666667, "step": 2910}, {"loss": 1.0173, "grad_norm": 1.1324981451034546, "learning_rate": 0.0002, "epoch": 6.488888888888889, "step": 2920}, {"loss": 1.0474, "grad_norm": 1.154468297958374, "learning_rate": 0.0002, "epoch": 6.511111111111111, "step": 2930}, {"loss": 1.1323, "grad_norm": 1.211300253868103, "learning_rate": 0.0002, "epoch": 6.533333333333333, "step": 2940}, {"loss": 1.0901, "grad_norm": 1.3322433233261108, "learning_rate": 0.0002, "epoch": 6.555555555555555, "step": 2950}, {"loss": 1.0636, "grad_norm": 1.2570568323135376, "learning_rate": 0.0002, "epoch": 6.5777777777777775, "step": 2960}, {"loss": 1.1093, "grad_norm": 1.2037729024887085, "learning_rate": 0.0002, "epoch": 6.6, "step": 2970}, {"loss": 1.0355, "grad_norm": 1.2894154787063599, "learning_rate": 0.0002, "epoch": 6.622222222222222, "step": 2980}, {"loss": 0.9846, "grad_norm": 1.1682062149047852, "learning_rate": 0.0002, "epoch": 6.644444444444445, "step": 2990}, {"loss": 1.1292, "grad_norm": 1.6112759113311768, "learning_rate": 0.0002, "epoch": 6.666666666666667, "step": 3000}, {"loss": 1.1831, "grad_norm": 1.227586269378662, "learning_rate": 0.0002, "epoch": 6.688888888888889, "step": 3010}, {"loss": 1.1656, "grad_norm": 1.2558735609054565, "learning_rate": 0.0002, "epoch": 6.711111111111111, "step": 3020}, {"loss": 1.1151, "grad_norm": 1.2739307880401611, "learning_rate": 0.0002, "epoch": 6.733333333333333, "step": 3030}, {"loss": 1.0957, "grad_norm": 1.2761014699935913, "learning_rate": 0.0002, "epoch": 6.7555555555555555, "step": 3040}, {"loss": 1.0863, "grad_norm": 1.308904767036438, "learning_rate": 0.0002, "epoch": 6.777777777777778, "step": 3050}, {"loss": 1.1072, "grad_norm": 1.6273704767227173, "learning_rate": 0.0002, "epoch": 6.8, "step": 3060}, {"loss": 1.0982, "grad_norm": 1.3006200790405273, "learning_rate": 0.0002, "epoch": 6.822222222222222, "step": 3070}, {"loss": 1.091, "grad_norm": 1.2942757606506348, "learning_rate": 0.0002, "epoch": 6.844444444444444, "step": 3080}, {"loss": 1.0371, "grad_norm": 1.3074650764465332, "learning_rate": 0.0002, "epoch": 6.866666666666667, "step": 3090}, {"loss": 1.0782, "grad_norm": 1.321811556816101, "learning_rate": 0.0002, "epoch": 6.888888888888889, "step": 3100}, {"loss": 1.1375, "grad_norm": 1.0926110744476318, "learning_rate": 0.0002, "epoch": 6.911111111111111, "step": 3110}, {"loss": 1.0966, "grad_norm": 1.3839191198349, "learning_rate": 0.0002, "epoch": 6.933333333333334, "step": 3120}, {"loss": 1.111, "grad_norm": 1.084396481513977, "learning_rate": 0.0002, "epoch": 6.955555555555556, "step": 3130}, {"loss": 1.0947, "grad_norm": 1.262983798980713, "learning_rate": 0.0002, "epoch": 6.977777777777778, "step": 3140}, {"loss": 1.099, "grad_norm": 1.1751209497451782, "learning_rate": 0.0002, "epoch": 7.0, "step": 3150}, {"eval_loss": 2.2316300868988037, "eval_runtime": 81.7348, "eval_samples_per_second": 6.301, "eval_steps_per_second": 0.795, "epoch": 7.0, "step": 3150}, {"loss": 0.9085, "grad_norm": 1.7097322940826416, "learning_rate": 0.0002, "epoch": 7.022222222222222, "step": 3160}, {"loss": 0.8524, "grad_norm": 1.287734031677246, "learning_rate": 0.0002, "epoch": 7.044444444444444, "step": 3170}, {"loss": 0.9244, "grad_norm": 1.680770993232727, "learning_rate": 0.0002, "epoch": 7.066666666666666, "step": 3180}, {"loss": 0.8847, "grad_norm": 1.3358803987503052, "learning_rate": 0.0002, "epoch": 7.088888888888889, "step": 3190}, {"loss": 0.9036, "grad_norm": 1.5450502634048462, "learning_rate": 0.0002, "epoch": 7.111111111111111, "step": 3200}, {"loss": 0.8995, "grad_norm": 1.5816127061843872, "learning_rate": 0.0002, "epoch": 7.133333333333334, "step": 3210}, {"loss": 0.8622, "grad_norm": 1.4042329788208008, "learning_rate": 0.0002, "epoch": 7.155555555555556, "step": 3220}, {"loss": 0.9034, "grad_norm": 1.3045488595962524, "learning_rate": 0.0002, "epoch": 7.177777777777778, "step": 3230}, {"loss": 0.8673, "grad_norm": 1.4329142570495605, "learning_rate": 0.0002, "epoch": 7.2, "step": 3240}, {"loss": 0.8642, "grad_norm": 1.4555209875106812, "learning_rate": 0.0002, "epoch": 7.222222222222222, "step": 3250}, {"loss": 0.8753, "grad_norm": 1.4156484603881836, "learning_rate": 0.0002, "epoch": 7.2444444444444445, "step": 3260}, {"loss": 0.9189, "grad_norm": 1.3839219808578491, "learning_rate": 0.0002, "epoch": 7.266666666666667, "step": 3270}, {"loss": 0.9091, "grad_norm": 1.409365177154541, "learning_rate": 0.0002, "epoch": 7.288888888888889, "step": 3280}, {"loss": 0.8671, "grad_norm": 1.3349004983901978, "learning_rate": 0.0002, "epoch": 7.311111111111111, "step": 3290}, {"loss": 0.9099, "grad_norm": 1.602988839149475, "learning_rate": 0.0002, "epoch": 7.333333333333333, "step": 3300}, {"loss": 0.8603, "grad_norm": 1.492713451385498, "learning_rate": 0.0002, "epoch": 7.355555555555555, "step": 3310}, {"loss": 0.8906, "grad_norm": 1.4347516298294067, "learning_rate": 0.0002, "epoch": 7.377777777777778, "step": 3320}, {"loss": 0.9412, "grad_norm": 1.5181629657745361, "learning_rate": 0.0002, "epoch": 7.4, "step": 3330}, {"loss": 0.8748, "grad_norm": 1.339322805404663, "learning_rate": 0.0002, "epoch": 7.4222222222222225, "step": 3340}, {"loss": 0.9323, "grad_norm": 1.6582218408584595, "learning_rate": 0.0002, "epoch": 7.444444444444445, "step": 3350}, {"loss": 0.8823, "grad_norm": 1.3226500749588013, "learning_rate": 0.0002, "epoch": 7.466666666666667, "step": 3360}, {"loss": 0.9468, "grad_norm": 1.6935880184173584, "learning_rate": 0.0002, "epoch": 7.488888888888889, "step": 3370}, {"loss": 0.9078, "grad_norm": 1.2704429626464844, "learning_rate": 0.0002, "epoch": 7.511111111111111, "step": 3380}, {"loss": 0.8829, "grad_norm": 1.4228342771530151, "learning_rate": 0.0002, "epoch": 7.533333333333333, "step": 3390}, {"loss": 0.9053, "grad_norm": 1.8575019836425781, "learning_rate": 0.0002, "epoch": 7.555555555555555, "step": 3400}, {"loss": 0.9372, "grad_norm": 1.4379228353500366, "learning_rate": 0.0002, "epoch": 7.5777777777777775, "step": 3410}, {"loss": 0.9009, "grad_norm": 1.4535613059997559, "learning_rate": 0.0002, "epoch": 7.6, "step": 3420}, {"loss": 0.9669, "grad_norm": 1.485689401626587, "learning_rate": 0.0002, "epoch": 7.622222222222222, "step": 3430}, {"loss": 0.9765, "grad_norm": 1.6231895685195923, "learning_rate": 0.0002, "epoch": 7.644444444444445, "step": 3440}, {"loss": 0.9607, "grad_norm": 1.5033475160598755, "learning_rate": 0.0002, "epoch": 7.666666666666667, "step": 3450}, {"loss": 0.9834, "grad_norm": 1.2845245599746704, "learning_rate": 0.0002, "epoch": 7.688888888888889, "step": 3460}, {"loss": 0.9956, "grad_norm": 1.3614885807037354, "learning_rate": 0.0002, "epoch": 7.711111111111111, "step": 3470}, {"loss": 0.9207, "grad_norm": 1.876365303993225, "learning_rate": 0.0002, "epoch": 7.733333333333333, "step": 3480}, {"loss": 0.9616, "grad_norm": 1.5048887729644775, "learning_rate": 0.0002, "epoch": 7.7555555555555555, "step": 3490}, {"loss": 0.9256, "grad_norm": 1.401036024093628, "learning_rate": 0.0002, "epoch": 7.777777777777778, "step": 3500}, {"loss": 0.8659, "grad_norm": 1.4172956943511963, "learning_rate": 0.0002, "epoch": 7.8, "step": 3510}, {"loss": 0.9575, "grad_norm": 1.3779038190841675, "learning_rate": 0.0002, "epoch": 7.822222222222222, "step": 3520}, {"loss": 0.9083, "grad_norm": 1.2683740854263306, "learning_rate": 0.0002, "epoch": 7.844444444444444, "step": 3530}, {"loss": 1.0045, "grad_norm": 1.3728152513504028, "learning_rate": 0.0002, "epoch": 7.866666666666667, "step": 3540}, {"loss": 0.9687, "grad_norm": 1.5868757963180542, "learning_rate": 0.0002, "epoch": 7.888888888888889, "step": 3550}, {"loss": 1.0928, "grad_norm": 1.520365595817566, "learning_rate": 0.0002, "epoch": 7.911111111111111, "step": 3560}, {"loss": 0.9233, "grad_norm": 1.6288018226623535, "learning_rate": 0.0002, "epoch": 7.933333333333334, "step": 3570}, {"loss": 1.0147, "grad_norm": 1.3921650648117065, "learning_rate": 0.0002, "epoch": 7.955555555555556, "step": 3580}, {"loss": 0.9452, "grad_norm": 1.486502766609192, "learning_rate": 0.0002, "epoch": 7.977777777777778, "step": 3590}, {"loss": 1.0061, "grad_norm": 1.4413995742797852, "learning_rate": 0.0002, "epoch": 8.0, "step": 3600}]}