diff --git a/.gitattributes b/.gitattributes index b0d6e7aafd525a99ca3458c2ee1fa0d1ae0c57c6..026d0e187b20ae282e73d9acea9928276b27dbb4 100644 --- a/.gitattributes +++ b/.gitattributes @@ -2287,3 +2287,12 @@ gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora- gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-8571/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-9792/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..87074a68c4f5128ebf7122489322243c5ca7bf60 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4be56739e6a8b31d70d9efd58e84a9bee31c6cd42259f753cf665b0028dd050 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..87074a68c4f5128ebf7122489322243c5ca7bf60 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4be56739e6a8b31d70d9efd58e84a9bee31c6cd42259f753cf665b0028dd050 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d4b710b75858380d14e2a47f7082d4e8821bcd5f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13a852b7d558c7b3003258505c78d8081fd0d251f9f620d63bda29fea55086e1 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..8fbbf2885b8e997c62a9008c13b0e915006e5655 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35f1ae2762cf3f69c02def0454bdcdba1947d1a01dd5cb18e3a3471c090e6e6f +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..14e9cfbd7b517c1f19f327cf5abfa1f22e2b4f10 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc6ead48cb4a46736eb3ffbf1e2906aa8a248b3322afe8a6838d7bc898885e10 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..57bd18154aeba0820b31a4092cf8b14686a3f792 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/trainer_state.json @@ -0,0 +1,868 @@ +{ + "best_metric": 1.8328146934509277, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 1175, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01702127659574468, + "grad_norm": 0.7596228122711182, + "learning_rate": 0.0002, + "loss": 2.5996, + "step": 10 + }, + { + "epoch": 0.03404255319148936, + "grad_norm": 0.4860903322696686, + "learning_rate": 0.0002, + "loss": 2.2941, + "step": 20 + }, + { + "epoch": 0.05106382978723404, + "grad_norm": 0.4953401982784271, + "learning_rate": 0.0002, + "loss": 2.0018, + "step": 30 + }, + { + "epoch": 0.06808510638297872, + "grad_norm": 0.5086901783943176, + "learning_rate": 0.0002, + "loss": 1.9318, + "step": 40 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 0.49050021171569824, + "learning_rate": 0.0002, + "loss": 1.8519, + "step": 50 + }, + { + "epoch": 0.10212765957446808, + "grad_norm": 0.4922358989715576, + "learning_rate": 0.0002, + "loss": 1.8786, + "step": 60 + }, + { + "epoch": 0.11914893617021277, + "grad_norm": 0.4621541202068329, + "learning_rate": 0.0002, + "loss": 1.8812, + "step": 70 + }, + { + "epoch": 0.13617021276595745, + "grad_norm": 0.4416729807853699, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 80 + }, + { + "epoch": 0.15319148936170213, + "grad_norm": 0.526258111000061, + "learning_rate": 0.0002, + "loss": 1.9298, + "step": 90 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 0.44022637605667114, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 100 + }, + { + "epoch": 0.18723404255319148, + "grad_norm": 0.4647711515426636, + "learning_rate": 0.0002, + "loss": 1.7681, + "step": 110 + }, + { + "epoch": 0.20425531914893616, + "grad_norm": 0.4136318564414978, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 120 + }, + { + "epoch": 0.22127659574468084, + "grad_norm": 0.39707672595977783, + "learning_rate": 0.0002, + "loss": 1.8491, + "step": 130 + }, + { + "epoch": 0.23829787234042554, + "grad_norm": 0.4478105306625366, + "learning_rate": 0.0002, + "loss": 1.8017, + "step": 140 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 0.4699741303920746, + "learning_rate": 0.0002, + "loss": 1.8067, + "step": 150 + }, + { + "epoch": 0.2723404255319149, + "grad_norm": 0.4568363130092621, + "learning_rate": 0.0002, + "loss": 1.8161, + "step": 160 + }, + { + "epoch": 0.28936170212765955, + "grad_norm": 0.45078757405281067, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 170 + }, + { + "epoch": 0.30638297872340425, + "grad_norm": 0.4127245843410492, + "learning_rate": 0.0002, + "loss": 1.8115, + "step": 180 + }, + { + "epoch": 0.32340425531914896, + "grad_norm": 0.4042493402957916, + "learning_rate": 0.0002, + "loss": 1.7707, + "step": 190 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 0.401487797498703, + "learning_rate": 0.0002, + "loss": 1.7807, + "step": 200 + }, + { + "epoch": 0.3574468085106383, + "grad_norm": 0.3959457576274872, + "learning_rate": 0.0002, + "loss": 1.8355, + "step": 210 + }, + { + "epoch": 0.37446808510638296, + "grad_norm": 0.39865636825561523, + "learning_rate": 0.0002, + "loss": 1.7798, + "step": 220 + }, + { + "epoch": 0.39148936170212767, + "grad_norm": 0.7225169539451599, + "learning_rate": 0.0002, + "loss": 1.9012, + "step": 230 + }, + { + "epoch": 0.4085106382978723, + "grad_norm": 0.412801593542099, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 240 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 0.40951448678970337, + "learning_rate": 0.0002, + "loss": 1.8523, + "step": 250 + }, + { + "epoch": 0.4425531914893617, + "grad_norm": 0.42788130044937134, + "learning_rate": 0.0002, + "loss": 1.7283, + "step": 260 + }, + { + "epoch": 0.4595744680851064, + "grad_norm": 0.41069576144218445, + "learning_rate": 0.0002, + "loss": 1.8051, + "step": 270 + }, + { + "epoch": 0.4765957446808511, + "grad_norm": 0.3745323717594147, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 280 + }, + { + "epoch": 0.49361702127659574, + "grad_norm": 0.3771323263645172, + "learning_rate": 0.0002, + "loss": 1.8484, + "step": 290 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 0.34368929266929626, + "learning_rate": 0.0002, + "loss": 1.7417, + "step": 300 + }, + { + "epoch": 0.5276595744680851, + "grad_norm": 0.4299296736717224, + "learning_rate": 0.0002, + "loss": 1.7394, + "step": 310 + }, + { + "epoch": 0.5446808510638298, + "grad_norm": 0.4133922755718231, + "learning_rate": 0.0002, + "loss": 1.8255, + "step": 320 + }, + { + "epoch": 0.5617021276595745, + "grad_norm": 0.3984859585762024, + "learning_rate": 0.0002, + "loss": 1.7043, + "step": 330 + }, + { + "epoch": 0.5787234042553191, + "grad_norm": 0.3822788894176483, + "learning_rate": 0.0002, + "loss": 1.7873, + "step": 340 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 0.4550061821937561, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 350 + }, + { + "epoch": 0.6127659574468085, + "grad_norm": 0.36571192741394043, + "learning_rate": 0.0002, + "loss": 1.7179, + "step": 360 + }, + { + "epoch": 0.6297872340425532, + "grad_norm": 0.32942914962768555, + "learning_rate": 0.0002, + "loss": 1.8196, + "step": 370 + }, + { + "epoch": 0.6468085106382979, + "grad_norm": 0.39299526810646057, + "learning_rate": 0.0002, + "loss": 1.7118, + "step": 380 + }, + { + "epoch": 0.6638297872340425, + "grad_norm": 0.3817657232284546, + "learning_rate": 0.0002, + "loss": 1.8179, + "step": 390 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 0.3650810122489929, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 400 + }, + { + "epoch": 0.6978723404255319, + "grad_norm": 0.3736686408519745, + "learning_rate": 0.0002, + "loss": 1.8026, + "step": 410 + }, + { + "epoch": 0.7148936170212766, + "grad_norm": 0.45680564641952515, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 420 + }, + { + "epoch": 0.7319148936170212, + "grad_norm": 0.4154510200023651, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 430 + }, + { + "epoch": 0.7489361702127659, + "grad_norm": 0.3701167106628418, + "learning_rate": 0.0002, + "loss": 1.7801, + "step": 440 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 0.3869531750679016, + "learning_rate": 0.0002, + "loss": 1.7689, + "step": 450 + }, + { + "epoch": 0.7829787234042553, + "grad_norm": 0.4391495883464813, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 460 + }, + { + "epoch": 0.8, + "grad_norm": 0.39652755856513977, + "learning_rate": 0.0002, + "loss": 1.7534, + "step": 470 + }, + { + "epoch": 0.8170212765957446, + "grad_norm": 0.4096752107143402, + "learning_rate": 0.0002, + "loss": 1.8076, + "step": 480 + }, + { + "epoch": 0.8340425531914893, + "grad_norm": 0.3857504427433014, + "learning_rate": 0.0002, + "loss": 1.796, + "step": 490 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 0.4105374217033386, + "learning_rate": 0.0002, + "loss": 1.7379, + "step": 500 + }, + { + "epoch": 0.8680851063829788, + "grad_norm": 0.3723328113555908, + "learning_rate": 0.0002, + "loss": 1.6391, + "step": 510 + }, + { + "epoch": 0.8851063829787233, + "grad_norm": 0.36099690198898315, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 520 + }, + { + "epoch": 0.902127659574468, + "grad_norm": 0.3715187907218933, + "learning_rate": 0.0002, + "loss": 1.7632, + "step": 530 + }, + { + "epoch": 0.9191489361702128, + "grad_norm": 0.4932813048362732, + "learning_rate": 0.0002, + "loss": 1.7004, + "step": 540 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 0.3493495285511017, + "learning_rate": 0.0002, + "loss": 1.679, + "step": 550 + }, + { + "epoch": 0.9531914893617022, + "grad_norm": 0.3598061800003052, + "learning_rate": 0.0002, + "loss": 1.7758, + "step": 560 + }, + { + "epoch": 0.9702127659574468, + "grad_norm": 0.3521560728549957, + "learning_rate": 0.0002, + "loss": 1.7686, + "step": 570 + }, + { + "epoch": 0.9872340425531915, + "grad_norm": 0.34150034189224243, + "learning_rate": 0.0002, + "loss": 1.7391, + "step": 580 + }, + { + "epoch": 0.9991489361702127, + "eval_loss": 1.8388911485671997, + "eval_runtime": 106.6788, + "eval_samples_per_second": 4.828, + "eval_steps_per_second": 0.609, + "step": 587 + }, + { + "epoch": 1.004255319148936, + "grad_norm": 0.33556103706359863, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 590 + }, + { + "epoch": 1.0212765957446808, + "grad_norm": 0.4333398640155792, + "learning_rate": 0.0002, + "loss": 1.8346, + "step": 600 + }, + { + "epoch": 1.0382978723404255, + "grad_norm": 0.38488736748695374, + "learning_rate": 0.0002, + "loss": 1.6656, + "step": 610 + }, + { + "epoch": 1.0553191489361702, + "grad_norm": 0.44454529881477356, + "learning_rate": 0.0002, + "loss": 1.7778, + "step": 620 + }, + { + "epoch": 1.0723404255319149, + "grad_norm": 0.3735603392124176, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 630 + }, + { + "epoch": 1.0893617021276596, + "grad_norm": 0.38912704586982727, + "learning_rate": 0.0002, + "loss": 1.74, + "step": 640 + }, + { + "epoch": 1.1063829787234043, + "grad_norm": 0.4411826431751251, + "learning_rate": 0.0002, + "loss": 1.607, + "step": 650 + }, + { + "epoch": 1.123404255319149, + "grad_norm": 0.4163050353527069, + "learning_rate": 0.0002, + "loss": 1.6901, + "step": 660 + }, + { + "epoch": 1.1404255319148937, + "grad_norm": 0.4187192916870117, + "learning_rate": 0.0002, + "loss": 1.6722, + "step": 670 + }, + { + "epoch": 1.1574468085106382, + "grad_norm": 0.3797093629837036, + "learning_rate": 0.0002, + "loss": 1.7061, + "step": 680 + }, + { + "epoch": 1.174468085106383, + "grad_norm": 0.4210026264190674, + "learning_rate": 0.0002, + "loss": 1.6409, + "step": 690 + }, + { + "epoch": 1.1914893617021276, + "grad_norm": 0.4701998829841614, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 700 + }, + { + "epoch": 1.2085106382978723, + "grad_norm": 0.6331578493118286, + "learning_rate": 0.0002, + "loss": 1.6773, + "step": 710 + }, + { + "epoch": 1.225531914893617, + "grad_norm": 0.41908255219459534, + "learning_rate": 0.0002, + "loss": 1.6776, + "step": 720 + }, + { + "epoch": 1.2425531914893617, + "grad_norm": 0.36158403754234314, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 730 + }, + { + "epoch": 1.2595744680851064, + "grad_norm": 0.387300580739975, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 740 + }, + { + "epoch": 1.2765957446808511, + "grad_norm": 0.38899728655815125, + "learning_rate": 0.0002, + "loss": 1.7089, + "step": 750 + }, + { + "epoch": 1.2936170212765958, + "grad_norm": 0.4549255073070526, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 760 + }, + { + "epoch": 1.3106382978723405, + "grad_norm": 0.4052349328994751, + "learning_rate": 0.0002, + "loss": 1.6754, + "step": 770 + }, + { + "epoch": 1.327659574468085, + "grad_norm": 0.38934215903282166, + "learning_rate": 0.0002, + "loss": 1.6483, + "step": 780 + }, + { + "epoch": 1.3446808510638297, + "grad_norm": 0.38688382506370544, + "learning_rate": 0.0002, + "loss": 1.7813, + "step": 790 + }, + { + "epoch": 1.3617021276595744, + "grad_norm": 0.3825705051422119, + "learning_rate": 0.0002, + "loss": 1.6374, + "step": 800 + }, + { + "epoch": 1.3787234042553191, + "grad_norm": 0.37331756949424744, + "learning_rate": 0.0002, + "loss": 1.655, + "step": 810 + }, + { + "epoch": 1.3957446808510638, + "grad_norm": 0.38826408982276917, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 820 + }, + { + "epoch": 1.4127659574468086, + "grad_norm": 0.4213569164276123, + "learning_rate": 0.0002, + "loss": 1.6107, + "step": 830 + }, + { + "epoch": 1.4297872340425533, + "grad_norm": 0.3976684808731079, + "learning_rate": 0.0002, + "loss": 1.6792, + "step": 840 + }, + { + "epoch": 1.4468085106382977, + "grad_norm": 0.390009343624115, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 850 + }, + { + "epoch": 1.4638297872340424, + "grad_norm": 0.4462052583694458, + "learning_rate": 0.0002, + "loss": 1.7489, + "step": 860 + }, + { + "epoch": 1.4808510638297872, + "grad_norm": 0.42129236459732056, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 870 + }, + { + "epoch": 1.4978723404255319, + "grad_norm": 0.41489893198013306, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 880 + }, + { + "epoch": 1.5148936170212766, + "grad_norm": 0.41451677680015564, + "learning_rate": 0.0002, + "loss": 1.7129, + "step": 890 + }, + { + "epoch": 1.5319148936170213, + "grad_norm": 0.4477299749851227, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 900 + }, + { + "epoch": 1.548936170212766, + "grad_norm": 0.38476648926734924, + "learning_rate": 0.0002, + "loss": 1.6543, + "step": 910 + }, + { + "epoch": 1.5659574468085107, + "grad_norm": 0.42755743861198425, + "learning_rate": 0.0002, + "loss": 1.7103, + "step": 920 + }, + { + "epoch": 1.5829787234042554, + "grad_norm": 0.39372023940086365, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 930 + }, + { + "epoch": 1.6, + "grad_norm": 0.42778754234313965, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 940 + }, + { + "epoch": 1.6170212765957448, + "grad_norm": 0.4217268228530884, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 950 + }, + { + "epoch": 1.6340425531914895, + "grad_norm": 0.40452107787132263, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 960 + }, + { + "epoch": 1.6510638297872342, + "grad_norm": 0.4259980022907257, + "learning_rate": 0.0002, + "loss": 1.6734, + "step": 970 + }, + { + "epoch": 1.6680851063829787, + "grad_norm": 0.4089849591255188, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 980 + }, + { + "epoch": 1.6851063829787234, + "grad_norm": 0.38276049494743347, + "learning_rate": 0.0002, + "loss": 1.6588, + "step": 990 + }, + { + "epoch": 1.702127659574468, + "grad_norm": 0.40361565351486206, + "learning_rate": 0.0002, + "loss": 1.6846, + "step": 1000 + }, + { + "epoch": 1.7191489361702128, + "grad_norm": 0.3537807762622833, + "learning_rate": 0.0002, + "loss": 1.698, + "step": 1010 + }, + { + "epoch": 1.7361702127659573, + "grad_norm": 0.40288347005844116, + "learning_rate": 0.0002, + "loss": 1.6001, + "step": 1020 + }, + { + "epoch": 1.753191489361702, + "grad_norm": 0.4003616273403168, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1030 + }, + { + "epoch": 1.7702127659574467, + "grad_norm": 0.3931669592857361, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 1040 + }, + { + "epoch": 1.7872340425531914, + "grad_norm": 0.4001635015010834, + "learning_rate": 0.0002, + "loss": 1.6814, + "step": 1050 + }, + { + "epoch": 1.804255319148936, + "grad_norm": 0.4139048457145691, + "learning_rate": 0.0002, + "loss": 1.6158, + "step": 1060 + }, + { + "epoch": 1.8212765957446808, + "grad_norm": 0.5044458508491516, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 1070 + }, + { + "epoch": 1.8382978723404255, + "grad_norm": 0.4827095568180084, + "learning_rate": 0.0002, + "loss": 1.7317, + "step": 1080 + }, + { + "epoch": 1.8553191489361702, + "grad_norm": 0.3750515282154083, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 1090 + }, + { + "epoch": 1.872340425531915, + "grad_norm": 0.4024597704410553, + "learning_rate": 0.0002, + "loss": 1.653, + "step": 1100 + }, + { + "epoch": 1.8893617021276596, + "grad_norm": 0.36747241020202637, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1110 + }, + { + "epoch": 1.9063829787234043, + "grad_norm": 0.41397711634635925, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 1120 + }, + { + "epoch": 1.923404255319149, + "grad_norm": 0.3960763216018677, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 1130 + }, + { + "epoch": 1.9404255319148938, + "grad_norm": 0.4533233344554901, + "learning_rate": 0.0002, + "loss": 1.6857, + "step": 1140 + }, + { + "epoch": 1.9574468085106385, + "grad_norm": 0.38433438539505005, + "learning_rate": 0.0002, + "loss": 1.7168, + "step": 1150 + }, + { + "epoch": 1.974468085106383, + "grad_norm": 0.3648812174797058, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 1160 + }, + { + "epoch": 1.9914893617021276, + "grad_norm": 0.3887176215648651, + "learning_rate": 0.0002, + "loss": 1.6521, + "step": 1170 + }, + { + "epoch": 2.0, + "eval_loss": 1.8328146934509277, + "eval_runtime": 107.2842, + "eval_samples_per_second": 4.8, + "eval_steps_per_second": 0.606, + "step": 1175 + } + ], + "logging_steps": 10, + "max_steps": 4696, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.43763690684416e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..40bc5c2205bcd1402ce337f3a218ef4840b3870a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:148f55f87c9f695bdf2cf6d54b37e690c06ff5da5e17bc3af6c1f44f90f45374 +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9b3d23b2febf6ea2a24e7a29f532faff47fda4d6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e625adc43965e5cff2c39ce8364e34067378715d1b303b0108fcb9ee45388cb7 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e26c1d84a3c673930b8322e29889771f98e96dfe --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebcb5d905cf92d308f71277e3df0e98ca13a2d3706f571a3a027da9d749ef1a4 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b455bd084547dc1aefcc040ff9232f7863dc8948 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a17ddba17d19c81f61bec7af874761eaf8c570adf8163b0cd0621aeba5202e4 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a07820869ae6d94f6111d7646b16c35b64310b9 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a39eac5f8d4cf1ddd61b029118798855663f1d2d847b6be58cbbcdb5bcf7d6b4 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7ab699ec37060a7e0ac85c22895f879e0e8492bc --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/trainer_state.json @@ -0,0 +1,1289 @@ +{ + "best_metric": 1.8328146934509277, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175", + "epoch": 2.999148936170213, + "eval_steps": 10, + "global_step": 1762, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01702127659574468, + "grad_norm": 0.7596228122711182, + "learning_rate": 0.0002, + "loss": 2.5996, + "step": 10 + }, + { + "epoch": 0.03404255319148936, + "grad_norm": 0.4860903322696686, + "learning_rate": 0.0002, + "loss": 2.2941, + "step": 20 + }, + { + "epoch": 0.05106382978723404, + "grad_norm": 0.4953401982784271, + "learning_rate": 0.0002, + "loss": 2.0018, + "step": 30 + }, + { + "epoch": 0.06808510638297872, + "grad_norm": 0.5086901783943176, + "learning_rate": 0.0002, + "loss": 1.9318, + "step": 40 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 0.49050021171569824, + "learning_rate": 0.0002, + "loss": 1.8519, + "step": 50 + }, + { + "epoch": 0.10212765957446808, + "grad_norm": 0.4922358989715576, + "learning_rate": 0.0002, + "loss": 1.8786, + "step": 60 + }, + { + "epoch": 0.11914893617021277, + "grad_norm": 0.4621541202068329, + "learning_rate": 0.0002, + "loss": 1.8812, + "step": 70 + }, + { + "epoch": 0.13617021276595745, + "grad_norm": 0.4416729807853699, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 80 + }, + { + "epoch": 0.15319148936170213, + "grad_norm": 0.526258111000061, + "learning_rate": 0.0002, + "loss": 1.9298, + "step": 90 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 0.44022637605667114, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 100 + }, + { + "epoch": 0.18723404255319148, + "grad_norm": 0.4647711515426636, + "learning_rate": 0.0002, + "loss": 1.7681, + "step": 110 + }, + { + "epoch": 0.20425531914893616, + "grad_norm": 0.4136318564414978, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 120 + }, + { + "epoch": 0.22127659574468084, + "grad_norm": 0.39707672595977783, + "learning_rate": 0.0002, + "loss": 1.8491, + "step": 130 + }, + { + "epoch": 0.23829787234042554, + "grad_norm": 0.4478105306625366, + "learning_rate": 0.0002, + "loss": 1.8017, + "step": 140 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 0.4699741303920746, + "learning_rate": 0.0002, + "loss": 1.8067, + "step": 150 + }, + { + "epoch": 0.2723404255319149, + "grad_norm": 0.4568363130092621, + "learning_rate": 0.0002, + "loss": 1.8161, + "step": 160 + }, + { + "epoch": 0.28936170212765955, + "grad_norm": 0.45078757405281067, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 170 + }, + { + "epoch": 0.30638297872340425, + "grad_norm": 0.4127245843410492, + "learning_rate": 0.0002, + "loss": 1.8115, + "step": 180 + }, + { + "epoch": 0.32340425531914896, + "grad_norm": 0.4042493402957916, + "learning_rate": 0.0002, + "loss": 1.7707, + "step": 190 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 0.401487797498703, + "learning_rate": 0.0002, + "loss": 1.7807, + "step": 200 + }, + { + "epoch": 0.3574468085106383, + "grad_norm": 0.3959457576274872, + "learning_rate": 0.0002, + "loss": 1.8355, + "step": 210 + }, + { + "epoch": 0.37446808510638296, + "grad_norm": 0.39865636825561523, + "learning_rate": 0.0002, + "loss": 1.7798, + "step": 220 + }, + { + "epoch": 0.39148936170212767, + "grad_norm": 0.7225169539451599, + "learning_rate": 0.0002, + "loss": 1.9012, + "step": 230 + }, + { + "epoch": 0.4085106382978723, + "grad_norm": 0.412801593542099, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 240 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 0.40951448678970337, + "learning_rate": 0.0002, + "loss": 1.8523, + "step": 250 + }, + { + "epoch": 0.4425531914893617, + "grad_norm": 0.42788130044937134, + "learning_rate": 0.0002, + "loss": 1.7283, + "step": 260 + }, + { + "epoch": 0.4595744680851064, + "grad_norm": 0.41069576144218445, + "learning_rate": 0.0002, + "loss": 1.8051, + "step": 270 + }, + { + "epoch": 0.4765957446808511, + "grad_norm": 0.3745323717594147, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 280 + }, + { + "epoch": 0.49361702127659574, + "grad_norm": 0.3771323263645172, + "learning_rate": 0.0002, + "loss": 1.8484, + "step": 290 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 0.34368929266929626, + "learning_rate": 0.0002, + "loss": 1.7417, + "step": 300 + }, + { + "epoch": 0.5276595744680851, + "grad_norm": 0.4299296736717224, + "learning_rate": 0.0002, + "loss": 1.7394, + "step": 310 + }, + { + "epoch": 0.5446808510638298, + "grad_norm": 0.4133922755718231, + "learning_rate": 0.0002, + "loss": 1.8255, + "step": 320 + }, + { + "epoch": 0.5617021276595745, + "grad_norm": 0.3984859585762024, + "learning_rate": 0.0002, + "loss": 1.7043, + "step": 330 + }, + { + "epoch": 0.5787234042553191, + "grad_norm": 0.3822788894176483, + "learning_rate": 0.0002, + "loss": 1.7873, + "step": 340 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 0.4550061821937561, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 350 + }, + { + "epoch": 0.6127659574468085, + "grad_norm": 0.36571192741394043, + "learning_rate": 0.0002, + "loss": 1.7179, + "step": 360 + }, + { + "epoch": 0.6297872340425532, + "grad_norm": 0.32942914962768555, + "learning_rate": 0.0002, + "loss": 1.8196, + "step": 370 + }, + { + "epoch": 0.6468085106382979, + "grad_norm": 0.39299526810646057, + "learning_rate": 0.0002, + "loss": 1.7118, + "step": 380 + }, + { + "epoch": 0.6638297872340425, + "grad_norm": 0.3817657232284546, + "learning_rate": 0.0002, + "loss": 1.8179, + "step": 390 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 0.3650810122489929, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 400 + }, + { + "epoch": 0.6978723404255319, + "grad_norm": 0.3736686408519745, + "learning_rate": 0.0002, + "loss": 1.8026, + "step": 410 + }, + { + "epoch": 0.7148936170212766, + "grad_norm": 0.45680564641952515, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 420 + }, + { + "epoch": 0.7319148936170212, + "grad_norm": 0.4154510200023651, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 430 + }, + { + "epoch": 0.7489361702127659, + "grad_norm": 0.3701167106628418, + "learning_rate": 0.0002, + "loss": 1.7801, + "step": 440 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 0.3869531750679016, + "learning_rate": 0.0002, + "loss": 1.7689, + "step": 450 + }, + { + "epoch": 0.7829787234042553, + "grad_norm": 0.4391495883464813, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 460 + }, + { + "epoch": 0.8, + "grad_norm": 0.39652755856513977, + "learning_rate": 0.0002, + "loss": 1.7534, + "step": 470 + }, + { + "epoch": 0.8170212765957446, + "grad_norm": 0.4096752107143402, + "learning_rate": 0.0002, + "loss": 1.8076, + "step": 480 + }, + { + "epoch": 0.8340425531914893, + "grad_norm": 0.3857504427433014, + "learning_rate": 0.0002, + "loss": 1.796, + "step": 490 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 0.4105374217033386, + "learning_rate": 0.0002, + "loss": 1.7379, + "step": 500 + }, + { + "epoch": 0.8680851063829788, + "grad_norm": 0.3723328113555908, + "learning_rate": 0.0002, + "loss": 1.6391, + "step": 510 + }, + { + "epoch": 0.8851063829787233, + "grad_norm": 0.36099690198898315, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 520 + }, + { + "epoch": 0.902127659574468, + "grad_norm": 0.3715187907218933, + "learning_rate": 0.0002, + "loss": 1.7632, + "step": 530 + }, + { + "epoch": 0.9191489361702128, + "grad_norm": 0.4932813048362732, + "learning_rate": 0.0002, + "loss": 1.7004, + "step": 540 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 0.3493495285511017, + "learning_rate": 0.0002, + "loss": 1.679, + "step": 550 + }, + { + "epoch": 0.9531914893617022, + "grad_norm": 0.3598061800003052, + "learning_rate": 0.0002, + "loss": 1.7758, + "step": 560 + }, + { + "epoch": 0.9702127659574468, + "grad_norm": 0.3521560728549957, + "learning_rate": 0.0002, + "loss": 1.7686, + "step": 570 + }, + { + "epoch": 0.9872340425531915, + "grad_norm": 0.34150034189224243, + "learning_rate": 0.0002, + "loss": 1.7391, + "step": 580 + }, + { + "epoch": 0.9991489361702127, + "eval_loss": 1.8388911485671997, + "eval_runtime": 106.6788, + "eval_samples_per_second": 4.828, + "eval_steps_per_second": 0.609, + "step": 587 + }, + { + "epoch": 1.004255319148936, + "grad_norm": 0.33556103706359863, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 590 + }, + { + "epoch": 1.0212765957446808, + "grad_norm": 0.4333398640155792, + "learning_rate": 0.0002, + "loss": 1.8346, + "step": 600 + }, + { + "epoch": 1.0382978723404255, + "grad_norm": 0.38488736748695374, + "learning_rate": 0.0002, + "loss": 1.6656, + "step": 610 + }, + { + "epoch": 1.0553191489361702, + "grad_norm": 0.44454529881477356, + "learning_rate": 0.0002, + "loss": 1.7778, + "step": 620 + }, + { + "epoch": 1.0723404255319149, + "grad_norm": 0.3735603392124176, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 630 + }, + { + "epoch": 1.0893617021276596, + "grad_norm": 0.38912704586982727, + "learning_rate": 0.0002, + "loss": 1.74, + "step": 640 + }, + { + "epoch": 1.1063829787234043, + "grad_norm": 0.4411826431751251, + "learning_rate": 0.0002, + "loss": 1.607, + "step": 650 + }, + { + "epoch": 1.123404255319149, + "grad_norm": 0.4163050353527069, + "learning_rate": 0.0002, + "loss": 1.6901, + "step": 660 + }, + { + "epoch": 1.1404255319148937, + "grad_norm": 0.4187192916870117, + "learning_rate": 0.0002, + "loss": 1.6722, + "step": 670 + }, + { + "epoch": 1.1574468085106382, + "grad_norm": 0.3797093629837036, + "learning_rate": 0.0002, + "loss": 1.7061, + "step": 680 + }, + { + "epoch": 1.174468085106383, + "grad_norm": 0.4210026264190674, + "learning_rate": 0.0002, + "loss": 1.6409, + "step": 690 + }, + { + "epoch": 1.1914893617021276, + "grad_norm": 0.4701998829841614, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 700 + }, + { + "epoch": 1.2085106382978723, + "grad_norm": 0.6331578493118286, + "learning_rate": 0.0002, + "loss": 1.6773, + "step": 710 + }, + { + "epoch": 1.225531914893617, + "grad_norm": 0.41908255219459534, + "learning_rate": 0.0002, + "loss": 1.6776, + "step": 720 + }, + { + "epoch": 1.2425531914893617, + "grad_norm": 0.36158403754234314, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 730 + }, + { + "epoch": 1.2595744680851064, + "grad_norm": 0.387300580739975, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 740 + }, + { + "epoch": 1.2765957446808511, + "grad_norm": 0.38899728655815125, + "learning_rate": 0.0002, + "loss": 1.7089, + "step": 750 + }, + { + "epoch": 1.2936170212765958, + "grad_norm": 0.4549255073070526, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 760 + }, + { + "epoch": 1.3106382978723405, + "grad_norm": 0.4052349328994751, + "learning_rate": 0.0002, + "loss": 1.6754, + "step": 770 + }, + { + "epoch": 1.327659574468085, + "grad_norm": 0.38934215903282166, + "learning_rate": 0.0002, + "loss": 1.6483, + "step": 780 + }, + { + "epoch": 1.3446808510638297, + "grad_norm": 0.38688382506370544, + "learning_rate": 0.0002, + "loss": 1.7813, + "step": 790 + }, + { + "epoch": 1.3617021276595744, + "grad_norm": 0.3825705051422119, + "learning_rate": 0.0002, + "loss": 1.6374, + "step": 800 + }, + { + "epoch": 1.3787234042553191, + "grad_norm": 0.37331756949424744, + "learning_rate": 0.0002, + "loss": 1.655, + "step": 810 + }, + { + "epoch": 1.3957446808510638, + "grad_norm": 0.38826408982276917, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 820 + }, + { + "epoch": 1.4127659574468086, + "grad_norm": 0.4213569164276123, + "learning_rate": 0.0002, + "loss": 1.6107, + "step": 830 + }, + { + "epoch": 1.4297872340425533, + "grad_norm": 0.3976684808731079, + "learning_rate": 0.0002, + "loss": 1.6792, + "step": 840 + }, + { + "epoch": 1.4468085106382977, + "grad_norm": 0.390009343624115, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 850 + }, + { + "epoch": 1.4638297872340424, + "grad_norm": 0.4462052583694458, + "learning_rate": 0.0002, + "loss": 1.7489, + "step": 860 + }, + { + "epoch": 1.4808510638297872, + "grad_norm": 0.42129236459732056, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 870 + }, + { + "epoch": 1.4978723404255319, + "grad_norm": 0.41489893198013306, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 880 + }, + { + "epoch": 1.5148936170212766, + "grad_norm": 0.41451677680015564, + "learning_rate": 0.0002, + "loss": 1.7129, + "step": 890 + }, + { + "epoch": 1.5319148936170213, + "grad_norm": 0.4477299749851227, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 900 + }, + { + "epoch": 1.548936170212766, + "grad_norm": 0.38476648926734924, + "learning_rate": 0.0002, + "loss": 1.6543, + "step": 910 + }, + { + "epoch": 1.5659574468085107, + "grad_norm": 0.42755743861198425, + "learning_rate": 0.0002, + "loss": 1.7103, + "step": 920 + }, + { + "epoch": 1.5829787234042554, + "grad_norm": 0.39372023940086365, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 930 + }, + { + "epoch": 1.6, + "grad_norm": 0.42778754234313965, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 940 + }, + { + "epoch": 1.6170212765957448, + "grad_norm": 0.4217268228530884, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 950 + }, + { + "epoch": 1.6340425531914895, + "grad_norm": 0.40452107787132263, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 960 + }, + { + "epoch": 1.6510638297872342, + "grad_norm": 0.4259980022907257, + "learning_rate": 0.0002, + "loss": 1.6734, + "step": 970 + }, + { + "epoch": 1.6680851063829787, + "grad_norm": 0.4089849591255188, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 980 + }, + { + "epoch": 1.6851063829787234, + "grad_norm": 0.38276049494743347, + "learning_rate": 0.0002, + "loss": 1.6588, + "step": 990 + }, + { + "epoch": 1.702127659574468, + "grad_norm": 0.40361565351486206, + "learning_rate": 0.0002, + "loss": 1.6846, + "step": 1000 + }, + { + "epoch": 1.7191489361702128, + "grad_norm": 0.3537807762622833, + "learning_rate": 0.0002, + "loss": 1.698, + "step": 1010 + }, + { + "epoch": 1.7361702127659573, + "grad_norm": 0.40288347005844116, + "learning_rate": 0.0002, + "loss": 1.6001, + "step": 1020 + }, + { + "epoch": 1.753191489361702, + "grad_norm": 0.4003616273403168, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1030 + }, + { + "epoch": 1.7702127659574467, + "grad_norm": 0.3931669592857361, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 1040 + }, + { + "epoch": 1.7872340425531914, + "grad_norm": 0.4001635015010834, + "learning_rate": 0.0002, + "loss": 1.6814, + "step": 1050 + }, + { + "epoch": 1.804255319148936, + "grad_norm": 0.4139048457145691, + "learning_rate": 0.0002, + "loss": 1.6158, + "step": 1060 + }, + { + "epoch": 1.8212765957446808, + "grad_norm": 0.5044458508491516, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 1070 + }, + { + "epoch": 1.8382978723404255, + "grad_norm": 0.4827095568180084, + "learning_rate": 0.0002, + "loss": 1.7317, + "step": 1080 + }, + { + "epoch": 1.8553191489361702, + "grad_norm": 0.3750515282154083, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 1090 + }, + { + "epoch": 1.872340425531915, + "grad_norm": 0.4024597704410553, + "learning_rate": 0.0002, + "loss": 1.653, + "step": 1100 + }, + { + "epoch": 1.8893617021276596, + "grad_norm": 0.36747241020202637, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1110 + }, + { + "epoch": 1.9063829787234043, + "grad_norm": 0.41397711634635925, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 1120 + }, + { + "epoch": 1.923404255319149, + "grad_norm": 0.3960763216018677, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 1130 + }, + { + "epoch": 1.9404255319148938, + "grad_norm": 0.4533233344554901, + "learning_rate": 0.0002, + "loss": 1.6857, + "step": 1140 + }, + { + "epoch": 1.9574468085106385, + "grad_norm": 0.38433438539505005, + "learning_rate": 0.0002, + "loss": 1.7168, + "step": 1150 + }, + { + "epoch": 1.974468085106383, + "grad_norm": 0.3648812174797058, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 1160 + }, + { + "epoch": 1.9914893617021276, + "grad_norm": 0.3887176215648651, + "learning_rate": 0.0002, + "loss": 1.6521, + "step": 1170 + }, + { + "epoch": 2.0, + "eval_loss": 1.8328146934509277, + "eval_runtime": 107.2842, + "eval_samples_per_second": 4.8, + "eval_steps_per_second": 0.606, + "step": 1175 + }, + { + "epoch": 2.008510638297872, + "grad_norm": 0.40444880723953247, + "learning_rate": 0.0002, + "loss": 1.6184, + "step": 1180 + }, + { + "epoch": 2.025531914893617, + "grad_norm": 0.3997816741466522, + "learning_rate": 0.0002, + "loss": 1.5221, + "step": 1190 + }, + { + "epoch": 2.0425531914893615, + "grad_norm": 0.4516718089580536, + "learning_rate": 0.0002, + "loss": 1.6022, + "step": 1200 + }, + { + "epoch": 2.0595744680851062, + "grad_norm": 0.6645553708076477, + "learning_rate": 0.0002, + "loss": 1.5458, + "step": 1210 + }, + { + "epoch": 2.076595744680851, + "grad_norm": 0.4181990921497345, + "learning_rate": 0.0002, + "loss": 1.5985, + "step": 1220 + }, + { + "epoch": 2.0936170212765957, + "grad_norm": 0.45681431889533997, + "learning_rate": 0.0002, + "loss": 1.5768, + "step": 1230 + }, + { + "epoch": 2.1106382978723404, + "grad_norm": 0.48914700746536255, + "learning_rate": 0.0002, + "loss": 1.5598, + "step": 1240 + }, + { + "epoch": 2.127659574468085, + "grad_norm": 0.43265485763549805, + "learning_rate": 0.0002, + "loss": 1.6031, + "step": 1250 + }, + { + "epoch": 2.1446808510638298, + "grad_norm": 0.4641207754611969, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 1260 + }, + { + "epoch": 2.1617021276595745, + "grad_norm": 0.4840783476829529, + "learning_rate": 0.0002, + "loss": 1.5686, + "step": 1270 + }, + { + "epoch": 2.178723404255319, + "grad_norm": 0.4974595308303833, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 1280 + }, + { + "epoch": 2.195744680851064, + "grad_norm": 0.5133475661277771, + "learning_rate": 0.0002, + "loss": 1.5512, + "step": 1290 + }, + { + "epoch": 2.2127659574468086, + "grad_norm": 0.5030052065849304, + "learning_rate": 0.0002, + "loss": 1.6467, + "step": 1300 + }, + { + "epoch": 2.2297872340425533, + "grad_norm": 0.46602481603622437, + "learning_rate": 0.0002, + "loss": 1.6845, + "step": 1310 + }, + { + "epoch": 2.246808510638298, + "grad_norm": 0.43662378191947937, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 1320 + }, + { + "epoch": 2.2638297872340427, + "grad_norm": 0.5137454867362976, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 1330 + }, + { + "epoch": 2.2808510638297874, + "grad_norm": 0.4750335216522217, + "learning_rate": 0.0002, + "loss": 1.5702, + "step": 1340 + }, + { + "epoch": 2.297872340425532, + "grad_norm": 0.43691426515579224, + "learning_rate": 0.0002, + "loss": 1.6479, + "step": 1350 + }, + { + "epoch": 2.3148936170212764, + "grad_norm": 0.49752047657966614, + "learning_rate": 0.0002, + "loss": 1.5561, + "step": 1360 + }, + { + "epoch": 2.331914893617021, + "grad_norm": 0.45101815462112427, + "learning_rate": 0.0002, + "loss": 1.5789, + "step": 1370 + }, + { + "epoch": 2.348936170212766, + "grad_norm": 0.4427817761898041, + "learning_rate": 0.0002, + "loss": 1.5858, + "step": 1380 + }, + { + "epoch": 2.3659574468085105, + "grad_norm": 0.4802311062812805, + "learning_rate": 0.0002, + "loss": 1.5537, + "step": 1390 + }, + { + "epoch": 2.382978723404255, + "grad_norm": 0.4512513279914856, + "learning_rate": 0.0002, + "loss": 1.5846, + "step": 1400 + }, + { + "epoch": 2.4, + "grad_norm": 0.4878857135772705, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 1410 + }, + { + "epoch": 2.4170212765957446, + "grad_norm": 0.4741315543651581, + "learning_rate": 0.0002, + "loss": 1.5781, + "step": 1420 + }, + { + "epoch": 2.4340425531914893, + "grad_norm": 0.4770931601524353, + "learning_rate": 0.0002, + "loss": 1.5595, + "step": 1430 + }, + { + "epoch": 2.451063829787234, + "grad_norm": 0.5124667286872864, + "learning_rate": 0.0002, + "loss": 1.5336, + "step": 1440 + }, + { + "epoch": 2.4680851063829787, + "grad_norm": 0.45264801383018494, + "learning_rate": 0.0002, + "loss": 1.5811, + "step": 1450 + }, + { + "epoch": 2.4851063829787234, + "grad_norm": 0.5456924438476562, + "learning_rate": 0.0002, + "loss": 1.604, + "step": 1460 + }, + { + "epoch": 2.502127659574468, + "grad_norm": 0.44656285643577576, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 1470 + }, + { + "epoch": 2.519148936170213, + "grad_norm": 0.5939419865608215, + "learning_rate": 0.0002, + "loss": 1.6336, + "step": 1480 + }, + { + "epoch": 2.5361702127659576, + "grad_norm": 0.47853362560272217, + "learning_rate": 0.0002, + "loss": 1.5481, + "step": 1490 + }, + { + "epoch": 2.5531914893617023, + "grad_norm": 0.47643396258354187, + "learning_rate": 0.0002, + "loss": 1.6543, + "step": 1500 + }, + { + "epoch": 2.570212765957447, + "grad_norm": 0.4939501881599426, + "learning_rate": 0.0002, + "loss": 1.5085, + "step": 1510 + }, + { + "epoch": 2.5872340425531917, + "grad_norm": 0.502055287361145, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 1520 + }, + { + "epoch": 2.604255319148936, + "grad_norm": 0.463250994682312, + "learning_rate": 0.0002, + "loss": 1.6519, + "step": 1530 + }, + { + "epoch": 2.621276595744681, + "grad_norm": 0.4761098623275757, + "learning_rate": 0.0002, + "loss": 1.5698, + "step": 1540 + }, + { + "epoch": 2.6382978723404253, + "grad_norm": 0.4687299132347107, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 1550 + }, + { + "epoch": 2.65531914893617, + "grad_norm": 0.5536078810691833, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 1560 + }, + { + "epoch": 2.6723404255319148, + "grad_norm": 0.581320583820343, + "learning_rate": 0.0002, + "loss": 1.586, + "step": 1570 + }, + { + "epoch": 2.6893617021276595, + "grad_norm": 0.45952868461608887, + "learning_rate": 0.0002, + "loss": 1.508, + "step": 1580 + }, + { + "epoch": 2.706382978723404, + "grad_norm": 0.4602586328983307, + "learning_rate": 0.0002, + "loss": 1.5905, + "step": 1590 + }, + { + "epoch": 2.723404255319149, + "grad_norm": 0.5276554226875305, + "learning_rate": 0.0002, + "loss": 1.6008, + "step": 1600 + }, + { + "epoch": 2.7404255319148936, + "grad_norm": 0.5750249624252319, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 1610 + }, + { + "epoch": 2.7574468085106383, + "grad_norm": 0.468723863363266, + "learning_rate": 0.0002, + "loss": 1.5506, + "step": 1620 + }, + { + "epoch": 2.774468085106383, + "grad_norm": 0.44649943709373474, + "learning_rate": 0.0002, + "loss": 1.4499, + "step": 1630 + }, + { + "epoch": 2.7914893617021277, + "grad_norm": 0.5097237825393677, + "learning_rate": 0.0002, + "loss": 1.5106, + "step": 1640 + }, + { + "epoch": 2.8085106382978724, + "grad_norm": 0.46384191513061523, + "learning_rate": 0.0002, + "loss": 1.5948, + "step": 1650 + }, + { + "epoch": 2.825531914893617, + "grad_norm": 0.4885474443435669, + "learning_rate": 0.0002, + "loss": 1.5828, + "step": 1660 + }, + { + "epoch": 2.842553191489362, + "grad_norm": 0.45621681213378906, + "learning_rate": 0.0002, + "loss": 1.5675, + "step": 1670 + }, + { + "epoch": 2.8595744680851065, + "grad_norm": 0.4797150194644928, + "learning_rate": 0.0002, + "loss": 1.6042, + "step": 1680 + }, + { + "epoch": 2.876595744680851, + "grad_norm": 0.5142032504081726, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 1690 + }, + { + "epoch": 2.8936170212765955, + "grad_norm": 0.48939862847328186, + "learning_rate": 0.0002, + "loss": 1.5984, + "step": 1700 + }, + { + "epoch": 2.9106382978723406, + "grad_norm": 0.4575578272342682, + "learning_rate": 0.0002, + "loss": 1.6333, + "step": 1710 + }, + { + "epoch": 2.927659574468085, + "grad_norm": 0.5589063763618469, + "learning_rate": 0.0002, + "loss": 1.5396, + "step": 1720 + }, + { + "epoch": 2.94468085106383, + "grad_norm": 0.48508813977241516, + "learning_rate": 0.0002, + "loss": 1.6096, + "step": 1730 + }, + { + "epoch": 2.9617021276595743, + "grad_norm": 0.42786726355552673, + "learning_rate": 0.0002, + "loss": 1.5686, + "step": 1740 + }, + { + "epoch": 2.978723404255319, + "grad_norm": 0.5598229765892029, + "learning_rate": 0.0002, + "loss": 1.5555, + "step": 1750 + }, + { + "epoch": 2.9957446808510637, + "grad_norm": 0.4779253602027893, + "learning_rate": 0.0002, + "loss": 1.5035, + "step": 1760 + }, + { + "epoch": 2.999148936170213, + "eval_loss": 1.8543579578399658, + "eval_runtime": 107.2363, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 0.606, + "step": 1762 + } + ], + "logging_steps": 10, + "max_steps": 4696, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.15645536026624e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..40bc5c2205bcd1402ce337f3a218ef4840b3870a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1762/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:148f55f87c9f695bdf2cf6d54b37e690c06ff5da5e17bc3af6c1f44f90f45374 +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..11274d672eb1e43ad355fa4e920cdf4f42109587 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f8c8ea8ae53da58994a7d26c0eacdd7dffe564c0443d22b1a53d2ba98020022 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..addb2072ee7a4ae3ec41d441f4121ef4cb5c7ae1 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1b79cdf8a89b8933ec32971cae1553c41f7e420acadd4e286dd48677cd2f7fe +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b9520321ad7b768c30a8265f41e1a07f7081a8a0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:697cf868fa13493ae0ee047d957488ed809cd960a81c08da50f320a217a8097e +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8044b3047361283fb9683cb1ae484e3b9bce2957 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e47ca3aacef7ae3438a4282122226485c9dd15ccd827ce6ec49e3a6d7d85c03e +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4191efd4f88706f6f7c6bdbc63bf0b8af628d187 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/trainer_state.json @@ -0,0 +1,1710 @@ +{ + "best_metric": 1.8328146934509277, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 2350, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01702127659574468, + "grad_norm": 0.7596228122711182, + "learning_rate": 0.0002, + "loss": 2.5996, + "step": 10 + }, + { + "epoch": 0.03404255319148936, + "grad_norm": 0.4860903322696686, + "learning_rate": 0.0002, + "loss": 2.2941, + "step": 20 + }, + { + "epoch": 0.05106382978723404, + "grad_norm": 0.4953401982784271, + "learning_rate": 0.0002, + "loss": 2.0018, + "step": 30 + }, + { + "epoch": 0.06808510638297872, + "grad_norm": 0.5086901783943176, + "learning_rate": 0.0002, + "loss": 1.9318, + "step": 40 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 0.49050021171569824, + "learning_rate": 0.0002, + "loss": 1.8519, + "step": 50 + }, + { + "epoch": 0.10212765957446808, + "grad_norm": 0.4922358989715576, + "learning_rate": 0.0002, + "loss": 1.8786, + "step": 60 + }, + { + "epoch": 0.11914893617021277, + "grad_norm": 0.4621541202068329, + "learning_rate": 0.0002, + "loss": 1.8812, + "step": 70 + }, + { + "epoch": 0.13617021276595745, + "grad_norm": 0.4416729807853699, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 80 + }, + { + "epoch": 0.15319148936170213, + "grad_norm": 0.526258111000061, + "learning_rate": 0.0002, + "loss": 1.9298, + "step": 90 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 0.44022637605667114, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 100 + }, + { + "epoch": 0.18723404255319148, + "grad_norm": 0.4647711515426636, + "learning_rate": 0.0002, + "loss": 1.7681, + "step": 110 + }, + { + "epoch": 0.20425531914893616, + "grad_norm": 0.4136318564414978, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 120 + }, + { + "epoch": 0.22127659574468084, + "grad_norm": 0.39707672595977783, + "learning_rate": 0.0002, + "loss": 1.8491, + "step": 130 + }, + { + "epoch": 0.23829787234042554, + "grad_norm": 0.4478105306625366, + "learning_rate": 0.0002, + "loss": 1.8017, + "step": 140 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 0.4699741303920746, + "learning_rate": 0.0002, + "loss": 1.8067, + "step": 150 + }, + { + "epoch": 0.2723404255319149, + "grad_norm": 0.4568363130092621, + "learning_rate": 0.0002, + "loss": 1.8161, + "step": 160 + }, + { + "epoch": 0.28936170212765955, + "grad_norm": 0.45078757405281067, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 170 + }, + { + "epoch": 0.30638297872340425, + "grad_norm": 0.4127245843410492, + "learning_rate": 0.0002, + "loss": 1.8115, + "step": 180 + }, + { + "epoch": 0.32340425531914896, + "grad_norm": 0.4042493402957916, + "learning_rate": 0.0002, + "loss": 1.7707, + "step": 190 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 0.401487797498703, + "learning_rate": 0.0002, + "loss": 1.7807, + "step": 200 + }, + { + "epoch": 0.3574468085106383, + "grad_norm": 0.3959457576274872, + "learning_rate": 0.0002, + "loss": 1.8355, + "step": 210 + }, + { + "epoch": 0.37446808510638296, + "grad_norm": 0.39865636825561523, + "learning_rate": 0.0002, + "loss": 1.7798, + "step": 220 + }, + { + "epoch": 0.39148936170212767, + "grad_norm": 0.7225169539451599, + "learning_rate": 0.0002, + "loss": 1.9012, + "step": 230 + }, + { + "epoch": 0.4085106382978723, + "grad_norm": 0.412801593542099, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 240 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 0.40951448678970337, + "learning_rate": 0.0002, + "loss": 1.8523, + "step": 250 + }, + { + "epoch": 0.4425531914893617, + "grad_norm": 0.42788130044937134, + "learning_rate": 0.0002, + "loss": 1.7283, + "step": 260 + }, + { + "epoch": 0.4595744680851064, + "grad_norm": 0.41069576144218445, + "learning_rate": 0.0002, + "loss": 1.8051, + "step": 270 + }, + { + "epoch": 0.4765957446808511, + "grad_norm": 0.3745323717594147, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 280 + }, + { + "epoch": 0.49361702127659574, + "grad_norm": 0.3771323263645172, + "learning_rate": 0.0002, + "loss": 1.8484, + "step": 290 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 0.34368929266929626, + "learning_rate": 0.0002, + "loss": 1.7417, + "step": 300 + }, + { + "epoch": 0.5276595744680851, + "grad_norm": 0.4299296736717224, + "learning_rate": 0.0002, + "loss": 1.7394, + "step": 310 + }, + { + "epoch": 0.5446808510638298, + "grad_norm": 0.4133922755718231, + "learning_rate": 0.0002, + "loss": 1.8255, + "step": 320 + }, + { + "epoch": 0.5617021276595745, + "grad_norm": 0.3984859585762024, + "learning_rate": 0.0002, + "loss": 1.7043, + "step": 330 + }, + { + "epoch": 0.5787234042553191, + "grad_norm": 0.3822788894176483, + "learning_rate": 0.0002, + "loss": 1.7873, + "step": 340 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 0.4550061821937561, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 350 + }, + { + "epoch": 0.6127659574468085, + "grad_norm": 0.36571192741394043, + "learning_rate": 0.0002, + "loss": 1.7179, + "step": 360 + }, + { + "epoch": 0.6297872340425532, + "grad_norm": 0.32942914962768555, + "learning_rate": 0.0002, + "loss": 1.8196, + "step": 370 + }, + { + "epoch": 0.6468085106382979, + "grad_norm": 0.39299526810646057, + "learning_rate": 0.0002, + "loss": 1.7118, + "step": 380 + }, + { + "epoch": 0.6638297872340425, + "grad_norm": 0.3817657232284546, + "learning_rate": 0.0002, + "loss": 1.8179, + "step": 390 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 0.3650810122489929, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 400 + }, + { + "epoch": 0.6978723404255319, + "grad_norm": 0.3736686408519745, + "learning_rate": 0.0002, + "loss": 1.8026, + "step": 410 + }, + { + "epoch": 0.7148936170212766, + "grad_norm": 0.45680564641952515, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 420 + }, + { + "epoch": 0.7319148936170212, + "grad_norm": 0.4154510200023651, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 430 + }, + { + "epoch": 0.7489361702127659, + "grad_norm": 0.3701167106628418, + "learning_rate": 0.0002, + "loss": 1.7801, + "step": 440 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 0.3869531750679016, + "learning_rate": 0.0002, + "loss": 1.7689, + "step": 450 + }, + { + "epoch": 0.7829787234042553, + "grad_norm": 0.4391495883464813, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 460 + }, + { + "epoch": 0.8, + "grad_norm": 0.39652755856513977, + "learning_rate": 0.0002, + "loss": 1.7534, + "step": 470 + }, + { + "epoch": 0.8170212765957446, + "grad_norm": 0.4096752107143402, + "learning_rate": 0.0002, + "loss": 1.8076, + "step": 480 + }, + { + "epoch": 0.8340425531914893, + "grad_norm": 0.3857504427433014, + "learning_rate": 0.0002, + "loss": 1.796, + "step": 490 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 0.4105374217033386, + "learning_rate": 0.0002, + "loss": 1.7379, + "step": 500 + }, + { + "epoch": 0.8680851063829788, + "grad_norm": 0.3723328113555908, + "learning_rate": 0.0002, + "loss": 1.6391, + "step": 510 + }, + { + "epoch": 0.8851063829787233, + "grad_norm": 0.36099690198898315, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 520 + }, + { + "epoch": 0.902127659574468, + "grad_norm": 0.3715187907218933, + "learning_rate": 0.0002, + "loss": 1.7632, + "step": 530 + }, + { + "epoch": 0.9191489361702128, + "grad_norm": 0.4932813048362732, + "learning_rate": 0.0002, + "loss": 1.7004, + "step": 540 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 0.3493495285511017, + "learning_rate": 0.0002, + "loss": 1.679, + "step": 550 + }, + { + "epoch": 0.9531914893617022, + "grad_norm": 0.3598061800003052, + "learning_rate": 0.0002, + "loss": 1.7758, + "step": 560 + }, + { + "epoch": 0.9702127659574468, + "grad_norm": 0.3521560728549957, + "learning_rate": 0.0002, + "loss": 1.7686, + "step": 570 + }, + { + "epoch": 0.9872340425531915, + "grad_norm": 0.34150034189224243, + "learning_rate": 0.0002, + "loss": 1.7391, + "step": 580 + }, + { + "epoch": 0.9991489361702127, + "eval_loss": 1.8388911485671997, + "eval_runtime": 106.6788, + "eval_samples_per_second": 4.828, + "eval_steps_per_second": 0.609, + "step": 587 + }, + { + "epoch": 1.004255319148936, + "grad_norm": 0.33556103706359863, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 590 + }, + { + "epoch": 1.0212765957446808, + "grad_norm": 0.4333398640155792, + "learning_rate": 0.0002, + "loss": 1.8346, + "step": 600 + }, + { + "epoch": 1.0382978723404255, + "grad_norm": 0.38488736748695374, + "learning_rate": 0.0002, + "loss": 1.6656, + "step": 610 + }, + { + "epoch": 1.0553191489361702, + "grad_norm": 0.44454529881477356, + "learning_rate": 0.0002, + "loss": 1.7778, + "step": 620 + }, + { + "epoch": 1.0723404255319149, + "grad_norm": 0.3735603392124176, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 630 + }, + { + "epoch": 1.0893617021276596, + "grad_norm": 0.38912704586982727, + "learning_rate": 0.0002, + "loss": 1.74, + "step": 640 + }, + { + "epoch": 1.1063829787234043, + "grad_norm": 0.4411826431751251, + "learning_rate": 0.0002, + "loss": 1.607, + "step": 650 + }, + { + "epoch": 1.123404255319149, + "grad_norm": 0.4163050353527069, + "learning_rate": 0.0002, + "loss": 1.6901, + "step": 660 + }, + { + "epoch": 1.1404255319148937, + "grad_norm": 0.4187192916870117, + "learning_rate": 0.0002, + "loss": 1.6722, + "step": 670 + }, + { + "epoch": 1.1574468085106382, + "grad_norm": 0.3797093629837036, + "learning_rate": 0.0002, + "loss": 1.7061, + "step": 680 + }, + { + "epoch": 1.174468085106383, + "grad_norm": 0.4210026264190674, + "learning_rate": 0.0002, + "loss": 1.6409, + "step": 690 + }, + { + "epoch": 1.1914893617021276, + "grad_norm": 0.4701998829841614, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 700 + }, + { + "epoch": 1.2085106382978723, + "grad_norm": 0.6331578493118286, + "learning_rate": 0.0002, + "loss": 1.6773, + "step": 710 + }, + { + "epoch": 1.225531914893617, + "grad_norm": 0.41908255219459534, + "learning_rate": 0.0002, + "loss": 1.6776, + "step": 720 + }, + { + "epoch": 1.2425531914893617, + "grad_norm": 0.36158403754234314, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 730 + }, + { + "epoch": 1.2595744680851064, + "grad_norm": 0.387300580739975, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 740 + }, + { + "epoch": 1.2765957446808511, + "grad_norm": 0.38899728655815125, + "learning_rate": 0.0002, + "loss": 1.7089, + "step": 750 + }, + { + "epoch": 1.2936170212765958, + "grad_norm": 0.4549255073070526, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 760 + }, + { + "epoch": 1.3106382978723405, + "grad_norm": 0.4052349328994751, + "learning_rate": 0.0002, + "loss": 1.6754, + "step": 770 + }, + { + "epoch": 1.327659574468085, + "grad_norm": 0.38934215903282166, + "learning_rate": 0.0002, + "loss": 1.6483, + "step": 780 + }, + { + "epoch": 1.3446808510638297, + "grad_norm": 0.38688382506370544, + "learning_rate": 0.0002, + "loss": 1.7813, + "step": 790 + }, + { + "epoch": 1.3617021276595744, + "grad_norm": 0.3825705051422119, + "learning_rate": 0.0002, + "loss": 1.6374, + "step": 800 + }, + { + "epoch": 1.3787234042553191, + "grad_norm": 0.37331756949424744, + "learning_rate": 0.0002, + "loss": 1.655, + "step": 810 + }, + { + "epoch": 1.3957446808510638, + "grad_norm": 0.38826408982276917, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 820 + }, + { + "epoch": 1.4127659574468086, + "grad_norm": 0.4213569164276123, + "learning_rate": 0.0002, + "loss": 1.6107, + "step": 830 + }, + { + "epoch": 1.4297872340425533, + "grad_norm": 0.3976684808731079, + "learning_rate": 0.0002, + "loss": 1.6792, + "step": 840 + }, + { + "epoch": 1.4468085106382977, + "grad_norm": 0.390009343624115, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 850 + }, + { + "epoch": 1.4638297872340424, + "grad_norm": 0.4462052583694458, + "learning_rate": 0.0002, + "loss": 1.7489, + "step": 860 + }, + { + "epoch": 1.4808510638297872, + "grad_norm": 0.42129236459732056, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 870 + }, + { + "epoch": 1.4978723404255319, + "grad_norm": 0.41489893198013306, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 880 + }, + { + "epoch": 1.5148936170212766, + "grad_norm": 0.41451677680015564, + "learning_rate": 0.0002, + "loss": 1.7129, + "step": 890 + }, + { + "epoch": 1.5319148936170213, + "grad_norm": 0.4477299749851227, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 900 + }, + { + "epoch": 1.548936170212766, + "grad_norm": 0.38476648926734924, + "learning_rate": 0.0002, + "loss": 1.6543, + "step": 910 + }, + { + "epoch": 1.5659574468085107, + "grad_norm": 0.42755743861198425, + "learning_rate": 0.0002, + "loss": 1.7103, + "step": 920 + }, + { + "epoch": 1.5829787234042554, + "grad_norm": 0.39372023940086365, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 930 + }, + { + "epoch": 1.6, + "grad_norm": 0.42778754234313965, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 940 + }, + { + "epoch": 1.6170212765957448, + "grad_norm": 0.4217268228530884, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 950 + }, + { + "epoch": 1.6340425531914895, + "grad_norm": 0.40452107787132263, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 960 + }, + { + "epoch": 1.6510638297872342, + "grad_norm": 0.4259980022907257, + "learning_rate": 0.0002, + "loss": 1.6734, + "step": 970 + }, + { + "epoch": 1.6680851063829787, + "grad_norm": 0.4089849591255188, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 980 + }, + { + "epoch": 1.6851063829787234, + "grad_norm": 0.38276049494743347, + "learning_rate": 0.0002, + "loss": 1.6588, + "step": 990 + }, + { + "epoch": 1.702127659574468, + "grad_norm": 0.40361565351486206, + "learning_rate": 0.0002, + "loss": 1.6846, + "step": 1000 + }, + { + "epoch": 1.7191489361702128, + "grad_norm": 0.3537807762622833, + "learning_rate": 0.0002, + "loss": 1.698, + "step": 1010 + }, + { + "epoch": 1.7361702127659573, + "grad_norm": 0.40288347005844116, + "learning_rate": 0.0002, + "loss": 1.6001, + "step": 1020 + }, + { + "epoch": 1.753191489361702, + "grad_norm": 0.4003616273403168, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1030 + }, + { + "epoch": 1.7702127659574467, + "grad_norm": 0.3931669592857361, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 1040 + }, + { + "epoch": 1.7872340425531914, + "grad_norm": 0.4001635015010834, + "learning_rate": 0.0002, + "loss": 1.6814, + "step": 1050 + }, + { + "epoch": 1.804255319148936, + "grad_norm": 0.4139048457145691, + "learning_rate": 0.0002, + "loss": 1.6158, + "step": 1060 + }, + { + "epoch": 1.8212765957446808, + "grad_norm": 0.5044458508491516, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 1070 + }, + { + "epoch": 1.8382978723404255, + "grad_norm": 0.4827095568180084, + "learning_rate": 0.0002, + "loss": 1.7317, + "step": 1080 + }, + { + "epoch": 1.8553191489361702, + "grad_norm": 0.3750515282154083, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 1090 + }, + { + "epoch": 1.872340425531915, + "grad_norm": 0.4024597704410553, + "learning_rate": 0.0002, + "loss": 1.653, + "step": 1100 + }, + { + "epoch": 1.8893617021276596, + "grad_norm": 0.36747241020202637, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1110 + }, + { + "epoch": 1.9063829787234043, + "grad_norm": 0.41397711634635925, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 1120 + }, + { + "epoch": 1.923404255319149, + "grad_norm": 0.3960763216018677, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 1130 + }, + { + "epoch": 1.9404255319148938, + "grad_norm": 0.4533233344554901, + "learning_rate": 0.0002, + "loss": 1.6857, + "step": 1140 + }, + { + "epoch": 1.9574468085106385, + "grad_norm": 0.38433438539505005, + "learning_rate": 0.0002, + "loss": 1.7168, + "step": 1150 + }, + { + "epoch": 1.974468085106383, + "grad_norm": 0.3648812174797058, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 1160 + }, + { + "epoch": 1.9914893617021276, + "grad_norm": 0.3887176215648651, + "learning_rate": 0.0002, + "loss": 1.6521, + "step": 1170 + }, + { + "epoch": 2.0, + "eval_loss": 1.8328146934509277, + "eval_runtime": 107.2842, + "eval_samples_per_second": 4.8, + "eval_steps_per_second": 0.606, + "step": 1175 + }, + { + "epoch": 2.008510638297872, + "grad_norm": 0.40444880723953247, + "learning_rate": 0.0002, + "loss": 1.6184, + "step": 1180 + }, + { + "epoch": 2.025531914893617, + "grad_norm": 0.3997816741466522, + "learning_rate": 0.0002, + "loss": 1.5221, + "step": 1190 + }, + { + "epoch": 2.0425531914893615, + "grad_norm": 0.4516718089580536, + "learning_rate": 0.0002, + "loss": 1.6022, + "step": 1200 + }, + { + "epoch": 2.0595744680851062, + "grad_norm": 0.6645553708076477, + "learning_rate": 0.0002, + "loss": 1.5458, + "step": 1210 + }, + { + "epoch": 2.076595744680851, + "grad_norm": 0.4181990921497345, + "learning_rate": 0.0002, + "loss": 1.5985, + "step": 1220 + }, + { + "epoch": 2.0936170212765957, + "grad_norm": 0.45681431889533997, + "learning_rate": 0.0002, + "loss": 1.5768, + "step": 1230 + }, + { + "epoch": 2.1106382978723404, + "grad_norm": 0.48914700746536255, + "learning_rate": 0.0002, + "loss": 1.5598, + "step": 1240 + }, + { + "epoch": 2.127659574468085, + "grad_norm": 0.43265485763549805, + "learning_rate": 0.0002, + "loss": 1.6031, + "step": 1250 + }, + { + "epoch": 2.1446808510638298, + "grad_norm": 0.4641207754611969, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 1260 + }, + { + "epoch": 2.1617021276595745, + "grad_norm": 0.4840783476829529, + "learning_rate": 0.0002, + "loss": 1.5686, + "step": 1270 + }, + { + "epoch": 2.178723404255319, + "grad_norm": 0.4974595308303833, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 1280 + }, + { + "epoch": 2.195744680851064, + "grad_norm": 0.5133475661277771, + "learning_rate": 0.0002, + "loss": 1.5512, + "step": 1290 + }, + { + "epoch": 2.2127659574468086, + "grad_norm": 0.5030052065849304, + "learning_rate": 0.0002, + "loss": 1.6467, + "step": 1300 + }, + { + "epoch": 2.2297872340425533, + "grad_norm": 0.46602481603622437, + "learning_rate": 0.0002, + "loss": 1.6845, + "step": 1310 + }, + { + "epoch": 2.246808510638298, + "grad_norm": 0.43662378191947937, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 1320 + }, + { + "epoch": 2.2638297872340427, + "grad_norm": 0.5137454867362976, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 1330 + }, + { + "epoch": 2.2808510638297874, + "grad_norm": 0.4750335216522217, + "learning_rate": 0.0002, + "loss": 1.5702, + "step": 1340 + }, + { + "epoch": 2.297872340425532, + "grad_norm": 0.43691426515579224, + "learning_rate": 0.0002, + "loss": 1.6479, + "step": 1350 + }, + { + "epoch": 2.3148936170212764, + "grad_norm": 0.49752047657966614, + "learning_rate": 0.0002, + "loss": 1.5561, + "step": 1360 + }, + { + "epoch": 2.331914893617021, + "grad_norm": 0.45101815462112427, + "learning_rate": 0.0002, + "loss": 1.5789, + "step": 1370 + }, + { + "epoch": 2.348936170212766, + "grad_norm": 0.4427817761898041, + "learning_rate": 0.0002, + "loss": 1.5858, + "step": 1380 + }, + { + "epoch": 2.3659574468085105, + "grad_norm": 0.4802311062812805, + "learning_rate": 0.0002, + "loss": 1.5537, + "step": 1390 + }, + { + "epoch": 2.382978723404255, + "grad_norm": 0.4512513279914856, + "learning_rate": 0.0002, + "loss": 1.5846, + "step": 1400 + }, + { + "epoch": 2.4, + "grad_norm": 0.4878857135772705, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 1410 + }, + { + "epoch": 2.4170212765957446, + "grad_norm": 0.4741315543651581, + "learning_rate": 0.0002, + "loss": 1.5781, + "step": 1420 + }, + { + "epoch": 2.4340425531914893, + "grad_norm": 0.4770931601524353, + "learning_rate": 0.0002, + "loss": 1.5595, + "step": 1430 + }, + { + "epoch": 2.451063829787234, + "grad_norm": 0.5124667286872864, + "learning_rate": 0.0002, + "loss": 1.5336, + "step": 1440 + }, + { + "epoch": 2.4680851063829787, + "grad_norm": 0.45264801383018494, + "learning_rate": 0.0002, + "loss": 1.5811, + "step": 1450 + }, + { + "epoch": 2.4851063829787234, + "grad_norm": 0.5456924438476562, + "learning_rate": 0.0002, + "loss": 1.604, + "step": 1460 + }, + { + "epoch": 2.502127659574468, + "grad_norm": 0.44656285643577576, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 1470 + }, + { + "epoch": 2.519148936170213, + "grad_norm": 0.5939419865608215, + "learning_rate": 0.0002, + "loss": 1.6336, + "step": 1480 + }, + { + "epoch": 2.5361702127659576, + "grad_norm": 0.47853362560272217, + "learning_rate": 0.0002, + "loss": 1.5481, + "step": 1490 + }, + { + "epoch": 2.5531914893617023, + "grad_norm": 0.47643396258354187, + "learning_rate": 0.0002, + "loss": 1.6543, + "step": 1500 + }, + { + "epoch": 2.570212765957447, + "grad_norm": 0.4939501881599426, + "learning_rate": 0.0002, + "loss": 1.5085, + "step": 1510 + }, + { + "epoch": 2.5872340425531917, + "grad_norm": 0.502055287361145, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 1520 + }, + { + "epoch": 2.604255319148936, + "grad_norm": 0.463250994682312, + "learning_rate": 0.0002, + "loss": 1.6519, + "step": 1530 + }, + { + "epoch": 2.621276595744681, + "grad_norm": 0.4761098623275757, + "learning_rate": 0.0002, + "loss": 1.5698, + "step": 1540 + }, + { + "epoch": 2.6382978723404253, + "grad_norm": 0.4687299132347107, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 1550 + }, + { + "epoch": 2.65531914893617, + "grad_norm": 0.5536078810691833, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 1560 + }, + { + "epoch": 2.6723404255319148, + "grad_norm": 0.581320583820343, + "learning_rate": 0.0002, + "loss": 1.586, + "step": 1570 + }, + { + "epoch": 2.6893617021276595, + "grad_norm": 0.45952868461608887, + "learning_rate": 0.0002, + "loss": 1.508, + "step": 1580 + }, + { + "epoch": 2.706382978723404, + "grad_norm": 0.4602586328983307, + "learning_rate": 0.0002, + "loss": 1.5905, + "step": 1590 + }, + { + "epoch": 2.723404255319149, + "grad_norm": 0.5276554226875305, + "learning_rate": 0.0002, + "loss": 1.6008, + "step": 1600 + }, + { + "epoch": 2.7404255319148936, + "grad_norm": 0.5750249624252319, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 1610 + }, + { + "epoch": 2.7574468085106383, + "grad_norm": 0.468723863363266, + "learning_rate": 0.0002, + "loss": 1.5506, + "step": 1620 + }, + { + "epoch": 2.774468085106383, + "grad_norm": 0.44649943709373474, + "learning_rate": 0.0002, + "loss": 1.4499, + "step": 1630 + }, + { + "epoch": 2.7914893617021277, + "grad_norm": 0.5097237825393677, + "learning_rate": 0.0002, + "loss": 1.5106, + "step": 1640 + }, + { + "epoch": 2.8085106382978724, + "grad_norm": 0.46384191513061523, + "learning_rate": 0.0002, + "loss": 1.5948, + "step": 1650 + }, + { + "epoch": 2.825531914893617, + "grad_norm": 0.4885474443435669, + "learning_rate": 0.0002, + "loss": 1.5828, + "step": 1660 + }, + { + "epoch": 2.842553191489362, + "grad_norm": 0.45621681213378906, + "learning_rate": 0.0002, + "loss": 1.5675, + "step": 1670 + }, + { + "epoch": 2.8595744680851065, + "grad_norm": 0.4797150194644928, + "learning_rate": 0.0002, + "loss": 1.6042, + "step": 1680 + }, + { + "epoch": 2.876595744680851, + "grad_norm": 0.5142032504081726, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 1690 + }, + { + "epoch": 2.8936170212765955, + "grad_norm": 0.48939862847328186, + "learning_rate": 0.0002, + "loss": 1.5984, + "step": 1700 + }, + { + "epoch": 2.9106382978723406, + "grad_norm": 0.4575578272342682, + "learning_rate": 0.0002, + "loss": 1.6333, + "step": 1710 + }, + { + "epoch": 2.927659574468085, + "grad_norm": 0.5589063763618469, + "learning_rate": 0.0002, + "loss": 1.5396, + "step": 1720 + }, + { + "epoch": 2.94468085106383, + "grad_norm": 0.48508813977241516, + "learning_rate": 0.0002, + "loss": 1.6096, + "step": 1730 + }, + { + "epoch": 2.9617021276595743, + "grad_norm": 0.42786726355552673, + "learning_rate": 0.0002, + "loss": 1.5686, + "step": 1740 + }, + { + "epoch": 2.978723404255319, + "grad_norm": 0.5598229765892029, + "learning_rate": 0.0002, + "loss": 1.5555, + "step": 1750 + }, + { + "epoch": 2.9957446808510637, + "grad_norm": 0.4779253602027893, + "learning_rate": 0.0002, + "loss": 1.5035, + "step": 1760 + }, + { + "epoch": 2.999148936170213, + "eval_loss": 1.8543579578399658, + "eval_runtime": 107.2363, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 0.606, + "step": 1762 + }, + { + "epoch": 3.0127659574468084, + "grad_norm": 0.48810940980911255, + "learning_rate": 0.0002, + "loss": 1.4767, + "step": 1770 + }, + { + "epoch": 3.029787234042553, + "grad_norm": 0.6194920539855957, + "learning_rate": 0.0002, + "loss": 1.5385, + "step": 1780 + }, + { + "epoch": 3.046808510638298, + "grad_norm": 0.5875462293624878, + "learning_rate": 0.0002, + "loss": 1.4012, + "step": 1790 + }, + { + "epoch": 3.0638297872340425, + "grad_norm": 0.5775138139724731, + "learning_rate": 0.0002, + "loss": 1.4727, + "step": 1800 + }, + { + "epoch": 3.0808510638297872, + "grad_norm": 0.5445981621742249, + "learning_rate": 0.0002, + "loss": 1.493, + "step": 1810 + }, + { + "epoch": 3.097872340425532, + "grad_norm": 0.6728862524032593, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 1820 + }, + { + "epoch": 3.1148936170212767, + "grad_norm": 0.6105490326881409, + "learning_rate": 0.0002, + "loss": 1.4303, + "step": 1830 + }, + { + "epoch": 3.1319148936170214, + "grad_norm": 0.5771165490150452, + "learning_rate": 0.0002, + "loss": 1.5214, + "step": 1840 + }, + { + "epoch": 3.148936170212766, + "grad_norm": 0.5778449773788452, + "learning_rate": 0.0002, + "loss": 1.4359, + "step": 1850 + }, + { + "epoch": 3.1659574468085108, + "grad_norm": 0.7141990661621094, + "learning_rate": 0.0002, + "loss": 1.4121, + "step": 1860 + }, + { + "epoch": 3.1829787234042555, + "grad_norm": 0.5882705450057983, + "learning_rate": 0.0002, + "loss": 1.4904, + "step": 1870 + }, + { + "epoch": 3.2, + "grad_norm": 0.5996195077896118, + "learning_rate": 0.0002, + "loss": 1.4941, + "step": 1880 + }, + { + "epoch": 3.217021276595745, + "grad_norm": 0.6121219396591187, + "learning_rate": 0.0002, + "loss": 1.4519, + "step": 1890 + }, + { + "epoch": 3.2340425531914896, + "grad_norm": 0.6402981281280518, + "learning_rate": 0.0002, + "loss": 1.4586, + "step": 1900 + }, + { + "epoch": 3.251063829787234, + "grad_norm": 0.6111783981323242, + "learning_rate": 0.0002, + "loss": 1.3766, + "step": 1910 + }, + { + "epoch": 3.2680851063829786, + "grad_norm": 0.6682435274124146, + "learning_rate": 0.0002, + "loss": 1.4863, + "step": 1920 + }, + { + "epoch": 3.2851063829787233, + "grad_norm": 0.6530760526657104, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 1930 + }, + { + "epoch": 3.302127659574468, + "grad_norm": 0.6481217741966248, + "learning_rate": 0.0002, + "loss": 1.4422, + "step": 1940 + }, + { + "epoch": 3.3191489361702127, + "grad_norm": 0.6270697116851807, + "learning_rate": 0.0002, + "loss": 1.5158, + "step": 1950 + }, + { + "epoch": 3.3361702127659574, + "grad_norm": 0.5924492478370667, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 1960 + }, + { + "epoch": 3.353191489361702, + "grad_norm": 0.5803806781768799, + "learning_rate": 0.0002, + "loss": 1.4578, + "step": 1970 + }, + { + "epoch": 3.370212765957447, + "grad_norm": 0.5754119157791138, + "learning_rate": 0.0002, + "loss": 1.4689, + "step": 1980 + }, + { + "epoch": 3.3872340425531915, + "grad_norm": 0.6717178821563721, + "learning_rate": 0.0002, + "loss": 1.4605, + "step": 1990 + }, + { + "epoch": 3.404255319148936, + "grad_norm": 0.5955582857131958, + "learning_rate": 0.0002, + "loss": 1.486, + "step": 2000 + }, + { + "epoch": 3.421276595744681, + "grad_norm": 0.6965329647064209, + "learning_rate": 0.0002, + "loss": 1.4445, + "step": 2010 + }, + { + "epoch": 3.4382978723404256, + "grad_norm": 0.6321573257446289, + "learning_rate": 0.0002, + "loss": 1.4543, + "step": 2020 + }, + { + "epoch": 3.4553191489361703, + "grad_norm": 0.5952608585357666, + "learning_rate": 0.0002, + "loss": 1.5383, + "step": 2030 + }, + { + "epoch": 3.472340425531915, + "grad_norm": 0.7718905806541443, + "learning_rate": 0.0002, + "loss": 1.4531, + "step": 2040 + }, + { + "epoch": 3.4893617021276597, + "grad_norm": 0.6850892305374146, + "learning_rate": 0.0002, + "loss": 1.4678, + "step": 2050 + }, + { + "epoch": 3.506382978723404, + "grad_norm": 0.5638895630836487, + "learning_rate": 0.0002, + "loss": 1.4956, + "step": 2060 + }, + { + "epoch": 3.523404255319149, + "grad_norm": 0.6148294806480408, + "learning_rate": 0.0002, + "loss": 1.4586, + "step": 2070 + }, + { + "epoch": 3.5404255319148934, + "grad_norm": 0.5895810723304749, + "learning_rate": 0.0002, + "loss": 1.4622, + "step": 2080 + }, + { + "epoch": 3.5574468085106385, + "grad_norm": 0.6377319693565369, + "learning_rate": 0.0002, + "loss": 1.4341, + "step": 2090 + }, + { + "epoch": 3.574468085106383, + "grad_norm": 0.6047691702842712, + "learning_rate": 0.0002, + "loss": 1.5056, + "step": 2100 + }, + { + "epoch": 3.5914893617021275, + "grad_norm": 0.6049593687057495, + "learning_rate": 0.0002, + "loss": 1.4748, + "step": 2110 + }, + { + "epoch": 3.608510638297872, + "grad_norm": 0.6358312368392944, + "learning_rate": 0.0002, + "loss": 1.391, + "step": 2120 + }, + { + "epoch": 3.625531914893617, + "grad_norm": 0.612119197845459, + "learning_rate": 0.0002, + "loss": 1.4419, + "step": 2130 + }, + { + "epoch": 3.6425531914893616, + "grad_norm": 0.6788054704666138, + "learning_rate": 0.0002, + "loss": 1.438, + "step": 2140 + }, + { + "epoch": 3.6595744680851063, + "grad_norm": 0.6191043853759766, + "learning_rate": 0.0002, + "loss": 1.4295, + "step": 2150 + }, + { + "epoch": 3.676595744680851, + "grad_norm": 0.6660051941871643, + "learning_rate": 0.0002, + "loss": 1.4383, + "step": 2160 + }, + { + "epoch": 3.6936170212765957, + "grad_norm": 0.652692973613739, + "learning_rate": 0.0002, + "loss": 1.4954, + "step": 2170 + }, + { + "epoch": 3.7106382978723405, + "grad_norm": 0.6123467087745667, + "learning_rate": 0.0002, + "loss": 1.5245, + "step": 2180 + }, + { + "epoch": 3.727659574468085, + "grad_norm": 0.640021562576294, + "learning_rate": 0.0002, + "loss": 1.4686, + "step": 2190 + }, + { + "epoch": 3.74468085106383, + "grad_norm": 0.6809179782867432, + "learning_rate": 0.0002, + "loss": 1.4277, + "step": 2200 + }, + { + "epoch": 3.7617021276595746, + "grad_norm": 0.5978420376777649, + "learning_rate": 0.0002, + "loss": 1.4705, + "step": 2210 + }, + { + "epoch": 3.7787234042553193, + "grad_norm": 0.7038803100585938, + "learning_rate": 0.0002, + "loss": 1.5559, + "step": 2220 + }, + { + "epoch": 3.795744680851064, + "grad_norm": 0.5324276089668274, + "learning_rate": 0.0002, + "loss": 1.4691, + "step": 2230 + }, + { + "epoch": 3.8127659574468087, + "grad_norm": 0.6264132857322693, + "learning_rate": 0.0002, + "loss": 1.4696, + "step": 2240 + }, + { + "epoch": 3.829787234042553, + "grad_norm": 0.6143888831138611, + "learning_rate": 0.0002, + "loss": 1.4856, + "step": 2250 + }, + { + "epoch": 3.846808510638298, + "grad_norm": 0.6338503360748291, + "learning_rate": 0.0002, + "loss": 1.535, + "step": 2260 + }, + { + "epoch": 3.8638297872340424, + "grad_norm": 0.556882381439209, + "learning_rate": 0.0002, + "loss": 1.456, + "step": 2270 + }, + { + "epoch": 3.8808510638297875, + "grad_norm": 0.6323680281639099, + "learning_rate": 0.0002, + "loss": 1.4701, + "step": 2280 + }, + { + "epoch": 3.8978723404255318, + "grad_norm": 0.7105869054794312, + "learning_rate": 0.0002, + "loss": 1.5333, + "step": 2290 + }, + { + "epoch": 3.9148936170212765, + "grad_norm": 0.825415849685669, + "learning_rate": 0.0002, + "loss": 1.4462, + "step": 2300 + }, + { + "epoch": 3.931914893617021, + "grad_norm": 0.6412091851234436, + "learning_rate": 0.0002, + "loss": 1.5023, + "step": 2310 + }, + { + "epoch": 3.948936170212766, + "grad_norm": 0.6286490559577942, + "learning_rate": 0.0002, + "loss": 1.3709, + "step": 2320 + }, + { + "epoch": 3.9659574468085106, + "grad_norm": 0.636021077632904, + "learning_rate": 0.0002, + "loss": 1.4693, + "step": 2330 + }, + { + "epoch": 3.9829787234042553, + "grad_norm": 0.6032362580299377, + "learning_rate": 0.0002, + "loss": 1.4265, + "step": 2340 + }, + { + "epoch": 4.0, + "grad_norm": 0.6497282385826111, + "learning_rate": 0.0002, + "loss": 1.377, + "step": 2350 + }, + { + "epoch": 4.0, + "eval_loss": 1.9081238508224487, + "eval_runtime": 106.6404, + "eval_samples_per_second": 4.829, + "eval_steps_per_second": 0.61, + "step": 2350 + } + ], + "logging_steps": 10, + "max_steps": 4696, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.087527381368832e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..40bc5c2205bcd1402ce337f3a218ef4840b3870a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2350/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:148f55f87c9f695bdf2cf6d54b37e690c06ff5da5e17bc3af6c1f44f90f45374 +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3ecb710937af1cf967225df96c250bb9d28f5b5b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9641c9619a61a110d772b5e525e33255f1e8b6ca5e732a5d7d19c488e5b429cb +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d9f3bc44fd58d9699e8fb7db686fcfd0d06694a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:609607b52c8150cf7e3821c560ea35d0fffa0e82aba557e93b77f549444fd338 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..62d313ae8e27126c507de43c402b1ccc1d8afc49 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c5fae1d6cac091e4f5647e460dc276b096409f1fc50a3df6a98a270354f1f7b +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..dfd4313220139c330ad8663fa8918d0d330c68e9 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46642390a665c7cb165dcb0ac50a980d0d684841c635ed6f7bf44f1147872f99 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d3f5b2a44f53735bea8f3fa842bd9a93aa80bc55 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/trainer_state.json @@ -0,0 +1,2124 @@ +{ + "best_metric": 1.8328146934509277, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175", + "epoch": 4.999148936170212, + "eval_steps": 10, + "global_step": 2937, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01702127659574468, + "grad_norm": 0.7596228122711182, + "learning_rate": 0.0002, + "loss": 2.5996, + "step": 10 + }, + { + "epoch": 0.03404255319148936, + "grad_norm": 0.4860903322696686, + "learning_rate": 0.0002, + "loss": 2.2941, + "step": 20 + }, + { + "epoch": 0.05106382978723404, + "grad_norm": 0.4953401982784271, + "learning_rate": 0.0002, + "loss": 2.0018, + "step": 30 + }, + { + "epoch": 0.06808510638297872, + "grad_norm": 0.5086901783943176, + "learning_rate": 0.0002, + "loss": 1.9318, + "step": 40 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 0.49050021171569824, + "learning_rate": 0.0002, + "loss": 1.8519, + "step": 50 + }, + { + "epoch": 0.10212765957446808, + "grad_norm": 0.4922358989715576, + "learning_rate": 0.0002, + "loss": 1.8786, + "step": 60 + }, + { + "epoch": 0.11914893617021277, + "grad_norm": 0.4621541202068329, + "learning_rate": 0.0002, + "loss": 1.8812, + "step": 70 + }, + { + "epoch": 0.13617021276595745, + "grad_norm": 0.4416729807853699, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 80 + }, + { + "epoch": 0.15319148936170213, + "grad_norm": 0.526258111000061, + "learning_rate": 0.0002, + "loss": 1.9298, + "step": 90 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 0.44022637605667114, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 100 + }, + { + "epoch": 0.18723404255319148, + "grad_norm": 0.4647711515426636, + "learning_rate": 0.0002, + "loss": 1.7681, + "step": 110 + }, + { + "epoch": 0.20425531914893616, + "grad_norm": 0.4136318564414978, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 120 + }, + { + "epoch": 0.22127659574468084, + "grad_norm": 0.39707672595977783, + "learning_rate": 0.0002, + "loss": 1.8491, + "step": 130 + }, + { + "epoch": 0.23829787234042554, + "grad_norm": 0.4478105306625366, + "learning_rate": 0.0002, + "loss": 1.8017, + "step": 140 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 0.4699741303920746, + "learning_rate": 0.0002, + "loss": 1.8067, + "step": 150 + }, + { + "epoch": 0.2723404255319149, + "grad_norm": 0.4568363130092621, + "learning_rate": 0.0002, + "loss": 1.8161, + "step": 160 + }, + { + "epoch": 0.28936170212765955, + "grad_norm": 0.45078757405281067, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 170 + }, + { + "epoch": 0.30638297872340425, + "grad_norm": 0.4127245843410492, + "learning_rate": 0.0002, + "loss": 1.8115, + "step": 180 + }, + { + "epoch": 0.32340425531914896, + "grad_norm": 0.4042493402957916, + "learning_rate": 0.0002, + "loss": 1.7707, + "step": 190 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 0.401487797498703, + "learning_rate": 0.0002, + "loss": 1.7807, + "step": 200 + }, + { + "epoch": 0.3574468085106383, + "grad_norm": 0.3959457576274872, + "learning_rate": 0.0002, + "loss": 1.8355, + "step": 210 + }, + { + "epoch": 0.37446808510638296, + "grad_norm": 0.39865636825561523, + "learning_rate": 0.0002, + "loss": 1.7798, + "step": 220 + }, + { + "epoch": 0.39148936170212767, + "grad_norm": 0.7225169539451599, + "learning_rate": 0.0002, + "loss": 1.9012, + "step": 230 + }, + { + "epoch": 0.4085106382978723, + "grad_norm": 0.412801593542099, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 240 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 0.40951448678970337, + "learning_rate": 0.0002, + "loss": 1.8523, + "step": 250 + }, + { + "epoch": 0.4425531914893617, + "grad_norm": 0.42788130044937134, + "learning_rate": 0.0002, + "loss": 1.7283, + "step": 260 + }, + { + "epoch": 0.4595744680851064, + "grad_norm": 0.41069576144218445, + "learning_rate": 0.0002, + "loss": 1.8051, + "step": 270 + }, + { + "epoch": 0.4765957446808511, + "grad_norm": 0.3745323717594147, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 280 + }, + { + "epoch": 0.49361702127659574, + "grad_norm": 0.3771323263645172, + "learning_rate": 0.0002, + "loss": 1.8484, + "step": 290 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 0.34368929266929626, + "learning_rate": 0.0002, + "loss": 1.7417, + "step": 300 + }, + { + "epoch": 0.5276595744680851, + "grad_norm": 0.4299296736717224, + "learning_rate": 0.0002, + "loss": 1.7394, + "step": 310 + }, + { + "epoch": 0.5446808510638298, + "grad_norm": 0.4133922755718231, + "learning_rate": 0.0002, + "loss": 1.8255, + "step": 320 + }, + { + "epoch": 0.5617021276595745, + "grad_norm": 0.3984859585762024, + "learning_rate": 0.0002, + "loss": 1.7043, + "step": 330 + }, + { + "epoch": 0.5787234042553191, + "grad_norm": 0.3822788894176483, + "learning_rate": 0.0002, + "loss": 1.7873, + "step": 340 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 0.4550061821937561, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 350 + }, + { + "epoch": 0.6127659574468085, + "grad_norm": 0.36571192741394043, + "learning_rate": 0.0002, + "loss": 1.7179, + "step": 360 + }, + { + "epoch": 0.6297872340425532, + "grad_norm": 0.32942914962768555, + "learning_rate": 0.0002, + "loss": 1.8196, + "step": 370 + }, + { + "epoch": 0.6468085106382979, + "grad_norm": 0.39299526810646057, + "learning_rate": 0.0002, + "loss": 1.7118, + "step": 380 + }, + { + "epoch": 0.6638297872340425, + "grad_norm": 0.3817657232284546, + "learning_rate": 0.0002, + "loss": 1.8179, + "step": 390 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 0.3650810122489929, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 400 + }, + { + "epoch": 0.6978723404255319, + "grad_norm": 0.3736686408519745, + "learning_rate": 0.0002, + "loss": 1.8026, + "step": 410 + }, + { + "epoch": 0.7148936170212766, + "grad_norm": 0.45680564641952515, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 420 + }, + { + "epoch": 0.7319148936170212, + "grad_norm": 0.4154510200023651, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 430 + }, + { + "epoch": 0.7489361702127659, + "grad_norm": 0.3701167106628418, + "learning_rate": 0.0002, + "loss": 1.7801, + "step": 440 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 0.3869531750679016, + "learning_rate": 0.0002, + "loss": 1.7689, + "step": 450 + }, + { + "epoch": 0.7829787234042553, + "grad_norm": 0.4391495883464813, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 460 + }, + { + "epoch": 0.8, + "grad_norm": 0.39652755856513977, + "learning_rate": 0.0002, + "loss": 1.7534, + "step": 470 + }, + { + "epoch": 0.8170212765957446, + "grad_norm": 0.4096752107143402, + "learning_rate": 0.0002, + "loss": 1.8076, + "step": 480 + }, + { + "epoch": 0.8340425531914893, + "grad_norm": 0.3857504427433014, + "learning_rate": 0.0002, + "loss": 1.796, + "step": 490 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 0.4105374217033386, + "learning_rate": 0.0002, + "loss": 1.7379, + "step": 500 + }, + { + "epoch": 0.8680851063829788, + "grad_norm": 0.3723328113555908, + "learning_rate": 0.0002, + "loss": 1.6391, + "step": 510 + }, + { + "epoch": 0.8851063829787233, + "grad_norm": 0.36099690198898315, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 520 + }, + { + "epoch": 0.902127659574468, + "grad_norm": 0.3715187907218933, + "learning_rate": 0.0002, + "loss": 1.7632, + "step": 530 + }, + { + "epoch": 0.9191489361702128, + "grad_norm": 0.4932813048362732, + "learning_rate": 0.0002, + "loss": 1.7004, + "step": 540 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 0.3493495285511017, + "learning_rate": 0.0002, + "loss": 1.679, + "step": 550 + }, + { + "epoch": 0.9531914893617022, + "grad_norm": 0.3598061800003052, + "learning_rate": 0.0002, + "loss": 1.7758, + "step": 560 + }, + { + "epoch": 0.9702127659574468, + "grad_norm": 0.3521560728549957, + "learning_rate": 0.0002, + "loss": 1.7686, + "step": 570 + }, + { + "epoch": 0.9872340425531915, + "grad_norm": 0.34150034189224243, + "learning_rate": 0.0002, + "loss": 1.7391, + "step": 580 + }, + { + "epoch": 0.9991489361702127, + "eval_loss": 1.8388911485671997, + "eval_runtime": 106.6788, + "eval_samples_per_second": 4.828, + "eval_steps_per_second": 0.609, + "step": 587 + }, + { + "epoch": 1.004255319148936, + "grad_norm": 0.33556103706359863, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 590 + }, + { + "epoch": 1.0212765957446808, + "grad_norm": 0.4333398640155792, + "learning_rate": 0.0002, + "loss": 1.8346, + "step": 600 + }, + { + "epoch": 1.0382978723404255, + "grad_norm": 0.38488736748695374, + "learning_rate": 0.0002, + "loss": 1.6656, + "step": 610 + }, + { + "epoch": 1.0553191489361702, + "grad_norm": 0.44454529881477356, + "learning_rate": 0.0002, + "loss": 1.7778, + "step": 620 + }, + { + "epoch": 1.0723404255319149, + "grad_norm": 0.3735603392124176, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 630 + }, + { + "epoch": 1.0893617021276596, + "grad_norm": 0.38912704586982727, + "learning_rate": 0.0002, + "loss": 1.74, + "step": 640 + }, + { + "epoch": 1.1063829787234043, + "grad_norm": 0.4411826431751251, + "learning_rate": 0.0002, + "loss": 1.607, + "step": 650 + }, + { + "epoch": 1.123404255319149, + "grad_norm": 0.4163050353527069, + "learning_rate": 0.0002, + "loss": 1.6901, + "step": 660 + }, + { + "epoch": 1.1404255319148937, + "grad_norm": 0.4187192916870117, + "learning_rate": 0.0002, + "loss": 1.6722, + "step": 670 + }, + { + "epoch": 1.1574468085106382, + "grad_norm": 0.3797093629837036, + "learning_rate": 0.0002, + "loss": 1.7061, + "step": 680 + }, + { + "epoch": 1.174468085106383, + "grad_norm": 0.4210026264190674, + "learning_rate": 0.0002, + "loss": 1.6409, + "step": 690 + }, + { + "epoch": 1.1914893617021276, + "grad_norm": 0.4701998829841614, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 700 + }, + { + "epoch": 1.2085106382978723, + "grad_norm": 0.6331578493118286, + "learning_rate": 0.0002, + "loss": 1.6773, + "step": 710 + }, + { + "epoch": 1.225531914893617, + "grad_norm": 0.41908255219459534, + "learning_rate": 0.0002, + "loss": 1.6776, + "step": 720 + }, + { + "epoch": 1.2425531914893617, + "grad_norm": 0.36158403754234314, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 730 + }, + { + "epoch": 1.2595744680851064, + "grad_norm": 0.387300580739975, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 740 + }, + { + "epoch": 1.2765957446808511, + "grad_norm": 0.38899728655815125, + "learning_rate": 0.0002, + "loss": 1.7089, + "step": 750 + }, + { + "epoch": 1.2936170212765958, + "grad_norm": 0.4549255073070526, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 760 + }, + { + "epoch": 1.3106382978723405, + "grad_norm": 0.4052349328994751, + "learning_rate": 0.0002, + "loss": 1.6754, + "step": 770 + }, + { + "epoch": 1.327659574468085, + "grad_norm": 0.38934215903282166, + "learning_rate": 0.0002, + "loss": 1.6483, + "step": 780 + }, + { + "epoch": 1.3446808510638297, + "grad_norm": 0.38688382506370544, + "learning_rate": 0.0002, + "loss": 1.7813, + "step": 790 + }, + { + "epoch": 1.3617021276595744, + "grad_norm": 0.3825705051422119, + "learning_rate": 0.0002, + "loss": 1.6374, + "step": 800 + }, + { + "epoch": 1.3787234042553191, + "grad_norm": 0.37331756949424744, + "learning_rate": 0.0002, + "loss": 1.655, + "step": 810 + }, + { + "epoch": 1.3957446808510638, + "grad_norm": 0.38826408982276917, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 820 + }, + { + "epoch": 1.4127659574468086, + "grad_norm": 0.4213569164276123, + "learning_rate": 0.0002, + "loss": 1.6107, + "step": 830 + }, + { + "epoch": 1.4297872340425533, + "grad_norm": 0.3976684808731079, + "learning_rate": 0.0002, + "loss": 1.6792, + "step": 840 + }, + { + "epoch": 1.4468085106382977, + "grad_norm": 0.390009343624115, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 850 + }, + { + "epoch": 1.4638297872340424, + "grad_norm": 0.4462052583694458, + "learning_rate": 0.0002, + "loss": 1.7489, + "step": 860 + }, + { + "epoch": 1.4808510638297872, + "grad_norm": 0.42129236459732056, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 870 + }, + { + "epoch": 1.4978723404255319, + "grad_norm": 0.41489893198013306, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 880 + }, + { + "epoch": 1.5148936170212766, + "grad_norm": 0.41451677680015564, + "learning_rate": 0.0002, + "loss": 1.7129, + "step": 890 + }, + { + "epoch": 1.5319148936170213, + "grad_norm": 0.4477299749851227, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 900 + }, + { + "epoch": 1.548936170212766, + "grad_norm": 0.38476648926734924, + "learning_rate": 0.0002, + "loss": 1.6543, + "step": 910 + }, + { + "epoch": 1.5659574468085107, + "grad_norm": 0.42755743861198425, + "learning_rate": 0.0002, + "loss": 1.7103, + "step": 920 + }, + { + "epoch": 1.5829787234042554, + "grad_norm": 0.39372023940086365, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 930 + }, + { + "epoch": 1.6, + "grad_norm": 0.42778754234313965, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 940 + }, + { + "epoch": 1.6170212765957448, + "grad_norm": 0.4217268228530884, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 950 + }, + { + "epoch": 1.6340425531914895, + "grad_norm": 0.40452107787132263, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 960 + }, + { + "epoch": 1.6510638297872342, + "grad_norm": 0.4259980022907257, + "learning_rate": 0.0002, + "loss": 1.6734, + "step": 970 + }, + { + "epoch": 1.6680851063829787, + "grad_norm": 0.4089849591255188, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 980 + }, + { + "epoch": 1.6851063829787234, + "grad_norm": 0.38276049494743347, + "learning_rate": 0.0002, + "loss": 1.6588, + "step": 990 + }, + { + "epoch": 1.702127659574468, + "grad_norm": 0.40361565351486206, + "learning_rate": 0.0002, + "loss": 1.6846, + "step": 1000 + }, + { + "epoch": 1.7191489361702128, + "grad_norm": 0.3537807762622833, + "learning_rate": 0.0002, + "loss": 1.698, + "step": 1010 + }, + { + "epoch": 1.7361702127659573, + "grad_norm": 0.40288347005844116, + "learning_rate": 0.0002, + "loss": 1.6001, + "step": 1020 + }, + { + "epoch": 1.753191489361702, + "grad_norm": 0.4003616273403168, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1030 + }, + { + "epoch": 1.7702127659574467, + "grad_norm": 0.3931669592857361, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 1040 + }, + { + "epoch": 1.7872340425531914, + "grad_norm": 0.4001635015010834, + "learning_rate": 0.0002, + "loss": 1.6814, + "step": 1050 + }, + { + "epoch": 1.804255319148936, + "grad_norm": 0.4139048457145691, + "learning_rate": 0.0002, + "loss": 1.6158, + "step": 1060 + }, + { + "epoch": 1.8212765957446808, + "grad_norm": 0.5044458508491516, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 1070 + }, + { + "epoch": 1.8382978723404255, + "grad_norm": 0.4827095568180084, + "learning_rate": 0.0002, + "loss": 1.7317, + "step": 1080 + }, + { + "epoch": 1.8553191489361702, + "grad_norm": 0.3750515282154083, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 1090 + }, + { + "epoch": 1.872340425531915, + "grad_norm": 0.4024597704410553, + "learning_rate": 0.0002, + "loss": 1.653, + "step": 1100 + }, + { + "epoch": 1.8893617021276596, + "grad_norm": 0.36747241020202637, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1110 + }, + { + "epoch": 1.9063829787234043, + "grad_norm": 0.41397711634635925, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 1120 + }, + { + "epoch": 1.923404255319149, + "grad_norm": 0.3960763216018677, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 1130 + }, + { + "epoch": 1.9404255319148938, + "grad_norm": 0.4533233344554901, + "learning_rate": 0.0002, + "loss": 1.6857, + "step": 1140 + }, + { + "epoch": 1.9574468085106385, + "grad_norm": 0.38433438539505005, + "learning_rate": 0.0002, + "loss": 1.7168, + "step": 1150 + }, + { + "epoch": 1.974468085106383, + "grad_norm": 0.3648812174797058, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 1160 + }, + { + "epoch": 1.9914893617021276, + "grad_norm": 0.3887176215648651, + "learning_rate": 0.0002, + "loss": 1.6521, + "step": 1170 + }, + { + "epoch": 2.0, + "eval_loss": 1.8328146934509277, + "eval_runtime": 107.2842, + "eval_samples_per_second": 4.8, + "eval_steps_per_second": 0.606, + "step": 1175 + }, + { + "epoch": 2.008510638297872, + "grad_norm": 0.40444880723953247, + "learning_rate": 0.0002, + "loss": 1.6184, + "step": 1180 + }, + { + "epoch": 2.025531914893617, + "grad_norm": 0.3997816741466522, + "learning_rate": 0.0002, + "loss": 1.5221, + "step": 1190 + }, + { + "epoch": 2.0425531914893615, + "grad_norm": 0.4516718089580536, + "learning_rate": 0.0002, + "loss": 1.6022, + "step": 1200 + }, + { + "epoch": 2.0595744680851062, + "grad_norm": 0.6645553708076477, + "learning_rate": 0.0002, + "loss": 1.5458, + "step": 1210 + }, + { + "epoch": 2.076595744680851, + "grad_norm": 0.4181990921497345, + "learning_rate": 0.0002, + "loss": 1.5985, + "step": 1220 + }, + { + "epoch": 2.0936170212765957, + "grad_norm": 0.45681431889533997, + "learning_rate": 0.0002, + "loss": 1.5768, + "step": 1230 + }, + { + "epoch": 2.1106382978723404, + "grad_norm": 0.48914700746536255, + "learning_rate": 0.0002, + "loss": 1.5598, + "step": 1240 + }, + { + "epoch": 2.127659574468085, + "grad_norm": 0.43265485763549805, + "learning_rate": 0.0002, + "loss": 1.6031, + "step": 1250 + }, + { + "epoch": 2.1446808510638298, + "grad_norm": 0.4641207754611969, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 1260 + }, + { + "epoch": 2.1617021276595745, + "grad_norm": 0.4840783476829529, + "learning_rate": 0.0002, + "loss": 1.5686, + "step": 1270 + }, + { + "epoch": 2.178723404255319, + "grad_norm": 0.4974595308303833, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 1280 + }, + { + "epoch": 2.195744680851064, + "grad_norm": 0.5133475661277771, + "learning_rate": 0.0002, + "loss": 1.5512, + "step": 1290 + }, + { + "epoch": 2.2127659574468086, + "grad_norm": 0.5030052065849304, + "learning_rate": 0.0002, + "loss": 1.6467, + "step": 1300 + }, + { + "epoch": 2.2297872340425533, + "grad_norm": 0.46602481603622437, + "learning_rate": 0.0002, + "loss": 1.6845, + "step": 1310 + }, + { + "epoch": 2.246808510638298, + "grad_norm": 0.43662378191947937, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 1320 + }, + { + "epoch": 2.2638297872340427, + "grad_norm": 0.5137454867362976, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 1330 + }, + { + "epoch": 2.2808510638297874, + "grad_norm": 0.4750335216522217, + "learning_rate": 0.0002, + "loss": 1.5702, + "step": 1340 + }, + { + "epoch": 2.297872340425532, + "grad_norm": 0.43691426515579224, + "learning_rate": 0.0002, + "loss": 1.6479, + "step": 1350 + }, + { + "epoch": 2.3148936170212764, + "grad_norm": 0.49752047657966614, + "learning_rate": 0.0002, + "loss": 1.5561, + "step": 1360 + }, + { + "epoch": 2.331914893617021, + "grad_norm": 0.45101815462112427, + "learning_rate": 0.0002, + "loss": 1.5789, + "step": 1370 + }, + { + "epoch": 2.348936170212766, + "grad_norm": 0.4427817761898041, + "learning_rate": 0.0002, + "loss": 1.5858, + "step": 1380 + }, + { + "epoch": 2.3659574468085105, + "grad_norm": 0.4802311062812805, + "learning_rate": 0.0002, + "loss": 1.5537, + "step": 1390 + }, + { + "epoch": 2.382978723404255, + "grad_norm": 0.4512513279914856, + "learning_rate": 0.0002, + "loss": 1.5846, + "step": 1400 + }, + { + "epoch": 2.4, + "grad_norm": 0.4878857135772705, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 1410 + }, + { + "epoch": 2.4170212765957446, + "grad_norm": 0.4741315543651581, + "learning_rate": 0.0002, + "loss": 1.5781, + "step": 1420 + }, + { + "epoch": 2.4340425531914893, + "grad_norm": 0.4770931601524353, + "learning_rate": 0.0002, + "loss": 1.5595, + "step": 1430 + }, + { + "epoch": 2.451063829787234, + "grad_norm": 0.5124667286872864, + "learning_rate": 0.0002, + "loss": 1.5336, + "step": 1440 + }, + { + "epoch": 2.4680851063829787, + "grad_norm": 0.45264801383018494, + "learning_rate": 0.0002, + "loss": 1.5811, + "step": 1450 + }, + { + "epoch": 2.4851063829787234, + "grad_norm": 0.5456924438476562, + "learning_rate": 0.0002, + "loss": 1.604, + "step": 1460 + }, + { + "epoch": 2.502127659574468, + "grad_norm": 0.44656285643577576, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 1470 + }, + { + "epoch": 2.519148936170213, + "grad_norm": 0.5939419865608215, + "learning_rate": 0.0002, + "loss": 1.6336, + "step": 1480 + }, + { + "epoch": 2.5361702127659576, + "grad_norm": 0.47853362560272217, + "learning_rate": 0.0002, + "loss": 1.5481, + "step": 1490 + }, + { + "epoch": 2.5531914893617023, + "grad_norm": 0.47643396258354187, + "learning_rate": 0.0002, + "loss": 1.6543, + "step": 1500 + }, + { + "epoch": 2.570212765957447, + "grad_norm": 0.4939501881599426, + "learning_rate": 0.0002, + "loss": 1.5085, + "step": 1510 + }, + { + "epoch": 2.5872340425531917, + "grad_norm": 0.502055287361145, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 1520 + }, + { + "epoch": 2.604255319148936, + "grad_norm": 0.463250994682312, + "learning_rate": 0.0002, + "loss": 1.6519, + "step": 1530 + }, + { + "epoch": 2.621276595744681, + "grad_norm": 0.4761098623275757, + "learning_rate": 0.0002, + "loss": 1.5698, + "step": 1540 + }, + { + "epoch": 2.6382978723404253, + "grad_norm": 0.4687299132347107, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 1550 + }, + { + "epoch": 2.65531914893617, + "grad_norm": 0.5536078810691833, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 1560 + }, + { + "epoch": 2.6723404255319148, + "grad_norm": 0.581320583820343, + "learning_rate": 0.0002, + "loss": 1.586, + "step": 1570 + }, + { + "epoch": 2.6893617021276595, + "grad_norm": 0.45952868461608887, + "learning_rate": 0.0002, + "loss": 1.508, + "step": 1580 + }, + { + "epoch": 2.706382978723404, + "grad_norm": 0.4602586328983307, + "learning_rate": 0.0002, + "loss": 1.5905, + "step": 1590 + }, + { + "epoch": 2.723404255319149, + "grad_norm": 0.5276554226875305, + "learning_rate": 0.0002, + "loss": 1.6008, + "step": 1600 + }, + { + "epoch": 2.7404255319148936, + "grad_norm": 0.5750249624252319, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 1610 + }, + { + "epoch": 2.7574468085106383, + "grad_norm": 0.468723863363266, + "learning_rate": 0.0002, + "loss": 1.5506, + "step": 1620 + }, + { + "epoch": 2.774468085106383, + "grad_norm": 0.44649943709373474, + "learning_rate": 0.0002, + "loss": 1.4499, + "step": 1630 + }, + { + "epoch": 2.7914893617021277, + "grad_norm": 0.5097237825393677, + "learning_rate": 0.0002, + "loss": 1.5106, + "step": 1640 + }, + { + "epoch": 2.8085106382978724, + "grad_norm": 0.46384191513061523, + "learning_rate": 0.0002, + "loss": 1.5948, + "step": 1650 + }, + { + "epoch": 2.825531914893617, + "grad_norm": 0.4885474443435669, + "learning_rate": 0.0002, + "loss": 1.5828, + "step": 1660 + }, + { + "epoch": 2.842553191489362, + "grad_norm": 0.45621681213378906, + "learning_rate": 0.0002, + "loss": 1.5675, + "step": 1670 + }, + { + "epoch": 2.8595744680851065, + "grad_norm": 0.4797150194644928, + "learning_rate": 0.0002, + "loss": 1.6042, + "step": 1680 + }, + { + "epoch": 2.876595744680851, + "grad_norm": 0.5142032504081726, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 1690 + }, + { + "epoch": 2.8936170212765955, + "grad_norm": 0.48939862847328186, + "learning_rate": 0.0002, + "loss": 1.5984, + "step": 1700 + }, + { + "epoch": 2.9106382978723406, + "grad_norm": 0.4575578272342682, + "learning_rate": 0.0002, + "loss": 1.6333, + "step": 1710 + }, + { + "epoch": 2.927659574468085, + "grad_norm": 0.5589063763618469, + "learning_rate": 0.0002, + "loss": 1.5396, + "step": 1720 + }, + { + "epoch": 2.94468085106383, + "grad_norm": 0.48508813977241516, + "learning_rate": 0.0002, + "loss": 1.6096, + "step": 1730 + }, + { + "epoch": 2.9617021276595743, + "grad_norm": 0.42786726355552673, + "learning_rate": 0.0002, + "loss": 1.5686, + "step": 1740 + }, + { + "epoch": 2.978723404255319, + "grad_norm": 0.5598229765892029, + "learning_rate": 0.0002, + "loss": 1.5555, + "step": 1750 + }, + { + "epoch": 2.9957446808510637, + "grad_norm": 0.4779253602027893, + "learning_rate": 0.0002, + "loss": 1.5035, + "step": 1760 + }, + { + "epoch": 2.999148936170213, + "eval_loss": 1.8543579578399658, + "eval_runtime": 107.2363, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 0.606, + "step": 1762 + }, + { + "epoch": 3.0127659574468084, + "grad_norm": 0.48810940980911255, + "learning_rate": 0.0002, + "loss": 1.4767, + "step": 1770 + }, + { + "epoch": 3.029787234042553, + "grad_norm": 0.6194920539855957, + "learning_rate": 0.0002, + "loss": 1.5385, + "step": 1780 + }, + { + "epoch": 3.046808510638298, + "grad_norm": 0.5875462293624878, + "learning_rate": 0.0002, + "loss": 1.4012, + "step": 1790 + }, + { + "epoch": 3.0638297872340425, + "grad_norm": 0.5775138139724731, + "learning_rate": 0.0002, + "loss": 1.4727, + "step": 1800 + }, + { + "epoch": 3.0808510638297872, + "grad_norm": 0.5445981621742249, + "learning_rate": 0.0002, + "loss": 1.493, + "step": 1810 + }, + { + "epoch": 3.097872340425532, + "grad_norm": 0.6728862524032593, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 1820 + }, + { + "epoch": 3.1148936170212767, + "grad_norm": 0.6105490326881409, + "learning_rate": 0.0002, + "loss": 1.4303, + "step": 1830 + }, + { + "epoch": 3.1319148936170214, + "grad_norm": 0.5771165490150452, + "learning_rate": 0.0002, + "loss": 1.5214, + "step": 1840 + }, + { + "epoch": 3.148936170212766, + "grad_norm": 0.5778449773788452, + "learning_rate": 0.0002, + "loss": 1.4359, + "step": 1850 + }, + { + "epoch": 3.1659574468085108, + "grad_norm": 0.7141990661621094, + "learning_rate": 0.0002, + "loss": 1.4121, + "step": 1860 + }, + { + "epoch": 3.1829787234042555, + "grad_norm": 0.5882705450057983, + "learning_rate": 0.0002, + "loss": 1.4904, + "step": 1870 + }, + { + "epoch": 3.2, + "grad_norm": 0.5996195077896118, + "learning_rate": 0.0002, + "loss": 1.4941, + "step": 1880 + }, + { + "epoch": 3.217021276595745, + "grad_norm": 0.6121219396591187, + "learning_rate": 0.0002, + "loss": 1.4519, + "step": 1890 + }, + { + "epoch": 3.2340425531914896, + "grad_norm": 0.6402981281280518, + "learning_rate": 0.0002, + "loss": 1.4586, + "step": 1900 + }, + { + "epoch": 3.251063829787234, + "grad_norm": 0.6111783981323242, + "learning_rate": 0.0002, + "loss": 1.3766, + "step": 1910 + }, + { + "epoch": 3.2680851063829786, + "grad_norm": 0.6682435274124146, + "learning_rate": 0.0002, + "loss": 1.4863, + "step": 1920 + }, + { + "epoch": 3.2851063829787233, + "grad_norm": 0.6530760526657104, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 1930 + }, + { + "epoch": 3.302127659574468, + "grad_norm": 0.6481217741966248, + "learning_rate": 0.0002, + "loss": 1.4422, + "step": 1940 + }, + { + "epoch": 3.3191489361702127, + "grad_norm": 0.6270697116851807, + "learning_rate": 0.0002, + "loss": 1.5158, + "step": 1950 + }, + { + "epoch": 3.3361702127659574, + "grad_norm": 0.5924492478370667, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 1960 + }, + { + "epoch": 3.353191489361702, + "grad_norm": 0.5803806781768799, + "learning_rate": 0.0002, + "loss": 1.4578, + "step": 1970 + }, + { + "epoch": 3.370212765957447, + "grad_norm": 0.5754119157791138, + "learning_rate": 0.0002, + "loss": 1.4689, + "step": 1980 + }, + { + "epoch": 3.3872340425531915, + "grad_norm": 0.6717178821563721, + "learning_rate": 0.0002, + "loss": 1.4605, + "step": 1990 + }, + { + "epoch": 3.404255319148936, + "grad_norm": 0.5955582857131958, + "learning_rate": 0.0002, + "loss": 1.486, + "step": 2000 + }, + { + "epoch": 3.421276595744681, + "grad_norm": 0.6965329647064209, + "learning_rate": 0.0002, + "loss": 1.4445, + "step": 2010 + }, + { + "epoch": 3.4382978723404256, + "grad_norm": 0.6321573257446289, + "learning_rate": 0.0002, + "loss": 1.4543, + "step": 2020 + }, + { + "epoch": 3.4553191489361703, + "grad_norm": 0.5952608585357666, + "learning_rate": 0.0002, + "loss": 1.5383, + "step": 2030 + }, + { + "epoch": 3.472340425531915, + "grad_norm": 0.7718905806541443, + "learning_rate": 0.0002, + "loss": 1.4531, + "step": 2040 + }, + { + "epoch": 3.4893617021276597, + "grad_norm": 0.6850892305374146, + "learning_rate": 0.0002, + "loss": 1.4678, + "step": 2050 + }, + { + "epoch": 3.506382978723404, + "grad_norm": 0.5638895630836487, + "learning_rate": 0.0002, + "loss": 1.4956, + "step": 2060 + }, + { + "epoch": 3.523404255319149, + "grad_norm": 0.6148294806480408, + "learning_rate": 0.0002, + "loss": 1.4586, + "step": 2070 + }, + { + "epoch": 3.5404255319148934, + "grad_norm": 0.5895810723304749, + "learning_rate": 0.0002, + "loss": 1.4622, + "step": 2080 + }, + { + "epoch": 3.5574468085106385, + "grad_norm": 0.6377319693565369, + "learning_rate": 0.0002, + "loss": 1.4341, + "step": 2090 + }, + { + "epoch": 3.574468085106383, + "grad_norm": 0.6047691702842712, + "learning_rate": 0.0002, + "loss": 1.5056, + "step": 2100 + }, + { + "epoch": 3.5914893617021275, + "grad_norm": 0.6049593687057495, + "learning_rate": 0.0002, + "loss": 1.4748, + "step": 2110 + }, + { + "epoch": 3.608510638297872, + "grad_norm": 0.6358312368392944, + "learning_rate": 0.0002, + "loss": 1.391, + "step": 2120 + }, + { + "epoch": 3.625531914893617, + "grad_norm": 0.612119197845459, + "learning_rate": 0.0002, + "loss": 1.4419, + "step": 2130 + }, + { + "epoch": 3.6425531914893616, + "grad_norm": 0.6788054704666138, + "learning_rate": 0.0002, + "loss": 1.438, + "step": 2140 + }, + { + "epoch": 3.6595744680851063, + "grad_norm": 0.6191043853759766, + "learning_rate": 0.0002, + "loss": 1.4295, + "step": 2150 + }, + { + "epoch": 3.676595744680851, + "grad_norm": 0.6660051941871643, + "learning_rate": 0.0002, + "loss": 1.4383, + "step": 2160 + }, + { + "epoch": 3.6936170212765957, + "grad_norm": 0.652692973613739, + "learning_rate": 0.0002, + "loss": 1.4954, + "step": 2170 + }, + { + "epoch": 3.7106382978723405, + "grad_norm": 0.6123467087745667, + "learning_rate": 0.0002, + "loss": 1.5245, + "step": 2180 + }, + { + "epoch": 3.727659574468085, + "grad_norm": 0.640021562576294, + "learning_rate": 0.0002, + "loss": 1.4686, + "step": 2190 + }, + { + "epoch": 3.74468085106383, + "grad_norm": 0.6809179782867432, + "learning_rate": 0.0002, + "loss": 1.4277, + "step": 2200 + }, + { + "epoch": 3.7617021276595746, + "grad_norm": 0.5978420376777649, + "learning_rate": 0.0002, + "loss": 1.4705, + "step": 2210 + }, + { + "epoch": 3.7787234042553193, + "grad_norm": 0.7038803100585938, + "learning_rate": 0.0002, + "loss": 1.5559, + "step": 2220 + }, + { + "epoch": 3.795744680851064, + "grad_norm": 0.5324276089668274, + "learning_rate": 0.0002, + "loss": 1.4691, + "step": 2230 + }, + { + "epoch": 3.8127659574468087, + "grad_norm": 0.6264132857322693, + "learning_rate": 0.0002, + "loss": 1.4696, + "step": 2240 + }, + { + "epoch": 3.829787234042553, + "grad_norm": 0.6143888831138611, + "learning_rate": 0.0002, + "loss": 1.4856, + "step": 2250 + }, + { + "epoch": 3.846808510638298, + "grad_norm": 0.6338503360748291, + "learning_rate": 0.0002, + "loss": 1.535, + "step": 2260 + }, + { + "epoch": 3.8638297872340424, + "grad_norm": 0.556882381439209, + "learning_rate": 0.0002, + "loss": 1.456, + "step": 2270 + }, + { + "epoch": 3.8808510638297875, + "grad_norm": 0.6323680281639099, + "learning_rate": 0.0002, + "loss": 1.4701, + "step": 2280 + }, + { + "epoch": 3.8978723404255318, + "grad_norm": 0.7105869054794312, + "learning_rate": 0.0002, + "loss": 1.5333, + "step": 2290 + }, + { + "epoch": 3.9148936170212765, + "grad_norm": 0.825415849685669, + "learning_rate": 0.0002, + "loss": 1.4462, + "step": 2300 + }, + { + "epoch": 3.931914893617021, + "grad_norm": 0.6412091851234436, + "learning_rate": 0.0002, + "loss": 1.5023, + "step": 2310 + }, + { + "epoch": 3.948936170212766, + "grad_norm": 0.6286490559577942, + "learning_rate": 0.0002, + "loss": 1.3709, + "step": 2320 + }, + { + "epoch": 3.9659574468085106, + "grad_norm": 0.636021077632904, + "learning_rate": 0.0002, + "loss": 1.4693, + "step": 2330 + }, + { + "epoch": 3.9829787234042553, + "grad_norm": 0.6032362580299377, + "learning_rate": 0.0002, + "loss": 1.4265, + "step": 2340 + }, + { + "epoch": 4.0, + "grad_norm": 0.6497282385826111, + "learning_rate": 0.0002, + "loss": 1.377, + "step": 2350 + }, + { + "epoch": 4.0, + "eval_loss": 1.9081238508224487, + "eval_runtime": 106.6404, + "eval_samples_per_second": 4.829, + "eval_steps_per_second": 0.61, + "step": 2350 + }, + { + "epoch": 4.017021276595744, + "grad_norm": 0.6278848648071289, + "learning_rate": 0.0002, + "loss": 1.317, + "step": 2360 + }, + { + "epoch": 4.034042553191489, + "grad_norm": 0.8259812593460083, + "learning_rate": 0.0002, + "loss": 1.3229, + "step": 2370 + }, + { + "epoch": 4.051063829787234, + "grad_norm": 0.7269589304924011, + "learning_rate": 0.0002, + "loss": 1.2776, + "step": 2380 + }, + { + "epoch": 4.068085106382979, + "grad_norm": 0.7460662126541138, + "learning_rate": 0.0002, + "loss": 1.3668, + "step": 2390 + }, + { + "epoch": 4.085106382978723, + "grad_norm": 1.2362046241760254, + "learning_rate": 0.0002, + "loss": 1.3096, + "step": 2400 + }, + { + "epoch": 4.102127659574468, + "grad_norm": 0.7699568867683411, + "learning_rate": 0.0002, + "loss": 1.2906, + "step": 2410 + }, + { + "epoch": 4.1191489361702125, + "grad_norm": 0.8732489347457886, + "learning_rate": 0.0002, + "loss": 1.3005, + "step": 2420 + }, + { + "epoch": 4.136170212765958, + "grad_norm": 0.8331889510154724, + "learning_rate": 0.0002, + "loss": 1.2741, + "step": 2430 + }, + { + "epoch": 4.153191489361702, + "grad_norm": 0.6686427593231201, + "learning_rate": 0.0002, + "loss": 1.1861, + "step": 2440 + }, + { + "epoch": 4.170212765957447, + "grad_norm": 0.906380832195282, + "learning_rate": 0.0002, + "loss": 1.316, + "step": 2450 + }, + { + "epoch": 4.187234042553191, + "grad_norm": 0.7269753813743591, + "learning_rate": 0.0002, + "loss": 1.3134, + "step": 2460 + }, + { + "epoch": 4.2042553191489365, + "grad_norm": 0.8556067943572998, + "learning_rate": 0.0002, + "loss": 1.299, + "step": 2470 + }, + { + "epoch": 4.221276595744681, + "grad_norm": 0.7076917886734009, + "learning_rate": 0.0002, + "loss": 1.2935, + "step": 2480 + }, + { + "epoch": 4.238297872340426, + "grad_norm": 0.7596837282180786, + "learning_rate": 0.0002, + "loss": 1.2608, + "step": 2490 + }, + { + "epoch": 4.25531914893617, + "grad_norm": 0.7790552377700806, + "learning_rate": 0.0002, + "loss": 1.2747, + "step": 2500 + }, + { + "epoch": 4.272340425531915, + "grad_norm": 0.8205534219741821, + "learning_rate": 0.0002, + "loss": 1.3438, + "step": 2510 + }, + { + "epoch": 4.2893617021276595, + "grad_norm": 0.7892114520072937, + "learning_rate": 0.0002, + "loss": 1.3058, + "step": 2520 + }, + { + "epoch": 4.306382978723404, + "grad_norm": 0.8907270431518555, + "learning_rate": 0.0002, + "loss": 1.3662, + "step": 2530 + }, + { + "epoch": 4.323404255319149, + "grad_norm": 0.821794331073761, + "learning_rate": 0.0002, + "loss": 1.3168, + "step": 2540 + }, + { + "epoch": 4.340425531914893, + "grad_norm": 0.7305247783660889, + "learning_rate": 0.0002, + "loss": 1.2467, + "step": 2550 + }, + { + "epoch": 4.357446808510638, + "grad_norm": 0.8639982342720032, + "learning_rate": 0.0002, + "loss": 1.3446, + "step": 2560 + }, + { + "epoch": 4.374468085106383, + "grad_norm": 0.8883494138717651, + "learning_rate": 0.0002, + "loss": 1.3863, + "step": 2570 + }, + { + "epoch": 4.391489361702128, + "grad_norm": 0.7611730098724365, + "learning_rate": 0.0002, + "loss": 1.3693, + "step": 2580 + }, + { + "epoch": 4.408510638297872, + "grad_norm": 0.7793022394180298, + "learning_rate": 0.0002, + "loss": 1.2814, + "step": 2590 + }, + { + "epoch": 4.425531914893617, + "grad_norm": 0.979060173034668, + "learning_rate": 0.0002, + "loss": 1.3014, + "step": 2600 + }, + { + "epoch": 4.4425531914893615, + "grad_norm": 0.8320847749710083, + "learning_rate": 0.0002, + "loss": 1.3625, + "step": 2610 + }, + { + "epoch": 4.459574468085107, + "grad_norm": 0.7481992244720459, + "learning_rate": 0.0002, + "loss": 1.3362, + "step": 2620 + }, + { + "epoch": 4.476595744680851, + "grad_norm": 0.783770740032196, + "learning_rate": 0.0002, + "loss": 1.4037, + "step": 2630 + }, + { + "epoch": 4.493617021276596, + "grad_norm": 0.773295521736145, + "learning_rate": 0.0002, + "loss": 1.3049, + "step": 2640 + }, + { + "epoch": 4.51063829787234, + "grad_norm": 0.9206840991973877, + "learning_rate": 0.0002, + "loss": 1.2739, + "step": 2650 + }, + { + "epoch": 4.527659574468085, + "grad_norm": 0.8803266882896423, + "learning_rate": 0.0002, + "loss": 1.3248, + "step": 2660 + }, + { + "epoch": 4.54468085106383, + "grad_norm": 0.9315535426139832, + "learning_rate": 0.0002, + "loss": 1.2898, + "step": 2670 + }, + { + "epoch": 4.561702127659575, + "grad_norm": 0.8610678315162659, + "learning_rate": 0.0002, + "loss": 1.316, + "step": 2680 + }, + { + "epoch": 4.578723404255319, + "grad_norm": 0.7405551671981812, + "learning_rate": 0.0002, + "loss": 1.2633, + "step": 2690 + }, + { + "epoch": 4.595744680851064, + "grad_norm": 1.0238394737243652, + "learning_rate": 0.0002, + "loss": 1.3136, + "step": 2700 + }, + { + "epoch": 4.6127659574468085, + "grad_norm": 0.7814345955848694, + "learning_rate": 0.0002, + "loss": 1.4847, + "step": 2710 + }, + { + "epoch": 4.629787234042553, + "grad_norm": 0.8436329364776611, + "learning_rate": 0.0002, + "loss": 1.295, + "step": 2720 + }, + { + "epoch": 4.646808510638298, + "grad_norm": 0.727214515209198, + "learning_rate": 0.0002, + "loss": 1.3525, + "step": 2730 + }, + { + "epoch": 4.663829787234042, + "grad_norm": 0.8465878367424011, + "learning_rate": 0.0002, + "loss": 1.3878, + "step": 2740 + }, + { + "epoch": 4.680851063829787, + "grad_norm": 0.8218137621879578, + "learning_rate": 0.0002, + "loss": 1.278, + "step": 2750 + }, + { + "epoch": 4.697872340425532, + "grad_norm": 0.7900442481040955, + "learning_rate": 0.0002, + "loss": 1.3628, + "step": 2760 + }, + { + "epoch": 4.714893617021277, + "grad_norm": 0.8214074969291687, + "learning_rate": 0.0002, + "loss": 1.3494, + "step": 2770 + }, + { + "epoch": 4.731914893617021, + "grad_norm": 0.7509574890136719, + "learning_rate": 0.0002, + "loss": 1.3954, + "step": 2780 + }, + { + "epoch": 4.748936170212766, + "grad_norm": 0.7416139245033264, + "learning_rate": 0.0002, + "loss": 1.3693, + "step": 2790 + }, + { + "epoch": 4.76595744680851, + "grad_norm": 0.8629977107048035, + "learning_rate": 0.0002, + "loss": 1.3045, + "step": 2800 + }, + { + "epoch": 4.782978723404256, + "grad_norm": 0.8056505918502808, + "learning_rate": 0.0002, + "loss": 1.3164, + "step": 2810 + }, + { + "epoch": 4.8, + "grad_norm": 0.7705401182174683, + "learning_rate": 0.0002, + "loss": 1.3056, + "step": 2820 + }, + { + "epoch": 4.817021276595745, + "grad_norm": 1.0173288583755493, + "learning_rate": 0.0002, + "loss": 1.3771, + "step": 2830 + }, + { + "epoch": 4.834042553191489, + "grad_norm": 0.8375823497772217, + "learning_rate": 0.0002, + "loss": 1.3494, + "step": 2840 + }, + { + "epoch": 4.851063829787234, + "grad_norm": 0.857073187828064, + "learning_rate": 0.0002, + "loss": 1.3238, + "step": 2850 + }, + { + "epoch": 4.868085106382979, + "grad_norm": 0.8672189712524414, + "learning_rate": 0.0002, + "loss": 1.2964, + "step": 2860 + }, + { + "epoch": 4.885106382978723, + "grad_norm": 0.8599910140037537, + "learning_rate": 0.0002, + "loss": 1.3646, + "step": 2870 + }, + { + "epoch": 4.902127659574468, + "grad_norm": 0.8844674229621887, + "learning_rate": 0.0002, + "loss": 1.3575, + "step": 2880 + }, + { + "epoch": 4.919148936170213, + "grad_norm": 0.8246751427650452, + "learning_rate": 0.0002, + "loss": 1.285, + "step": 2890 + }, + { + "epoch": 4.9361702127659575, + "grad_norm": 0.8648163676261902, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 2900 + }, + { + "epoch": 4.953191489361702, + "grad_norm": 0.9477900266647339, + "learning_rate": 0.0002, + "loss": 1.2614, + "step": 2910 + }, + { + "epoch": 4.970212765957447, + "grad_norm": 0.8047965168952942, + "learning_rate": 0.0002, + "loss": 1.3519, + "step": 2920 + }, + { + "epoch": 4.987234042553191, + "grad_norm": 0.9872494339942932, + "learning_rate": 0.0002, + "loss": 1.3889, + "step": 2930 + }, + { + "epoch": 4.999148936170212, + "eval_loss": 1.9836769104003906, + "eval_runtime": 106.4655, + "eval_samples_per_second": 4.837, + "eval_steps_per_second": 0.611, + "step": 2937 + } + ], + "logging_steps": 10, + "max_steps": 4696, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.35940922671104e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..40bc5c2205bcd1402ce337f3a218ef4840b3870a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-2937/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:148f55f87c9f695bdf2cf6d54b37e690c06ff5da5e17bc3af6c1f44f90f45374 +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7543a8984c93e6e6caeddeed16da76ec50686507 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed7f43e2c90229ed92868704b8aa6da820857a23e5cd64af385140df449495fd +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc2bee3344b378a5131e1dd554321e095c480b83 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cc8d307d2adebfa873d2ccae3a904ba2efa88b9538f0af7c462c7953a948dd8 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..c28612b8b1cbce4dbea6099e6351f83a60c6a108 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fa809c071d9d92daa1155778e071d77ccd96468a160116f1eec384dba567098 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..bc3b841a2aeafc727229219c61d9e72f1b2c4e2a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54ad454f60c0c1d7a7b5b76a2e1303250f7be05a405d8e2e4750cee81f397310 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..33c80f3c17bf81a8a5e4451d1628acebe1d1d505 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/trainer_state.json @@ -0,0 +1,2545 @@ +{ + "best_metric": 1.8328146934509277, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 3525, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01702127659574468, + "grad_norm": 0.7596228122711182, + "learning_rate": 0.0002, + "loss": 2.5996, + "step": 10 + }, + { + "epoch": 0.03404255319148936, + "grad_norm": 0.4860903322696686, + "learning_rate": 0.0002, + "loss": 2.2941, + "step": 20 + }, + { + "epoch": 0.05106382978723404, + "grad_norm": 0.4953401982784271, + "learning_rate": 0.0002, + "loss": 2.0018, + "step": 30 + }, + { + "epoch": 0.06808510638297872, + "grad_norm": 0.5086901783943176, + "learning_rate": 0.0002, + "loss": 1.9318, + "step": 40 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 0.49050021171569824, + "learning_rate": 0.0002, + "loss": 1.8519, + "step": 50 + }, + { + "epoch": 0.10212765957446808, + "grad_norm": 0.4922358989715576, + "learning_rate": 0.0002, + "loss": 1.8786, + "step": 60 + }, + { + "epoch": 0.11914893617021277, + "grad_norm": 0.4621541202068329, + "learning_rate": 0.0002, + "loss": 1.8812, + "step": 70 + }, + { + "epoch": 0.13617021276595745, + "grad_norm": 0.4416729807853699, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 80 + }, + { + "epoch": 0.15319148936170213, + "grad_norm": 0.526258111000061, + "learning_rate": 0.0002, + "loss": 1.9298, + "step": 90 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 0.44022637605667114, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 100 + }, + { + "epoch": 0.18723404255319148, + "grad_norm": 0.4647711515426636, + "learning_rate": 0.0002, + "loss": 1.7681, + "step": 110 + }, + { + "epoch": 0.20425531914893616, + "grad_norm": 0.4136318564414978, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 120 + }, + { + "epoch": 0.22127659574468084, + "grad_norm": 0.39707672595977783, + "learning_rate": 0.0002, + "loss": 1.8491, + "step": 130 + }, + { + "epoch": 0.23829787234042554, + "grad_norm": 0.4478105306625366, + "learning_rate": 0.0002, + "loss": 1.8017, + "step": 140 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 0.4699741303920746, + "learning_rate": 0.0002, + "loss": 1.8067, + "step": 150 + }, + { + "epoch": 0.2723404255319149, + "grad_norm": 0.4568363130092621, + "learning_rate": 0.0002, + "loss": 1.8161, + "step": 160 + }, + { + "epoch": 0.28936170212765955, + "grad_norm": 0.45078757405281067, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 170 + }, + { + "epoch": 0.30638297872340425, + "grad_norm": 0.4127245843410492, + "learning_rate": 0.0002, + "loss": 1.8115, + "step": 180 + }, + { + "epoch": 0.32340425531914896, + "grad_norm": 0.4042493402957916, + "learning_rate": 0.0002, + "loss": 1.7707, + "step": 190 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 0.401487797498703, + "learning_rate": 0.0002, + "loss": 1.7807, + "step": 200 + }, + { + "epoch": 0.3574468085106383, + "grad_norm": 0.3959457576274872, + "learning_rate": 0.0002, + "loss": 1.8355, + "step": 210 + }, + { + "epoch": 0.37446808510638296, + "grad_norm": 0.39865636825561523, + "learning_rate": 0.0002, + "loss": 1.7798, + "step": 220 + }, + { + "epoch": 0.39148936170212767, + "grad_norm": 0.7225169539451599, + "learning_rate": 0.0002, + "loss": 1.9012, + "step": 230 + }, + { + "epoch": 0.4085106382978723, + "grad_norm": 0.412801593542099, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 240 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 0.40951448678970337, + "learning_rate": 0.0002, + "loss": 1.8523, + "step": 250 + }, + { + "epoch": 0.4425531914893617, + "grad_norm": 0.42788130044937134, + "learning_rate": 0.0002, + "loss": 1.7283, + "step": 260 + }, + { + "epoch": 0.4595744680851064, + "grad_norm": 0.41069576144218445, + "learning_rate": 0.0002, + "loss": 1.8051, + "step": 270 + }, + { + "epoch": 0.4765957446808511, + "grad_norm": 0.3745323717594147, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 280 + }, + { + "epoch": 0.49361702127659574, + "grad_norm": 0.3771323263645172, + "learning_rate": 0.0002, + "loss": 1.8484, + "step": 290 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 0.34368929266929626, + "learning_rate": 0.0002, + "loss": 1.7417, + "step": 300 + }, + { + "epoch": 0.5276595744680851, + "grad_norm": 0.4299296736717224, + "learning_rate": 0.0002, + "loss": 1.7394, + "step": 310 + }, + { + "epoch": 0.5446808510638298, + "grad_norm": 0.4133922755718231, + "learning_rate": 0.0002, + "loss": 1.8255, + "step": 320 + }, + { + "epoch": 0.5617021276595745, + "grad_norm": 0.3984859585762024, + "learning_rate": 0.0002, + "loss": 1.7043, + "step": 330 + }, + { + "epoch": 0.5787234042553191, + "grad_norm": 0.3822788894176483, + "learning_rate": 0.0002, + "loss": 1.7873, + "step": 340 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 0.4550061821937561, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 350 + }, + { + "epoch": 0.6127659574468085, + "grad_norm": 0.36571192741394043, + "learning_rate": 0.0002, + "loss": 1.7179, + "step": 360 + }, + { + "epoch": 0.6297872340425532, + "grad_norm": 0.32942914962768555, + "learning_rate": 0.0002, + "loss": 1.8196, + "step": 370 + }, + { + "epoch": 0.6468085106382979, + "grad_norm": 0.39299526810646057, + "learning_rate": 0.0002, + "loss": 1.7118, + "step": 380 + }, + { + "epoch": 0.6638297872340425, + "grad_norm": 0.3817657232284546, + "learning_rate": 0.0002, + "loss": 1.8179, + "step": 390 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 0.3650810122489929, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 400 + }, + { + "epoch": 0.6978723404255319, + "grad_norm": 0.3736686408519745, + "learning_rate": 0.0002, + "loss": 1.8026, + "step": 410 + }, + { + "epoch": 0.7148936170212766, + "grad_norm": 0.45680564641952515, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 420 + }, + { + "epoch": 0.7319148936170212, + "grad_norm": 0.4154510200023651, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 430 + }, + { + "epoch": 0.7489361702127659, + "grad_norm": 0.3701167106628418, + "learning_rate": 0.0002, + "loss": 1.7801, + "step": 440 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 0.3869531750679016, + "learning_rate": 0.0002, + "loss": 1.7689, + "step": 450 + }, + { + "epoch": 0.7829787234042553, + "grad_norm": 0.4391495883464813, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 460 + }, + { + "epoch": 0.8, + "grad_norm": 0.39652755856513977, + "learning_rate": 0.0002, + "loss": 1.7534, + "step": 470 + }, + { + "epoch": 0.8170212765957446, + "grad_norm": 0.4096752107143402, + "learning_rate": 0.0002, + "loss": 1.8076, + "step": 480 + }, + { + "epoch": 0.8340425531914893, + "grad_norm": 0.3857504427433014, + "learning_rate": 0.0002, + "loss": 1.796, + "step": 490 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 0.4105374217033386, + "learning_rate": 0.0002, + "loss": 1.7379, + "step": 500 + }, + { + "epoch": 0.8680851063829788, + "grad_norm": 0.3723328113555908, + "learning_rate": 0.0002, + "loss": 1.6391, + "step": 510 + }, + { + "epoch": 0.8851063829787233, + "grad_norm": 0.36099690198898315, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 520 + }, + { + "epoch": 0.902127659574468, + "grad_norm": 0.3715187907218933, + "learning_rate": 0.0002, + "loss": 1.7632, + "step": 530 + }, + { + "epoch": 0.9191489361702128, + "grad_norm": 0.4932813048362732, + "learning_rate": 0.0002, + "loss": 1.7004, + "step": 540 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 0.3493495285511017, + "learning_rate": 0.0002, + "loss": 1.679, + "step": 550 + }, + { + "epoch": 0.9531914893617022, + "grad_norm": 0.3598061800003052, + "learning_rate": 0.0002, + "loss": 1.7758, + "step": 560 + }, + { + "epoch": 0.9702127659574468, + "grad_norm": 0.3521560728549957, + "learning_rate": 0.0002, + "loss": 1.7686, + "step": 570 + }, + { + "epoch": 0.9872340425531915, + "grad_norm": 0.34150034189224243, + "learning_rate": 0.0002, + "loss": 1.7391, + "step": 580 + }, + { + "epoch": 0.9991489361702127, + "eval_loss": 1.8388911485671997, + "eval_runtime": 106.6788, + "eval_samples_per_second": 4.828, + "eval_steps_per_second": 0.609, + "step": 587 + }, + { + "epoch": 1.004255319148936, + "grad_norm": 0.33556103706359863, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 590 + }, + { + "epoch": 1.0212765957446808, + "grad_norm": 0.4333398640155792, + "learning_rate": 0.0002, + "loss": 1.8346, + "step": 600 + }, + { + "epoch": 1.0382978723404255, + "grad_norm": 0.38488736748695374, + "learning_rate": 0.0002, + "loss": 1.6656, + "step": 610 + }, + { + "epoch": 1.0553191489361702, + "grad_norm": 0.44454529881477356, + "learning_rate": 0.0002, + "loss": 1.7778, + "step": 620 + }, + { + "epoch": 1.0723404255319149, + "grad_norm": 0.3735603392124176, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 630 + }, + { + "epoch": 1.0893617021276596, + "grad_norm": 0.38912704586982727, + "learning_rate": 0.0002, + "loss": 1.74, + "step": 640 + }, + { + "epoch": 1.1063829787234043, + "grad_norm": 0.4411826431751251, + "learning_rate": 0.0002, + "loss": 1.607, + "step": 650 + }, + { + "epoch": 1.123404255319149, + "grad_norm": 0.4163050353527069, + "learning_rate": 0.0002, + "loss": 1.6901, + "step": 660 + }, + { + "epoch": 1.1404255319148937, + "grad_norm": 0.4187192916870117, + "learning_rate": 0.0002, + "loss": 1.6722, + "step": 670 + }, + { + "epoch": 1.1574468085106382, + "grad_norm": 0.3797093629837036, + "learning_rate": 0.0002, + "loss": 1.7061, + "step": 680 + }, + { + "epoch": 1.174468085106383, + "grad_norm": 0.4210026264190674, + "learning_rate": 0.0002, + "loss": 1.6409, + "step": 690 + }, + { + "epoch": 1.1914893617021276, + "grad_norm": 0.4701998829841614, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 700 + }, + { + "epoch": 1.2085106382978723, + "grad_norm": 0.6331578493118286, + "learning_rate": 0.0002, + "loss": 1.6773, + "step": 710 + }, + { + "epoch": 1.225531914893617, + "grad_norm": 0.41908255219459534, + "learning_rate": 0.0002, + "loss": 1.6776, + "step": 720 + }, + { + "epoch": 1.2425531914893617, + "grad_norm": 0.36158403754234314, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 730 + }, + { + "epoch": 1.2595744680851064, + "grad_norm": 0.387300580739975, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 740 + }, + { + "epoch": 1.2765957446808511, + "grad_norm": 0.38899728655815125, + "learning_rate": 0.0002, + "loss": 1.7089, + "step": 750 + }, + { + "epoch": 1.2936170212765958, + "grad_norm": 0.4549255073070526, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 760 + }, + { + "epoch": 1.3106382978723405, + "grad_norm": 0.4052349328994751, + "learning_rate": 0.0002, + "loss": 1.6754, + "step": 770 + }, + { + "epoch": 1.327659574468085, + "grad_norm": 0.38934215903282166, + "learning_rate": 0.0002, + "loss": 1.6483, + "step": 780 + }, + { + "epoch": 1.3446808510638297, + "grad_norm": 0.38688382506370544, + "learning_rate": 0.0002, + "loss": 1.7813, + "step": 790 + }, + { + "epoch": 1.3617021276595744, + "grad_norm": 0.3825705051422119, + "learning_rate": 0.0002, + "loss": 1.6374, + "step": 800 + }, + { + "epoch": 1.3787234042553191, + "grad_norm": 0.37331756949424744, + "learning_rate": 0.0002, + "loss": 1.655, + "step": 810 + }, + { + "epoch": 1.3957446808510638, + "grad_norm": 0.38826408982276917, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 820 + }, + { + "epoch": 1.4127659574468086, + "grad_norm": 0.4213569164276123, + "learning_rate": 0.0002, + "loss": 1.6107, + "step": 830 + }, + { + "epoch": 1.4297872340425533, + "grad_norm": 0.3976684808731079, + "learning_rate": 0.0002, + "loss": 1.6792, + "step": 840 + }, + { + "epoch": 1.4468085106382977, + "grad_norm": 0.390009343624115, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 850 + }, + { + "epoch": 1.4638297872340424, + "grad_norm": 0.4462052583694458, + "learning_rate": 0.0002, + "loss": 1.7489, + "step": 860 + }, + { + "epoch": 1.4808510638297872, + "grad_norm": 0.42129236459732056, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 870 + }, + { + "epoch": 1.4978723404255319, + "grad_norm": 0.41489893198013306, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 880 + }, + { + "epoch": 1.5148936170212766, + "grad_norm": 0.41451677680015564, + "learning_rate": 0.0002, + "loss": 1.7129, + "step": 890 + }, + { + "epoch": 1.5319148936170213, + "grad_norm": 0.4477299749851227, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 900 + }, + { + "epoch": 1.548936170212766, + "grad_norm": 0.38476648926734924, + "learning_rate": 0.0002, + "loss": 1.6543, + "step": 910 + }, + { + "epoch": 1.5659574468085107, + "grad_norm": 0.42755743861198425, + "learning_rate": 0.0002, + "loss": 1.7103, + "step": 920 + }, + { + "epoch": 1.5829787234042554, + "grad_norm": 0.39372023940086365, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 930 + }, + { + "epoch": 1.6, + "grad_norm": 0.42778754234313965, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 940 + }, + { + "epoch": 1.6170212765957448, + "grad_norm": 0.4217268228530884, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 950 + }, + { + "epoch": 1.6340425531914895, + "grad_norm": 0.40452107787132263, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 960 + }, + { + "epoch": 1.6510638297872342, + "grad_norm": 0.4259980022907257, + "learning_rate": 0.0002, + "loss": 1.6734, + "step": 970 + }, + { + "epoch": 1.6680851063829787, + "grad_norm": 0.4089849591255188, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 980 + }, + { + "epoch": 1.6851063829787234, + "grad_norm": 0.38276049494743347, + "learning_rate": 0.0002, + "loss": 1.6588, + "step": 990 + }, + { + "epoch": 1.702127659574468, + "grad_norm": 0.40361565351486206, + "learning_rate": 0.0002, + "loss": 1.6846, + "step": 1000 + }, + { + "epoch": 1.7191489361702128, + "grad_norm": 0.3537807762622833, + "learning_rate": 0.0002, + "loss": 1.698, + "step": 1010 + }, + { + "epoch": 1.7361702127659573, + "grad_norm": 0.40288347005844116, + "learning_rate": 0.0002, + "loss": 1.6001, + "step": 1020 + }, + { + "epoch": 1.753191489361702, + "grad_norm": 0.4003616273403168, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1030 + }, + { + "epoch": 1.7702127659574467, + "grad_norm": 0.3931669592857361, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 1040 + }, + { + "epoch": 1.7872340425531914, + "grad_norm": 0.4001635015010834, + "learning_rate": 0.0002, + "loss": 1.6814, + "step": 1050 + }, + { + "epoch": 1.804255319148936, + "grad_norm": 0.4139048457145691, + "learning_rate": 0.0002, + "loss": 1.6158, + "step": 1060 + }, + { + "epoch": 1.8212765957446808, + "grad_norm": 0.5044458508491516, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 1070 + }, + { + "epoch": 1.8382978723404255, + "grad_norm": 0.4827095568180084, + "learning_rate": 0.0002, + "loss": 1.7317, + "step": 1080 + }, + { + "epoch": 1.8553191489361702, + "grad_norm": 0.3750515282154083, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 1090 + }, + { + "epoch": 1.872340425531915, + "grad_norm": 0.4024597704410553, + "learning_rate": 0.0002, + "loss": 1.653, + "step": 1100 + }, + { + "epoch": 1.8893617021276596, + "grad_norm": 0.36747241020202637, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1110 + }, + { + "epoch": 1.9063829787234043, + "grad_norm": 0.41397711634635925, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 1120 + }, + { + "epoch": 1.923404255319149, + "grad_norm": 0.3960763216018677, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 1130 + }, + { + "epoch": 1.9404255319148938, + "grad_norm": 0.4533233344554901, + "learning_rate": 0.0002, + "loss": 1.6857, + "step": 1140 + }, + { + "epoch": 1.9574468085106385, + "grad_norm": 0.38433438539505005, + "learning_rate": 0.0002, + "loss": 1.7168, + "step": 1150 + }, + { + "epoch": 1.974468085106383, + "grad_norm": 0.3648812174797058, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 1160 + }, + { + "epoch": 1.9914893617021276, + "grad_norm": 0.3887176215648651, + "learning_rate": 0.0002, + "loss": 1.6521, + "step": 1170 + }, + { + "epoch": 2.0, + "eval_loss": 1.8328146934509277, + "eval_runtime": 107.2842, + "eval_samples_per_second": 4.8, + "eval_steps_per_second": 0.606, + "step": 1175 + }, + { + "epoch": 2.008510638297872, + "grad_norm": 0.40444880723953247, + "learning_rate": 0.0002, + "loss": 1.6184, + "step": 1180 + }, + { + "epoch": 2.025531914893617, + "grad_norm": 0.3997816741466522, + "learning_rate": 0.0002, + "loss": 1.5221, + "step": 1190 + }, + { + "epoch": 2.0425531914893615, + "grad_norm": 0.4516718089580536, + "learning_rate": 0.0002, + "loss": 1.6022, + "step": 1200 + }, + { + "epoch": 2.0595744680851062, + "grad_norm": 0.6645553708076477, + "learning_rate": 0.0002, + "loss": 1.5458, + "step": 1210 + }, + { + "epoch": 2.076595744680851, + "grad_norm": 0.4181990921497345, + "learning_rate": 0.0002, + "loss": 1.5985, + "step": 1220 + }, + { + "epoch": 2.0936170212765957, + "grad_norm": 0.45681431889533997, + "learning_rate": 0.0002, + "loss": 1.5768, + "step": 1230 + }, + { + "epoch": 2.1106382978723404, + "grad_norm": 0.48914700746536255, + "learning_rate": 0.0002, + "loss": 1.5598, + "step": 1240 + }, + { + "epoch": 2.127659574468085, + "grad_norm": 0.43265485763549805, + "learning_rate": 0.0002, + "loss": 1.6031, + "step": 1250 + }, + { + "epoch": 2.1446808510638298, + "grad_norm": 0.4641207754611969, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 1260 + }, + { + "epoch": 2.1617021276595745, + "grad_norm": 0.4840783476829529, + "learning_rate": 0.0002, + "loss": 1.5686, + "step": 1270 + }, + { + "epoch": 2.178723404255319, + "grad_norm": 0.4974595308303833, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 1280 + }, + { + "epoch": 2.195744680851064, + "grad_norm": 0.5133475661277771, + "learning_rate": 0.0002, + "loss": 1.5512, + "step": 1290 + }, + { + "epoch": 2.2127659574468086, + "grad_norm": 0.5030052065849304, + "learning_rate": 0.0002, + "loss": 1.6467, + "step": 1300 + }, + { + "epoch": 2.2297872340425533, + "grad_norm": 0.46602481603622437, + "learning_rate": 0.0002, + "loss": 1.6845, + "step": 1310 + }, + { + "epoch": 2.246808510638298, + "grad_norm": 0.43662378191947937, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 1320 + }, + { + "epoch": 2.2638297872340427, + "grad_norm": 0.5137454867362976, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 1330 + }, + { + "epoch": 2.2808510638297874, + "grad_norm": 0.4750335216522217, + "learning_rate": 0.0002, + "loss": 1.5702, + "step": 1340 + }, + { + "epoch": 2.297872340425532, + "grad_norm": 0.43691426515579224, + "learning_rate": 0.0002, + "loss": 1.6479, + "step": 1350 + }, + { + "epoch": 2.3148936170212764, + "grad_norm": 0.49752047657966614, + "learning_rate": 0.0002, + "loss": 1.5561, + "step": 1360 + }, + { + "epoch": 2.331914893617021, + "grad_norm": 0.45101815462112427, + "learning_rate": 0.0002, + "loss": 1.5789, + "step": 1370 + }, + { + "epoch": 2.348936170212766, + "grad_norm": 0.4427817761898041, + "learning_rate": 0.0002, + "loss": 1.5858, + "step": 1380 + }, + { + "epoch": 2.3659574468085105, + "grad_norm": 0.4802311062812805, + "learning_rate": 0.0002, + "loss": 1.5537, + "step": 1390 + }, + { + "epoch": 2.382978723404255, + "grad_norm": 0.4512513279914856, + "learning_rate": 0.0002, + "loss": 1.5846, + "step": 1400 + }, + { + "epoch": 2.4, + "grad_norm": 0.4878857135772705, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 1410 + }, + { + "epoch": 2.4170212765957446, + "grad_norm": 0.4741315543651581, + "learning_rate": 0.0002, + "loss": 1.5781, + "step": 1420 + }, + { + "epoch": 2.4340425531914893, + "grad_norm": 0.4770931601524353, + "learning_rate": 0.0002, + "loss": 1.5595, + "step": 1430 + }, + { + "epoch": 2.451063829787234, + "grad_norm": 0.5124667286872864, + "learning_rate": 0.0002, + "loss": 1.5336, + "step": 1440 + }, + { + "epoch": 2.4680851063829787, + "grad_norm": 0.45264801383018494, + "learning_rate": 0.0002, + "loss": 1.5811, + "step": 1450 + }, + { + "epoch": 2.4851063829787234, + "grad_norm": 0.5456924438476562, + "learning_rate": 0.0002, + "loss": 1.604, + "step": 1460 + }, + { + "epoch": 2.502127659574468, + "grad_norm": 0.44656285643577576, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 1470 + }, + { + "epoch": 2.519148936170213, + "grad_norm": 0.5939419865608215, + "learning_rate": 0.0002, + "loss": 1.6336, + "step": 1480 + }, + { + "epoch": 2.5361702127659576, + "grad_norm": 0.47853362560272217, + "learning_rate": 0.0002, + "loss": 1.5481, + "step": 1490 + }, + { + "epoch": 2.5531914893617023, + "grad_norm": 0.47643396258354187, + "learning_rate": 0.0002, + "loss": 1.6543, + "step": 1500 + }, + { + "epoch": 2.570212765957447, + "grad_norm": 0.4939501881599426, + "learning_rate": 0.0002, + "loss": 1.5085, + "step": 1510 + }, + { + "epoch": 2.5872340425531917, + "grad_norm": 0.502055287361145, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 1520 + }, + { + "epoch": 2.604255319148936, + "grad_norm": 0.463250994682312, + "learning_rate": 0.0002, + "loss": 1.6519, + "step": 1530 + }, + { + "epoch": 2.621276595744681, + "grad_norm": 0.4761098623275757, + "learning_rate": 0.0002, + "loss": 1.5698, + "step": 1540 + }, + { + "epoch": 2.6382978723404253, + "grad_norm": 0.4687299132347107, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 1550 + }, + { + "epoch": 2.65531914893617, + "grad_norm": 0.5536078810691833, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 1560 + }, + { + "epoch": 2.6723404255319148, + "grad_norm": 0.581320583820343, + "learning_rate": 0.0002, + "loss": 1.586, + "step": 1570 + }, + { + "epoch": 2.6893617021276595, + "grad_norm": 0.45952868461608887, + "learning_rate": 0.0002, + "loss": 1.508, + "step": 1580 + }, + { + "epoch": 2.706382978723404, + "grad_norm": 0.4602586328983307, + "learning_rate": 0.0002, + "loss": 1.5905, + "step": 1590 + }, + { + "epoch": 2.723404255319149, + "grad_norm": 0.5276554226875305, + "learning_rate": 0.0002, + "loss": 1.6008, + "step": 1600 + }, + { + "epoch": 2.7404255319148936, + "grad_norm": 0.5750249624252319, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 1610 + }, + { + "epoch": 2.7574468085106383, + "grad_norm": 0.468723863363266, + "learning_rate": 0.0002, + "loss": 1.5506, + "step": 1620 + }, + { + "epoch": 2.774468085106383, + "grad_norm": 0.44649943709373474, + "learning_rate": 0.0002, + "loss": 1.4499, + "step": 1630 + }, + { + "epoch": 2.7914893617021277, + "grad_norm": 0.5097237825393677, + "learning_rate": 0.0002, + "loss": 1.5106, + "step": 1640 + }, + { + "epoch": 2.8085106382978724, + "grad_norm": 0.46384191513061523, + "learning_rate": 0.0002, + "loss": 1.5948, + "step": 1650 + }, + { + "epoch": 2.825531914893617, + "grad_norm": 0.4885474443435669, + "learning_rate": 0.0002, + "loss": 1.5828, + "step": 1660 + }, + { + "epoch": 2.842553191489362, + "grad_norm": 0.45621681213378906, + "learning_rate": 0.0002, + "loss": 1.5675, + "step": 1670 + }, + { + "epoch": 2.8595744680851065, + "grad_norm": 0.4797150194644928, + "learning_rate": 0.0002, + "loss": 1.6042, + "step": 1680 + }, + { + "epoch": 2.876595744680851, + "grad_norm": 0.5142032504081726, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 1690 + }, + { + "epoch": 2.8936170212765955, + "grad_norm": 0.48939862847328186, + "learning_rate": 0.0002, + "loss": 1.5984, + "step": 1700 + }, + { + "epoch": 2.9106382978723406, + "grad_norm": 0.4575578272342682, + "learning_rate": 0.0002, + "loss": 1.6333, + "step": 1710 + }, + { + "epoch": 2.927659574468085, + "grad_norm": 0.5589063763618469, + "learning_rate": 0.0002, + "loss": 1.5396, + "step": 1720 + }, + { + "epoch": 2.94468085106383, + "grad_norm": 0.48508813977241516, + "learning_rate": 0.0002, + "loss": 1.6096, + "step": 1730 + }, + { + "epoch": 2.9617021276595743, + "grad_norm": 0.42786726355552673, + "learning_rate": 0.0002, + "loss": 1.5686, + "step": 1740 + }, + { + "epoch": 2.978723404255319, + "grad_norm": 0.5598229765892029, + "learning_rate": 0.0002, + "loss": 1.5555, + "step": 1750 + }, + { + "epoch": 2.9957446808510637, + "grad_norm": 0.4779253602027893, + "learning_rate": 0.0002, + "loss": 1.5035, + "step": 1760 + }, + { + "epoch": 2.999148936170213, + "eval_loss": 1.8543579578399658, + "eval_runtime": 107.2363, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 0.606, + "step": 1762 + }, + { + "epoch": 3.0127659574468084, + "grad_norm": 0.48810940980911255, + "learning_rate": 0.0002, + "loss": 1.4767, + "step": 1770 + }, + { + "epoch": 3.029787234042553, + "grad_norm": 0.6194920539855957, + "learning_rate": 0.0002, + "loss": 1.5385, + "step": 1780 + }, + { + "epoch": 3.046808510638298, + "grad_norm": 0.5875462293624878, + "learning_rate": 0.0002, + "loss": 1.4012, + "step": 1790 + }, + { + "epoch": 3.0638297872340425, + "grad_norm": 0.5775138139724731, + "learning_rate": 0.0002, + "loss": 1.4727, + "step": 1800 + }, + { + "epoch": 3.0808510638297872, + "grad_norm": 0.5445981621742249, + "learning_rate": 0.0002, + "loss": 1.493, + "step": 1810 + }, + { + "epoch": 3.097872340425532, + "grad_norm": 0.6728862524032593, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 1820 + }, + { + "epoch": 3.1148936170212767, + "grad_norm": 0.6105490326881409, + "learning_rate": 0.0002, + "loss": 1.4303, + "step": 1830 + }, + { + "epoch": 3.1319148936170214, + "grad_norm": 0.5771165490150452, + "learning_rate": 0.0002, + "loss": 1.5214, + "step": 1840 + }, + { + "epoch": 3.148936170212766, + "grad_norm": 0.5778449773788452, + "learning_rate": 0.0002, + "loss": 1.4359, + "step": 1850 + }, + { + "epoch": 3.1659574468085108, + "grad_norm": 0.7141990661621094, + "learning_rate": 0.0002, + "loss": 1.4121, + "step": 1860 + }, + { + "epoch": 3.1829787234042555, + "grad_norm": 0.5882705450057983, + "learning_rate": 0.0002, + "loss": 1.4904, + "step": 1870 + }, + { + "epoch": 3.2, + "grad_norm": 0.5996195077896118, + "learning_rate": 0.0002, + "loss": 1.4941, + "step": 1880 + }, + { + "epoch": 3.217021276595745, + "grad_norm": 0.6121219396591187, + "learning_rate": 0.0002, + "loss": 1.4519, + "step": 1890 + }, + { + "epoch": 3.2340425531914896, + "grad_norm": 0.6402981281280518, + "learning_rate": 0.0002, + "loss": 1.4586, + "step": 1900 + }, + { + "epoch": 3.251063829787234, + "grad_norm": 0.6111783981323242, + "learning_rate": 0.0002, + "loss": 1.3766, + "step": 1910 + }, + { + "epoch": 3.2680851063829786, + "grad_norm": 0.6682435274124146, + "learning_rate": 0.0002, + "loss": 1.4863, + "step": 1920 + }, + { + "epoch": 3.2851063829787233, + "grad_norm": 0.6530760526657104, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 1930 + }, + { + "epoch": 3.302127659574468, + "grad_norm": 0.6481217741966248, + "learning_rate": 0.0002, + "loss": 1.4422, + "step": 1940 + }, + { + "epoch": 3.3191489361702127, + "grad_norm": 0.6270697116851807, + "learning_rate": 0.0002, + "loss": 1.5158, + "step": 1950 + }, + { + "epoch": 3.3361702127659574, + "grad_norm": 0.5924492478370667, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 1960 + }, + { + "epoch": 3.353191489361702, + "grad_norm": 0.5803806781768799, + "learning_rate": 0.0002, + "loss": 1.4578, + "step": 1970 + }, + { + "epoch": 3.370212765957447, + "grad_norm": 0.5754119157791138, + "learning_rate": 0.0002, + "loss": 1.4689, + "step": 1980 + }, + { + "epoch": 3.3872340425531915, + "grad_norm": 0.6717178821563721, + "learning_rate": 0.0002, + "loss": 1.4605, + "step": 1990 + }, + { + "epoch": 3.404255319148936, + "grad_norm": 0.5955582857131958, + "learning_rate": 0.0002, + "loss": 1.486, + "step": 2000 + }, + { + "epoch": 3.421276595744681, + "grad_norm": 0.6965329647064209, + "learning_rate": 0.0002, + "loss": 1.4445, + "step": 2010 + }, + { + "epoch": 3.4382978723404256, + "grad_norm": 0.6321573257446289, + "learning_rate": 0.0002, + "loss": 1.4543, + "step": 2020 + }, + { + "epoch": 3.4553191489361703, + "grad_norm": 0.5952608585357666, + "learning_rate": 0.0002, + "loss": 1.5383, + "step": 2030 + }, + { + "epoch": 3.472340425531915, + "grad_norm": 0.7718905806541443, + "learning_rate": 0.0002, + "loss": 1.4531, + "step": 2040 + }, + { + "epoch": 3.4893617021276597, + "grad_norm": 0.6850892305374146, + "learning_rate": 0.0002, + "loss": 1.4678, + "step": 2050 + }, + { + "epoch": 3.506382978723404, + "grad_norm": 0.5638895630836487, + "learning_rate": 0.0002, + "loss": 1.4956, + "step": 2060 + }, + { + "epoch": 3.523404255319149, + "grad_norm": 0.6148294806480408, + "learning_rate": 0.0002, + "loss": 1.4586, + "step": 2070 + }, + { + "epoch": 3.5404255319148934, + "grad_norm": 0.5895810723304749, + "learning_rate": 0.0002, + "loss": 1.4622, + "step": 2080 + }, + { + "epoch": 3.5574468085106385, + "grad_norm": 0.6377319693565369, + "learning_rate": 0.0002, + "loss": 1.4341, + "step": 2090 + }, + { + "epoch": 3.574468085106383, + "grad_norm": 0.6047691702842712, + "learning_rate": 0.0002, + "loss": 1.5056, + "step": 2100 + }, + { + "epoch": 3.5914893617021275, + "grad_norm": 0.6049593687057495, + "learning_rate": 0.0002, + "loss": 1.4748, + "step": 2110 + }, + { + "epoch": 3.608510638297872, + "grad_norm": 0.6358312368392944, + "learning_rate": 0.0002, + "loss": 1.391, + "step": 2120 + }, + { + "epoch": 3.625531914893617, + "grad_norm": 0.612119197845459, + "learning_rate": 0.0002, + "loss": 1.4419, + "step": 2130 + }, + { + "epoch": 3.6425531914893616, + "grad_norm": 0.6788054704666138, + "learning_rate": 0.0002, + "loss": 1.438, + "step": 2140 + }, + { + "epoch": 3.6595744680851063, + "grad_norm": 0.6191043853759766, + "learning_rate": 0.0002, + "loss": 1.4295, + "step": 2150 + }, + { + "epoch": 3.676595744680851, + "grad_norm": 0.6660051941871643, + "learning_rate": 0.0002, + "loss": 1.4383, + "step": 2160 + }, + { + "epoch": 3.6936170212765957, + "grad_norm": 0.652692973613739, + "learning_rate": 0.0002, + "loss": 1.4954, + "step": 2170 + }, + { + "epoch": 3.7106382978723405, + "grad_norm": 0.6123467087745667, + "learning_rate": 0.0002, + "loss": 1.5245, + "step": 2180 + }, + { + "epoch": 3.727659574468085, + "grad_norm": 0.640021562576294, + "learning_rate": 0.0002, + "loss": 1.4686, + "step": 2190 + }, + { + "epoch": 3.74468085106383, + "grad_norm": 0.6809179782867432, + "learning_rate": 0.0002, + "loss": 1.4277, + "step": 2200 + }, + { + "epoch": 3.7617021276595746, + "grad_norm": 0.5978420376777649, + "learning_rate": 0.0002, + "loss": 1.4705, + "step": 2210 + }, + { + "epoch": 3.7787234042553193, + "grad_norm": 0.7038803100585938, + "learning_rate": 0.0002, + "loss": 1.5559, + "step": 2220 + }, + { + "epoch": 3.795744680851064, + "grad_norm": 0.5324276089668274, + "learning_rate": 0.0002, + "loss": 1.4691, + "step": 2230 + }, + { + "epoch": 3.8127659574468087, + "grad_norm": 0.6264132857322693, + "learning_rate": 0.0002, + "loss": 1.4696, + "step": 2240 + }, + { + "epoch": 3.829787234042553, + "grad_norm": 0.6143888831138611, + "learning_rate": 0.0002, + "loss": 1.4856, + "step": 2250 + }, + { + "epoch": 3.846808510638298, + "grad_norm": 0.6338503360748291, + "learning_rate": 0.0002, + "loss": 1.535, + "step": 2260 + }, + { + "epoch": 3.8638297872340424, + "grad_norm": 0.556882381439209, + "learning_rate": 0.0002, + "loss": 1.456, + "step": 2270 + }, + { + "epoch": 3.8808510638297875, + "grad_norm": 0.6323680281639099, + "learning_rate": 0.0002, + "loss": 1.4701, + "step": 2280 + }, + { + "epoch": 3.8978723404255318, + "grad_norm": 0.7105869054794312, + "learning_rate": 0.0002, + "loss": 1.5333, + "step": 2290 + }, + { + "epoch": 3.9148936170212765, + "grad_norm": 0.825415849685669, + "learning_rate": 0.0002, + "loss": 1.4462, + "step": 2300 + }, + { + "epoch": 3.931914893617021, + "grad_norm": 0.6412091851234436, + "learning_rate": 0.0002, + "loss": 1.5023, + "step": 2310 + }, + { + "epoch": 3.948936170212766, + "grad_norm": 0.6286490559577942, + "learning_rate": 0.0002, + "loss": 1.3709, + "step": 2320 + }, + { + "epoch": 3.9659574468085106, + "grad_norm": 0.636021077632904, + "learning_rate": 0.0002, + "loss": 1.4693, + "step": 2330 + }, + { + "epoch": 3.9829787234042553, + "grad_norm": 0.6032362580299377, + "learning_rate": 0.0002, + "loss": 1.4265, + "step": 2340 + }, + { + "epoch": 4.0, + "grad_norm": 0.6497282385826111, + "learning_rate": 0.0002, + "loss": 1.377, + "step": 2350 + }, + { + "epoch": 4.0, + "eval_loss": 1.9081238508224487, + "eval_runtime": 106.6404, + "eval_samples_per_second": 4.829, + "eval_steps_per_second": 0.61, + "step": 2350 + }, + { + "epoch": 4.017021276595744, + "grad_norm": 0.6278848648071289, + "learning_rate": 0.0002, + "loss": 1.317, + "step": 2360 + }, + { + "epoch": 4.034042553191489, + "grad_norm": 0.8259812593460083, + "learning_rate": 0.0002, + "loss": 1.3229, + "step": 2370 + }, + { + "epoch": 4.051063829787234, + "grad_norm": 0.7269589304924011, + "learning_rate": 0.0002, + "loss": 1.2776, + "step": 2380 + }, + { + "epoch": 4.068085106382979, + "grad_norm": 0.7460662126541138, + "learning_rate": 0.0002, + "loss": 1.3668, + "step": 2390 + }, + { + "epoch": 4.085106382978723, + "grad_norm": 1.2362046241760254, + "learning_rate": 0.0002, + "loss": 1.3096, + "step": 2400 + }, + { + "epoch": 4.102127659574468, + "grad_norm": 0.7699568867683411, + "learning_rate": 0.0002, + "loss": 1.2906, + "step": 2410 + }, + { + "epoch": 4.1191489361702125, + "grad_norm": 0.8732489347457886, + "learning_rate": 0.0002, + "loss": 1.3005, + "step": 2420 + }, + { + "epoch": 4.136170212765958, + "grad_norm": 0.8331889510154724, + "learning_rate": 0.0002, + "loss": 1.2741, + "step": 2430 + }, + { + "epoch": 4.153191489361702, + "grad_norm": 0.6686427593231201, + "learning_rate": 0.0002, + "loss": 1.1861, + "step": 2440 + }, + { + "epoch": 4.170212765957447, + "grad_norm": 0.906380832195282, + "learning_rate": 0.0002, + "loss": 1.316, + "step": 2450 + }, + { + "epoch": 4.187234042553191, + "grad_norm": 0.7269753813743591, + "learning_rate": 0.0002, + "loss": 1.3134, + "step": 2460 + }, + { + "epoch": 4.2042553191489365, + "grad_norm": 0.8556067943572998, + "learning_rate": 0.0002, + "loss": 1.299, + "step": 2470 + }, + { + "epoch": 4.221276595744681, + "grad_norm": 0.7076917886734009, + "learning_rate": 0.0002, + "loss": 1.2935, + "step": 2480 + }, + { + "epoch": 4.238297872340426, + "grad_norm": 0.7596837282180786, + "learning_rate": 0.0002, + "loss": 1.2608, + "step": 2490 + }, + { + "epoch": 4.25531914893617, + "grad_norm": 0.7790552377700806, + "learning_rate": 0.0002, + "loss": 1.2747, + "step": 2500 + }, + { + "epoch": 4.272340425531915, + "grad_norm": 0.8205534219741821, + "learning_rate": 0.0002, + "loss": 1.3438, + "step": 2510 + }, + { + "epoch": 4.2893617021276595, + "grad_norm": 0.7892114520072937, + "learning_rate": 0.0002, + "loss": 1.3058, + "step": 2520 + }, + { + "epoch": 4.306382978723404, + "grad_norm": 0.8907270431518555, + "learning_rate": 0.0002, + "loss": 1.3662, + "step": 2530 + }, + { + "epoch": 4.323404255319149, + "grad_norm": 0.821794331073761, + "learning_rate": 0.0002, + "loss": 1.3168, + "step": 2540 + }, + { + "epoch": 4.340425531914893, + "grad_norm": 0.7305247783660889, + "learning_rate": 0.0002, + "loss": 1.2467, + "step": 2550 + }, + { + "epoch": 4.357446808510638, + "grad_norm": 0.8639982342720032, + "learning_rate": 0.0002, + "loss": 1.3446, + "step": 2560 + }, + { + "epoch": 4.374468085106383, + "grad_norm": 0.8883494138717651, + "learning_rate": 0.0002, + "loss": 1.3863, + "step": 2570 + }, + { + "epoch": 4.391489361702128, + "grad_norm": 0.7611730098724365, + "learning_rate": 0.0002, + "loss": 1.3693, + "step": 2580 + }, + { + "epoch": 4.408510638297872, + "grad_norm": 0.7793022394180298, + "learning_rate": 0.0002, + "loss": 1.2814, + "step": 2590 + }, + { + "epoch": 4.425531914893617, + "grad_norm": 0.979060173034668, + "learning_rate": 0.0002, + "loss": 1.3014, + "step": 2600 + }, + { + "epoch": 4.4425531914893615, + "grad_norm": 0.8320847749710083, + "learning_rate": 0.0002, + "loss": 1.3625, + "step": 2610 + }, + { + "epoch": 4.459574468085107, + "grad_norm": 0.7481992244720459, + "learning_rate": 0.0002, + "loss": 1.3362, + "step": 2620 + }, + { + "epoch": 4.476595744680851, + "grad_norm": 0.783770740032196, + "learning_rate": 0.0002, + "loss": 1.4037, + "step": 2630 + }, + { + "epoch": 4.493617021276596, + "grad_norm": 0.773295521736145, + "learning_rate": 0.0002, + "loss": 1.3049, + "step": 2640 + }, + { + "epoch": 4.51063829787234, + "grad_norm": 0.9206840991973877, + "learning_rate": 0.0002, + "loss": 1.2739, + "step": 2650 + }, + { + "epoch": 4.527659574468085, + "grad_norm": 0.8803266882896423, + "learning_rate": 0.0002, + "loss": 1.3248, + "step": 2660 + }, + { + "epoch": 4.54468085106383, + "grad_norm": 0.9315535426139832, + "learning_rate": 0.0002, + "loss": 1.2898, + "step": 2670 + }, + { + "epoch": 4.561702127659575, + "grad_norm": 0.8610678315162659, + "learning_rate": 0.0002, + "loss": 1.316, + "step": 2680 + }, + { + "epoch": 4.578723404255319, + "grad_norm": 0.7405551671981812, + "learning_rate": 0.0002, + "loss": 1.2633, + "step": 2690 + }, + { + "epoch": 4.595744680851064, + "grad_norm": 1.0238394737243652, + "learning_rate": 0.0002, + "loss": 1.3136, + "step": 2700 + }, + { + "epoch": 4.6127659574468085, + "grad_norm": 0.7814345955848694, + "learning_rate": 0.0002, + "loss": 1.4847, + "step": 2710 + }, + { + "epoch": 4.629787234042553, + "grad_norm": 0.8436329364776611, + "learning_rate": 0.0002, + "loss": 1.295, + "step": 2720 + }, + { + "epoch": 4.646808510638298, + "grad_norm": 0.727214515209198, + "learning_rate": 0.0002, + "loss": 1.3525, + "step": 2730 + }, + { + "epoch": 4.663829787234042, + "grad_norm": 0.8465878367424011, + "learning_rate": 0.0002, + "loss": 1.3878, + "step": 2740 + }, + { + "epoch": 4.680851063829787, + "grad_norm": 0.8218137621879578, + "learning_rate": 0.0002, + "loss": 1.278, + "step": 2750 + }, + { + "epoch": 4.697872340425532, + "grad_norm": 0.7900442481040955, + "learning_rate": 0.0002, + "loss": 1.3628, + "step": 2760 + }, + { + "epoch": 4.714893617021277, + "grad_norm": 0.8214074969291687, + "learning_rate": 0.0002, + "loss": 1.3494, + "step": 2770 + }, + { + "epoch": 4.731914893617021, + "grad_norm": 0.7509574890136719, + "learning_rate": 0.0002, + "loss": 1.3954, + "step": 2780 + }, + { + "epoch": 4.748936170212766, + "grad_norm": 0.7416139245033264, + "learning_rate": 0.0002, + "loss": 1.3693, + "step": 2790 + }, + { + "epoch": 4.76595744680851, + "grad_norm": 0.8629977107048035, + "learning_rate": 0.0002, + "loss": 1.3045, + "step": 2800 + }, + { + "epoch": 4.782978723404256, + "grad_norm": 0.8056505918502808, + "learning_rate": 0.0002, + "loss": 1.3164, + "step": 2810 + }, + { + "epoch": 4.8, + "grad_norm": 0.7705401182174683, + "learning_rate": 0.0002, + "loss": 1.3056, + "step": 2820 + }, + { + "epoch": 4.817021276595745, + "grad_norm": 1.0173288583755493, + "learning_rate": 0.0002, + "loss": 1.3771, + "step": 2830 + }, + { + "epoch": 4.834042553191489, + "grad_norm": 0.8375823497772217, + "learning_rate": 0.0002, + "loss": 1.3494, + "step": 2840 + }, + { + "epoch": 4.851063829787234, + "grad_norm": 0.857073187828064, + "learning_rate": 0.0002, + "loss": 1.3238, + "step": 2850 + }, + { + "epoch": 4.868085106382979, + "grad_norm": 0.8672189712524414, + "learning_rate": 0.0002, + "loss": 1.2964, + "step": 2860 + }, + { + "epoch": 4.885106382978723, + "grad_norm": 0.8599910140037537, + "learning_rate": 0.0002, + "loss": 1.3646, + "step": 2870 + }, + { + "epoch": 4.902127659574468, + "grad_norm": 0.8844674229621887, + "learning_rate": 0.0002, + "loss": 1.3575, + "step": 2880 + }, + { + "epoch": 4.919148936170213, + "grad_norm": 0.8246751427650452, + "learning_rate": 0.0002, + "loss": 1.285, + "step": 2890 + }, + { + "epoch": 4.9361702127659575, + "grad_norm": 0.8648163676261902, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 2900 + }, + { + "epoch": 4.953191489361702, + "grad_norm": 0.9477900266647339, + "learning_rate": 0.0002, + "loss": 1.2614, + "step": 2910 + }, + { + "epoch": 4.970212765957447, + "grad_norm": 0.8047965168952942, + "learning_rate": 0.0002, + "loss": 1.3519, + "step": 2920 + }, + { + "epoch": 4.987234042553191, + "grad_norm": 0.9872494339942932, + "learning_rate": 0.0002, + "loss": 1.3889, + "step": 2930 + }, + { + "epoch": 4.999148936170212, + "eval_loss": 1.9836769104003906, + "eval_runtime": 106.4655, + "eval_samples_per_second": 4.837, + "eval_steps_per_second": 0.611, + "step": 2937 + }, + { + "epoch": 5.004255319148936, + "grad_norm": 0.7292938828468323, + "learning_rate": 0.0002, + "loss": 1.2574, + "step": 2940 + }, + { + "epoch": 5.0212765957446805, + "grad_norm": 0.8610548973083496, + "learning_rate": 0.0002, + "loss": 1.1312, + "step": 2950 + }, + { + "epoch": 5.038297872340426, + "grad_norm": 0.8384576439857483, + "learning_rate": 0.0002, + "loss": 1.1105, + "step": 2960 + }, + { + "epoch": 5.05531914893617, + "grad_norm": 0.9746620059013367, + "learning_rate": 0.0002, + "loss": 1.1412, + "step": 2970 + }, + { + "epoch": 5.072340425531915, + "grad_norm": 0.8879048228263855, + "learning_rate": 0.0002, + "loss": 1.1687, + "step": 2980 + }, + { + "epoch": 5.089361702127659, + "grad_norm": 0.9006168246269226, + "learning_rate": 0.0002, + "loss": 1.1333, + "step": 2990 + }, + { + "epoch": 5.1063829787234045, + "grad_norm": 0.9770249128341675, + "learning_rate": 0.0002, + "loss": 1.1659, + "step": 3000 + }, + { + "epoch": 5.123404255319149, + "grad_norm": 1.267967939376831, + "learning_rate": 0.0002, + "loss": 1.1334, + "step": 3010 + }, + { + "epoch": 5.140425531914894, + "grad_norm": 0.9857587218284607, + "learning_rate": 0.0002, + "loss": 1.2095, + "step": 3020 + }, + { + "epoch": 5.157446808510638, + "grad_norm": 1.2938690185546875, + "learning_rate": 0.0002, + "loss": 1.0889, + "step": 3030 + }, + { + "epoch": 5.174468085106383, + "grad_norm": 0.8928244113922119, + "learning_rate": 0.0002, + "loss": 1.1645, + "step": 3040 + }, + { + "epoch": 5.191489361702128, + "grad_norm": 1.1087630987167358, + "learning_rate": 0.0002, + "loss": 1.1553, + "step": 3050 + }, + { + "epoch": 5.208510638297873, + "grad_norm": 0.9431360960006714, + "learning_rate": 0.0002, + "loss": 1.1416, + "step": 3060 + }, + { + "epoch": 5.225531914893617, + "grad_norm": 1.2048338651657104, + "learning_rate": 0.0002, + "loss": 1.1635, + "step": 3070 + }, + { + "epoch": 5.242553191489361, + "grad_norm": 1.0017054080963135, + "learning_rate": 0.0002, + "loss": 1.171, + "step": 3080 + }, + { + "epoch": 5.259574468085106, + "grad_norm": 1.2771434783935547, + "learning_rate": 0.0002, + "loss": 1.2212, + "step": 3090 + }, + { + "epoch": 5.276595744680851, + "grad_norm": 1.4307383298873901, + "learning_rate": 0.0002, + "loss": 1.1478, + "step": 3100 + }, + { + "epoch": 5.293617021276596, + "grad_norm": 1.2460752725601196, + "learning_rate": 0.0002, + "loss": 1.2132, + "step": 3110 + }, + { + "epoch": 5.31063829787234, + "grad_norm": 1.693974494934082, + "learning_rate": 0.0002, + "loss": 1.235, + "step": 3120 + }, + { + "epoch": 5.327659574468085, + "grad_norm": 0.9855408668518066, + "learning_rate": 0.0002, + "loss": 1.1961, + "step": 3130 + }, + { + "epoch": 5.3446808510638295, + "grad_norm": 1.307521104812622, + "learning_rate": 0.0002, + "loss": 1.2068, + "step": 3140 + }, + { + "epoch": 5.361702127659575, + "grad_norm": 0.957661509513855, + "learning_rate": 0.0002, + "loss": 1.2144, + "step": 3150 + }, + { + "epoch": 5.378723404255319, + "grad_norm": 0.870373010635376, + "learning_rate": 0.0002, + "loss": 1.1305, + "step": 3160 + }, + { + "epoch": 5.395744680851064, + "grad_norm": 0.9324309229850769, + "learning_rate": 0.0002, + "loss": 1.2196, + "step": 3170 + }, + { + "epoch": 5.412765957446808, + "grad_norm": 1.0142403841018677, + "learning_rate": 0.0002, + "loss": 1.1691, + "step": 3180 + }, + { + "epoch": 5.4297872340425535, + "grad_norm": 0.9759578704833984, + "learning_rate": 0.0002, + "loss": 1.1788, + "step": 3190 + }, + { + "epoch": 5.446808510638298, + "grad_norm": 0.9021993279457092, + "learning_rate": 0.0002, + "loss": 1.1321, + "step": 3200 + }, + { + "epoch": 5.463829787234043, + "grad_norm": 1.007728934288025, + "learning_rate": 0.0002, + "loss": 1.2222, + "step": 3210 + }, + { + "epoch": 5.480851063829787, + "grad_norm": 0.8969265222549438, + "learning_rate": 0.0002, + "loss": 1.1517, + "step": 3220 + }, + { + "epoch": 5.497872340425532, + "grad_norm": 0.9672483801841736, + "learning_rate": 0.0002, + "loss": 1.2061, + "step": 3230 + }, + { + "epoch": 5.514893617021277, + "grad_norm": 1.1417138576507568, + "learning_rate": 0.0002, + "loss": 1.1454, + "step": 3240 + }, + { + "epoch": 5.531914893617021, + "grad_norm": 0.9669530391693115, + "learning_rate": 0.0002, + "loss": 1.1871, + "step": 3250 + }, + { + "epoch": 5.548936170212766, + "grad_norm": 1.0161820650100708, + "learning_rate": 0.0002, + "loss": 1.1382, + "step": 3260 + }, + { + "epoch": 5.565957446808511, + "grad_norm": 0.9935774803161621, + "learning_rate": 0.0002, + "loss": 1.1708, + "step": 3270 + }, + { + "epoch": 5.582978723404255, + "grad_norm": 1.2572048902511597, + "learning_rate": 0.0002, + "loss": 1.1384, + "step": 3280 + }, + { + "epoch": 5.6, + "grad_norm": 0.9614662528038025, + "learning_rate": 0.0002, + "loss": 1.1711, + "step": 3290 + }, + { + "epoch": 5.617021276595745, + "grad_norm": 0.9835584163665771, + "learning_rate": 0.0002, + "loss": 1.219, + "step": 3300 + }, + { + "epoch": 5.634042553191489, + "grad_norm": 0.9387389421463013, + "learning_rate": 0.0002, + "loss": 1.2074, + "step": 3310 + }, + { + "epoch": 5.651063829787234, + "grad_norm": 0.9348428249359131, + "learning_rate": 0.0002, + "loss": 1.1148, + "step": 3320 + }, + { + "epoch": 5.6680851063829785, + "grad_norm": 0.9636440873146057, + "learning_rate": 0.0002, + "loss": 1.2378, + "step": 3330 + }, + { + "epoch": 5.685106382978724, + "grad_norm": 0.995894193649292, + "learning_rate": 0.0002, + "loss": 1.2068, + "step": 3340 + }, + { + "epoch": 5.702127659574468, + "grad_norm": 1.0357023477554321, + "learning_rate": 0.0002, + "loss": 1.1443, + "step": 3350 + }, + { + "epoch": 5.719148936170213, + "grad_norm": 1.0254428386688232, + "learning_rate": 0.0002, + "loss": 1.2209, + "step": 3360 + }, + { + "epoch": 5.736170212765957, + "grad_norm": 0.8993342518806458, + "learning_rate": 0.0002, + "loss": 1.1987, + "step": 3370 + }, + { + "epoch": 5.753191489361702, + "grad_norm": 0.9104585647583008, + "learning_rate": 0.0002, + "loss": 1.1527, + "step": 3380 + }, + { + "epoch": 5.770212765957447, + "grad_norm": 0.9555654525756836, + "learning_rate": 0.0002, + "loss": 1.2268, + "step": 3390 + }, + { + "epoch": 5.787234042553192, + "grad_norm": 0.920124351978302, + "learning_rate": 0.0002, + "loss": 1.193, + "step": 3400 + }, + { + "epoch": 5.804255319148936, + "grad_norm": 0.999706506729126, + "learning_rate": 0.0002, + "loss": 1.2263, + "step": 3410 + }, + { + "epoch": 5.821276595744681, + "grad_norm": 0.9292707443237305, + "learning_rate": 0.0002, + "loss": 1.1411, + "step": 3420 + }, + { + "epoch": 5.8382978723404255, + "grad_norm": 1.0074706077575684, + "learning_rate": 0.0002, + "loss": 1.1507, + "step": 3430 + }, + { + "epoch": 5.85531914893617, + "grad_norm": 1.0279479026794434, + "learning_rate": 0.0002, + "loss": 1.2709, + "step": 3440 + }, + { + "epoch": 5.872340425531915, + "grad_norm": 1.0026037693023682, + "learning_rate": 0.0002, + "loss": 1.1992, + "step": 3450 + }, + { + "epoch": 5.889361702127659, + "grad_norm": 1.0356525182724, + "learning_rate": 0.0002, + "loss": 1.1416, + "step": 3460 + }, + { + "epoch": 5.906382978723404, + "grad_norm": 1.1106643676757812, + "learning_rate": 0.0002, + "loss": 1.224, + "step": 3470 + }, + { + "epoch": 5.923404255319149, + "grad_norm": 0.9578408002853394, + "learning_rate": 0.0002, + "loss": 1.1955, + "step": 3480 + }, + { + "epoch": 5.940425531914894, + "grad_norm": 1.0225932598114014, + "learning_rate": 0.0002, + "loss": 1.2133, + "step": 3490 + }, + { + "epoch": 5.957446808510638, + "grad_norm": 0.9677667021751404, + "learning_rate": 0.0002, + "loss": 1.157, + "step": 3500 + }, + { + "epoch": 5.974468085106383, + "grad_norm": 1.0967241525650024, + "learning_rate": 0.0002, + "loss": 1.2196, + "step": 3510 + }, + { + "epoch": 5.991489361702127, + "grad_norm": 1.2497339248657227, + "learning_rate": 0.0002, + "loss": 1.1807, + "step": 3520 + }, + { + "epoch": 6.0, + "eval_loss": 2.0976572036743164, + "eval_runtime": 105.9679, + "eval_samples_per_second": 4.86, + "eval_steps_per_second": 0.613, + "step": 3525 + } + ], + "logging_steps": 10, + "max_steps": 4696, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.631291072053248e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..40bc5c2205bcd1402ce337f3a218ef4840b3870a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-3525/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:148f55f87c9f695bdf2cf6d54b37e690c06ff5da5e17bc3af6c1f44f90f45374 +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..81b412d38f985ff160b354ff1ea3e86d8744ca1e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4e29a88e2747fada5e93c0d90f9dcbfed8f330e384c7f3572544143ea30399d +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f9f2065407e21dd09e01bd4467084516523bd54 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ecf469a90a038226fa25108592c7fd08200da73312feae16dcf456a78f2d8de +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ace4b26e36b2b7fbf0f6a1dccc2b3c996e92c424 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7377f20155e2cfc22fb937a3bd8b30d7e99de1db1b4080a5c1651e468d14cfb +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a2638c556e39a762b35e9570a6e41ddd3b335e9 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5694797f54c3d94f6c31731e01caf1424cef8007bfcb5ff9930961eac38da1e +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2abe7ed8a08fb71063a29a70d8462f6b9f3f55e7 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/trainer_state.json @@ -0,0 +1,2966 @@ +{ + "best_metric": 1.8328146934509277, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175", + "epoch": 6.999148936170212, + "eval_steps": 10, + "global_step": 4112, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01702127659574468, + "grad_norm": 0.7596228122711182, + "learning_rate": 0.0002, + "loss": 2.5996, + "step": 10 + }, + { + "epoch": 0.03404255319148936, + "grad_norm": 0.4860903322696686, + "learning_rate": 0.0002, + "loss": 2.2941, + "step": 20 + }, + { + "epoch": 0.05106382978723404, + "grad_norm": 0.4953401982784271, + "learning_rate": 0.0002, + "loss": 2.0018, + "step": 30 + }, + { + "epoch": 0.06808510638297872, + "grad_norm": 0.5086901783943176, + "learning_rate": 0.0002, + "loss": 1.9318, + "step": 40 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 0.49050021171569824, + "learning_rate": 0.0002, + "loss": 1.8519, + "step": 50 + }, + { + "epoch": 0.10212765957446808, + "grad_norm": 0.4922358989715576, + "learning_rate": 0.0002, + "loss": 1.8786, + "step": 60 + }, + { + "epoch": 0.11914893617021277, + "grad_norm": 0.4621541202068329, + "learning_rate": 0.0002, + "loss": 1.8812, + "step": 70 + }, + { + "epoch": 0.13617021276595745, + "grad_norm": 0.4416729807853699, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 80 + }, + { + "epoch": 0.15319148936170213, + "grad_norm": 0.526258111000061, + "learning_rate": 0.0002, + "loss": 1.9298, + "step": 90 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 0.44022637605667114, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 100 + }, + { + "epoch": 0.18723404255319148, + "grad_norm": 0.4647711515426636, + "learning_rate": 0.0002, + "loss": 1.7681, + "step": 110 + }, + { + "epoch": 0.20425531914893616, + "grad_norm": 0.4136318564414978, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 120 + }, + { + "epoch": 0.22127659574468084, + "grad_norm": 0.39707672595977783, + "learning_rate": 0.0002, + "loss": 1.8491, + "step": 130 + }, + { + "epoch": 0.23829787234042554, + "grad_norm": 0.4478105306625366, + "learning_rate": 0.0002, + "loss": 1.8017, + "step": 140 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 0.4699741303920746, + "learning_rate": 0.0002, + "loss": 1.8067, + "step": 150 + }, + { + "epoch": 0.2723404255319149, + "grad_norm": 0.4568363130092621, + "learning_rate": 0.0002, + "loss": 1.8161, + "step": 160 + }, + { + "epoch": 0.28936170212765955, + "grad_norm": 0.45078757405281067, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 170 + }, + { + "epoch": 0.30638297872340425, + "grad_norm": 0.4127245843410492, + "learning_rate": 0.0002, + "loss": 1.8115, + "step": 180 + }, + { + "epoch": 0.32340425531914896, + "grad_norm": 0.4042493402957916, + "learning_rate": 0.0002, + "loss": 1.7707, + "step": 190 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 0.401487797498703, + "learning_rate": 0.0002, + "loss": 1.7807, + "step": 200 + }, + { + "epoch": 0.3574468085106383, + "grad_norm": 0.3959457576274872, + "learning_rate": 0.0002, + "loss": 1.8355, + "step": 210 + }, + { + "epoch": 0.37446808510638296, + "grad_norm": 0.39865636825561523, + "learning_rate": 0.0002, + "loss": 1.7798, + "step": 220 + }, + { + "epoch": 0.39148936170212767, + "grad_norm": 0.7225169539451599, + "learning_rate": 0.0002, + "loss": 1.9012, + "step": 230 + }, + { + "epoch": 0.4085106382978723, + "grad_norm": 0.412801593542099, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 240 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 0.40951448678970337, + "learning_rate": 0.0002, + "loss": 1.8523, + "step": 250 + }, + { + "epoch": 0.4425531914893617, + "grad_norm": 0.42788130044937134, + "learning_rate": 0.0002, + "loss": 1.7283, + "step": 260 + }, + { + "epoch": 0.4595744680851064, + "grad_norm": 0.41069576144218445, + "learning_rate": 0.0002, + "loss": 1.8051, + "step": 270 + }, + { + "epoch": 0.4765957446808511, + "grad_norm": 0.3745323717594147, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 280 + }, + { + "epoch": 0.49361702127659574, + "grad_norm": 0.3771323263645172, + "learning_rate": 0.0002, + "loss": 1.8484, + "step": 290 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 0.34368929266929626, + "learning_rate": 0.0002, + "loss": 1.7417, + "step": 300 + }, + { + "epoch": 0.5276595744680851, + "grad_norm": 0.4299296736717224, + "learning_rate": 0.0002, + "loss": 1.7394, + "step": 310 + }, + { + "epoch": 0.5446808510638298, + "grad_norm": 0.4133922755718231, + "learning_rate": 0.0002, + "loss": 1.8255, + "step": 320 + }, + { + "epoch": 0.5617021276595745, + "grad_norm": 0.3984859585762024, + "learning_rate": 0.0002, + "loss": 1.7043, + "step": 330 + }, + { + "epoch": 0.5787234042553191, + "grad_norm": 0.3822788894176483, + "learning_rate": 0.0002, + "loss": 1.7873, + "step": 340 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 0.4550061821937561, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 350 + }, + { + "epoch": 0.6127659574468085, + "grad_norm": 0.36571192741394043, + "learning_rate": 0.0002, + "loss": 1.7179, + "step": 360 + }, + { + "epoch": 0.6297872340425532, + "grad_norm": 0.32942914962768555, + "learning_rate": 0.0002, + "loss": 1.8196, + "step": 370 + }, + { + "epoch": 0.6468085106382979, + "grad_norm": 0.39299526810646057, + "learning_rate": 0.0002, + "loss": 1.7118, + "step": 380 + }, + { + "epoch": 0.6638297872340425, + "grad_norm": 0.3817657232284546, + "learning_rate": 0.0002, + "loss": 1.8179, + "step": 390 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 0.3650810122489929, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 400 + }, + { + "epoch": 0.6978723404255319, + "grad_norm": 0.3736686408519745, + "learning_rate": 0.0002, + "loss": 1.8026, + "step": 410 + }, + { + "epoch": 0.7148936170212766, + "grad_norm": 0.45680564641952515, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 420 + }, + { + "epoch": 0.7319148936170212, + "grad_norm": 0.4154510200023651, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 430 + }, + { + "epoch": 0.7489361702127659, + "grad_norm": 0.3701167106628418, + "learning_rate": 0.0002, + "loss": 1.7801, + "step": 440 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 0.3869531750679016, + "learning_rate": 0.0002, + "loss": 1.7689, + "step": 450 + }, + { + "epoch": 0.7829787234042553, + "grad_norm": 0.4391495883464813, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 460 + }, + { + "epoch": 0.8, + "grad_norm": 0.39652755856513977, + "learning_rate": 0.0002, + "loss": 1.7534, + "step": 470 + }, + { + "epoch": 0.8170212765957446, + "grad_norm": 0.4096752107143402, + "learning_rate": 0.0002, + "loss": 1.8076, + "step": 480 + }, + { + "epoch": 0.8340425531914893, + "grad_norm": 0.3857504427433014, + "learning_rate": 0.0002, + "loss": 1.796, + "step": 490 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 0.4105374217033386, + "learning_rate": 0.0002, + "loss": 1.7379, + "step": 500 + }, + { + "epoch": 0.8680851063829788, + "grad_norm": 0.3723328113555908, + "learning_rate": 0.0002, + "loss": 1.6391, + "step": 510 + }, + { + "epoch": 0.8851063829787233, + "grad_norm": 0.36099690198898315, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 520 + }, + { + "epoch": 0.902127659574468, + "grad_norm": 0.3715187907218933, + "learning_rate": 0.0002, + "loss": 1.7632, + "step": 530 + }, + { + "epoch": 0.9191489361702128, + "grad_norm": 0.4932813048362732, + "learning_rate": 0.0002, + "loss": 1.7004, + "step": 540 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 0.3493495285511017, + "learning_rate": 0.0002, + "loss": 1.679, + "step": 550 + }, + { + "epoch": 0.9531914893617022, + "grad_norm": 0.3598061800003052, + "learning_rate": 0.0002, + "loss": 1.7758, + "step": 560 + }, + { + "epoch": 0.9702127659574468, + "grad_norm": 0.3521560728549957, + "learning_rate": 0.0002, + "loss": 1.7686, + "step": 570 + }, + { + "epoch": 0.9872340425531915, + "grad_norm": 0.34150034189224243, + "learning_rate": 0.0002, + "loss": 1.7391, + "step": 580 + }, + { + "epoch": 0.9991489361702127, + "eval_loss": 1.8388911485671997, + "eval_runtime": 106.6788, + "eval_samples_per_second": 4.828, + "eval_steps_per_second": 0.609, + "step": 587 + }, + { + "epoch": 1.004255319148936, + "grad_norm": 0.33556103706359863, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 590 + }, + { + "epoch": 1.0212765957446808, + "grad_norm": 0.4333398640155792, + "learning_rate": 0.0002, + "loss": 1.8346, + "step": 600 + }, + { + "epoch": 1.0382978723404255, + "grad_norm": 0.38488736748695374, + "learning_rate": 0.0002, + "loss": 1.6656, + "step": 610 + }, + { + "epoch": 1.0553191489361702, + "grad_norm": 0.44454529881477356, + "learning_rate": 0.0002, + "loss": 1.7778, + "step": 620 + }, + { + "epoch": 1.0723404255319149, + "grad_norm": 0.3735603392124176, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 630 + }, + { + "epoch": 1.0893617021276596, + "grad_norm": 0.38912704586982727, + "learning_rate": 0.0002, + "loss": 1.74, + "step": 640 + }, + { + "epoch": 1.1063829787234043, + "grad_norm": 0.4411826431751251, + "learning_rate": 0.0002, + "loss": 1.607, + "step": 650 + }, + { + "epoch": 1.123404255319149, + "grad_norm": 0.4163050353527069, + "learning_rate": 0.0002, + "loss": 1.6901, + "step": 660 + }, + { + "epoch": 1.1404255319148937, + "grad_norm": 0.4187192916870117, + "learning_rate": 0.0002, + "loss": 1.6722, + "step": 670 + }, + { + "epoch": 1.1574468085106382, + "grad_norm": 0.3797093629837036, + "learning_rate": 0.0002, + "loss": 1.7061, + "step": 680 + }, + { + "epoch": 1.174468085106383, + "grad_norm": 0.4210026264190674, + "learning_rate": 0.0002, + "loss": 1.6409, + "step": 690 + }, + { + "epoch": 1.1914893617021276, + "grad_norm": 0.4701998829841614, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 700 + }, + { + "epoch": 1.2085106382978723, + "grad_norm": 0.6331578493118286, + "learning_rate": 0.0002, + "loss": 1.6773, + "step": 710 + }, + { + "epoch": 1.225531914893617, + "grad_norm": 0.41908255219459534, + "learning_rate": 0.0002, + "loss": 1.6776, + "step": 720 + }, + { + "epoch": 1.2425531914893617, + "grad_norm": 0.36158403754234314, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 730 + }, + { + "epoch": 1.2595744680851064, + "grad_norm": 0.387300580739975, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 740 + }, + { + "epoch": 1.2765957446808511, + "grad_norm": 0.38899728655815125, + "learning_rate": 0.0002, + "loss": 1.7089, + "step": 750 + }, + { + "epoch": 1.2936170212765958, + "grad_norm": 0.4549255073070526, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 760 + }, + { + "epoch": 1.3106382978723405, + "grad_norm": 0.4052349328994751, + "learning_rate": 0.0002, + "loss": 1.6754, + "step": 770 + }, + { + "epoch": 1.327659574468085, + "grad_norm": 0.38934215903282166, + "learning_rate": 0.0002, + "loss": 1.6483, + "step": 780 + }, + { + "epoch": 1.3446808510638297, + "grad_norm": 0.38688382506370544, + "learning_rate": 0.0002, + "loss": 1.7813, + "step": 790 + }, + { + "epoch": 1.3617021276595744, + "grad_norm": 0.3825705051422119, + "learning_rate": 0.0002, + "loss": 1.6374, + "step": 800 + }, + { + "epoch": 1.3787234042553191, + "grad_norm": 0.37331756949424744, + "learning_rate": 0.0002, + "loss": 1.655, + "step": 810 + }, + { + "epoch": 1.3957446808510638, + "grad_norm": 0.38826408982276917, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 820 + }, + { + "epoch": 1.4127659574468086, + "grad_norm": 0.4213569164276123, + "learning_rate": 0.0002, + "loss": 1.6107, + "step": 830 + }, + { + "epoch": 1.4297872340425533, + "grad_norm": 0.3976684808731079, + "learning_rate": 0.0002, + "loss": 1.6792, + "step": 840 + }, + { + "epoch": 1.4468085106382977, + "grad_norm": 0.390009343624115, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 850 + }, + { + "epoch": 1.4638297872340424, + "grad_norm": 0.4462052583694458, + "learning_rate": 0.0002, + "loss": 1.7489, + "step": 860 + }, + { + "epoch": 1.4808510638297872, + "grad_norm": 0.42129236459732056, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 870 + }, + { + "epoch": 1.4978723404255319, + "grad_norm": 0.41489893198013306, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 880 + }, + { + "epoch": 1.5148936170212766, + "grad_norm": 0.41451677680015564, + "learning_rate": 0.0002, + "loss": 1.7129, + "step": 890 + }, + { + "epoch": 1.5319148936170213, + "grad_norm": 0.4477299749851227, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 900 + }, + { + "epoch": 1.548936170212766, + "grad_norm": 0.38476648926734924, + "learning_rate": 0.0002, + "loss": 1.6543, + "step": 910 + }, + { + "epoch": 1.5659574468085107, + "grad_norm": 0.42755743861198425, + "learning_rate": 0.0002, + "loss": 1.7103, + "step": 920 + }, + { + "epoch": 1.5829787234042554, + "grad_norm": 0.39372023940086365, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 930 + }, + { + "epoch": 1.6, + "grad_norm": 0.42778754234313965, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 940 + }, + { + "epoch": 1.6170212765957448, + "grad_norm": 0.4217268228530884, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 950 + }, + { + "epoch": 1.6340425531914895, + "grad_norm": 0.40452107787132263, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 960 + }, + { + "epoch": 1.6510638297872342, + "grad_norm": 0.4259980022907257, + "learning_rate": 0.0002, + "loss": 1.6734, + "step": 970 + }, + { + "epoch": 1.6680851063829787, + "grad_norm": 0.4089849591255188, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 980 + }, + { + "epoch": 1.6851063829787234, + "grad_norm": 0.38276049494743347, + "learning_rate": 0.0002, + "loss": 1.6588, + "step": 990 + }, + { + "epoch": 1.702127659574468, + "grad_norm": 0.40361565351486206, + "learning_rate": 0.0002, + "loss": 1.6846, + "step": 1000 + }, + { + "epoch": 1.7191489361702128, + "grad_norm": 0.3537807762622833, + "learning_rate": 0.0002, + "loss": 1.698, + "step": 1010 + }, + { + "epoch": 1.7361702127659573, + "grad_norm": 0.40288347005844116, + "learning_rate": 0.0002, + "loss": 1.6001, + "step": 1020 + }, + { + "epoch": 1.753191489361702, + "grad_norm": 0.4003616273403168, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1030 + }, + { + "epoch": 1.7702127659574467, + "grad_norm": 0.3931669592857361, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 1040 + }, + { + "epoch": 1.7872340425531914, + "grad_norm": 0.4001635015010834, + "learning_rate": 0.0002, + "loss": 1.6814, + "step": 1050 + }, + { + "epoch": 1.804255319148936, + "grad_norm": 0.4139048457145691, + "learning_rate": 0.0002, + "loss": 1.6158, + "step": 1060 + }, + { + "epoch": 1.8212765957446808, + "grad_norm": 0.5044458508491516, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 1070 + }, + { + "epoch": 1.8382978723404255, + "grad_norm": 0.4827095568180084, + "learning_rate": 0.0002, + "loss": 1.7317, + "step": 1080 + }, + { + "epoch": 1.8553191489361702, + "grad_norm": 0.3750515282154083, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 1090 + }, + { + "epoch": 1.872340425531915, + "grad_norm": 0.4024597704410553, + "learning_rate": 0.0002, + "loss": 1.653, + "step": 1100 + }, + { + "epoch": 1.8893617021276596, + "grad_norm": 0.36747241020202637, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1110 + }, + { + "epoch": 1.9063829787234043, + "grad_norm": 0.41397711634635925, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 1120 + }, + { + "epoch": 1.923404255319149, + "grad_norm": 0.3960763216018677, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 1130 + }, + { + "epoch": 1.9404255319148938, + "grad_norm": 0.4533233344554901, + "learning_rate": 0.0002, + "loss": 1.6857, + "step": 1140 + }, + { + "epoch": 1.9574468085106385, + "grad_norm": 0.38433438539505005, + "learning_rate": 0.0002, + "loss": 1.7168, + "step": 1150 + }, + { + "epoch": 1.974468085106383, + "grad_norm": 0.3648812174797058, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 1160 + }, + { + "epoch": 1.9914893617021276, + "grad_norm": 0.3887176215648651, + "learning_rate": 0.0002, + "loss": 1.6521, + "step": 1170 + }, + { + "epoch": 2.0, + "eval_loss": 1.8328146934509277, + "eval_runtime": 107.2842, + "eval_samples_per_second": 4.8, + "eval_steps_per_second": 0.606, + "step": 1175 + }, + { + "epoch": 2.008510638297872, + "grad_norm": 0.40444880723953247, + "learning_rate": 0.0002, + "loss": 1.6184, + "step": 1180 + }, + { + "epoch": 2.025531914893617, + "grad_norm": 0.3997816741466522, + "learning_rate": 0.0002, + "loss": 1.5221, + "step": 1190 + }, + { + "epoch": 2.0425531914893615, + "grad_norm": 0.4516718089580536, + "learning_rate": 0.0002, + "loss": 1.6022, + "step": 1200 + }, + { + "epoch": 2.0595744680851062, + "grad_norm": 0.6645553708076477, + "learning_rate": 0.0002, + "loss": 1.5458, + "step": 1210 + }, + { + "epoch": 2.076595744680851, + "grad_norm": 0.4181990921497345, + "learning_rate": 0.0002, + "loss": 1.5985, + "step": 1220 + }, + { + "epoch": 2.0936170212765957, + "grad_norm": 0.45681431889533997, + "learning_rate": 0.0002, + "loss": 1.5768, + "step": 1230 + }, + { + "epoch": 2.1106382978723404, + "grad_norm": 0.48914700746536255, + "learning_rate": 0.0002, + "loss": 1.5598, + "step": 1240 + }, + { + "epoch": 2.127659574468085, + "grad_norm": 0.43265485763549805, + "learning_rate": 0.0002, + "loss": 1.6031, + "step": 1250 + }, + { + "epoch": 2.1446808510638298, + "grad_norm": 0.4641207754611969, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 1260 + }, + { + "epoch": 2.1617021276595745, + "grad_norm": 0.4840783476829529, + "learning_rate": 0.0002, + "loss": 1.5686, + "step": 1270 + }, + { + "epoch": 2.178723404255319, + "grad_norm": 0.4974595308303833, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 1280 + }, + { + "epoch": 2.195744680851064, + "grad_norm": 0.5133475661277771, + "learning_rate": 0.0002, + "loss": 1.5512, + "step": 1290 + }, + { + "epoch": 2.2127659574468086, + "grad_norm": 0.5030052065849304, + "learning_rate": 0.0002, + "loss": 1.6467, + "step": 1300 + }, + { + "epoch": 2.2297872340425533, + "grad_norm": 0.46602481603622437, + "learning_rate": 0.0002, + "loss": 1.6845, + "step": 1310 + }, + { + "epoch": 2.246808510638298, + "grad_norm": 0.43662378191947937, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 1320 + }, + { + "epoch": 2.2638297872340427, + "grad_norm": 0.5137454867362976, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 1330 + }, + { + "epoch": 2.2808510638297874, + "grad_norm": 0.4750335216522217, + "learning_rate": 0.0002, + "loss": 1.5702, + "step": 1340 + }, + { + "epoch": 2.297872340425532, + "grad_norm": 0.43691426515579224, + "learning_rate": 0.0002, + "loss": 1.6479, + "step": 1350 + }, + { + "epoch": 2.3148936170212764, + "grad_norm": 0.49752047657966614, + "learning_rate": 0.0002, + "loss": 1.5561, + "step": 1360 + }, + { + "epoch": 2.331914893617021, + "grad_norm": 0.45101815462112427, + "learning_rate": 0.0002, + "loss": 1.5789, + "step": 1370 + }, + { + "epoch": 2.348936170212766, + "grad_norm": 0.4427817761898041, + "learning_rate": 0.0002, + "loss": 1.5858, + "step": 1380 + }, + { + "epoch": 2.3659574468085105, + "grad_norm": 0.4802311062812805, + "learning_rate": 0.0002, + "loss": 1.5537, + "step": 1390 + }, + { + "epoch": 2.382978723404255, + "grad_norm": 0.4512513279914856, + "learning_rate": 0.0002, + "loss": 1.5846, + "step": 1400 + }, + { + "epoch": 2.4, + "grad_norm": 0.4878857135772705, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 1410 + }, + { + "epoch": 2.4170212765957446, + "grad_norm": 0.4741315543651581, + "learning_rate": 0.0002, + "loss": 1.5781, + "step": 1420 + }, + { + "epoch": 2.4340425531914893, + "grad_norm": 0.4770931601524353, + "learning_rate": 0.0002, + "loss": 1.5595, + "step": 1430 + }, + { + "epoch": 2.451063829787234, + "grad_norm": 0.5124667286872864, + "learning_rate": 0.0002, + "loss": 1.5336, + "step": 1440 + }, + { + "epoch": 2.4680851063829787, + "grad_norm": 0.45264801383018494, + "learning_rate": 0.0002, + "loss": 1.5811, + "step": 1450 + }, + { + "epoch": 2.4851063829787234, + "grad_norm": 0.5456924438476562, + "learning_rate": 0.0002, + "loss": 1.604, + "step": 1460 + }, + { + "epoch": 2.502127659574468, + "grad_norm": 0.44656285643577576, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 1470 + }, + { + "epoch": 2.519148936170213, + "grad_norm": 0.5939419865608215, + "learning_rate": 0.0002, + "loss": 1.6336, + "step": 1480 + }, + { + "epoch": 2.5361702127659576, + "grad_norm": 0.47853362560272217, + "learning_rate": 0.0002, + "loss": 1.5481, + "step": 1490 + }, + { + "epoch": 2.5531914893617023, + "grad_norm": 0.47643396258354187, + "learning_rate": 0.0002, + "loss": 1.6543, + "step": 1500 + }, + { + "epoch": 2.570212765957447, + "grad_norm": 0.4939501881599426, + "learning_rate": 0.0002, + "loss": 1.5085, + "step": 1510 + }, + { + "epoch": 2.5872340425531917, + "grad_norm": 0.502055287361145, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 1520 + }, + { + "epoch": 2.604255319148936, + "grad_norm": 0.463250994682312, + "learning_rate": 0.0002, + "loss": 1.6519, + "step": 1530 + }, + { + "epoch": 2.621276595744681, + "grad_norm": 0.4761098623275757, + "learning_rate": 0.0002, + "loss": 1.5698, + "step": 1540 + }, + { + "epoch": 2.6382978723404253, + "grad_norm": 0.4687299132347107, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 1550 + }, + { + "epoch": 2.65531914893617, + "grad_norm": 0.5536078810691833, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 1560 + }, + { + "epoch": 2.6723404255319148, + "grad_norm": 0.581320583820343, + "learning_rate": 0.0002, + "loss": 1.586, + "step": 1570 + }, + { + "epoch": 2.6893617021276595, + "grad_norm": 0.45952868461608887, + "learning_rate": 0.0002, + "loss": 1.508, + "step": 1580 + }, + { + "epoch": 2.706382978723404, + "grad_norm": 0.4602586328983307, + "learning_rate": 0.0002, + "loss": 1.5905, + "step": 1590 + }, + { + "epoch": 2.723404255319149, + "grad_norm": 0.5276554226875305, + "learning_rate": 0.0002, + "loss": 1.6008, + "step": 1600 + }, + { + "epoch": 2.7404255319148936, + "grad_norm": 0.5750249624252319, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 1610 + }, + { + "epoch": 2.7574468085106383, + "grad_norm": 0.468723863363266, + "learning_rate": 0.0002, + "loss": 1.5506, + "step": 1620 + }, + { + "epoch": 2.774468085106383, + "grad_norm": 0.44649943709373474, + "learning_rate": 0.0002, + "loss": 1.4499, + "step": 1630 + }, + { + "epoch": 2.7914893617021277, + "grad_norm": 0.5097237825393677, + "learning_rate": 0.0002, + "loss": 1.5106, + "step": 1640 + }, + { + "epoch": 2.8085106382978724, + "grad_norm": 0.46384191513061523, + "learning_rate": 0.0002, + "loss": 1.5948, + "step": 1650 + }, + { + "epoch": 2.825531914893617, + "grad_norm": 0.4885474443435669, + "learning_rate": 0.0002, + "loss": 1.5828, + "step": 1660 + }, + { + "epoch": 2.842553191489362, + "grad_norm": 0.45621681213378906, + "learning_rate": 0.0002, + "loss": 1.5675, + "step": 1670 + }, + { + "epoch": 2.8595744680851065, + "grad_norm": 0.4797150194644928, + "learning_rate": 0.0002, + "loss": 1.6042, + "step": 1680 + }, + { + "epoch": 2.876595744680851, + "grad_norm": 0.5142032504081726, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 1690 + }, + { + "epoch": 2.8936170212765955, + "grad_norm": 0.48939862847328186, + "learning_rate": 0.0002, + "loss": 1.5984, + "step": 1700 + }, + { + "epoch": 2.9106382978723406, + "grad_norm": 0.4575578272342682, + "learning_rate": 0.0002, + "loss": 1.6333, + "step": 1710 + }, + { + "epoch": 2.927659574468085, + "grad_norm": 0.5589063763618469, + "learning_rate": 0.0002, + "loss": 1.5396, + "step": 1720 + }, + { + "epoch": 2.94468085106383, + "grad_norm": 0.48508813977241516, + "learning_rate": 0.0002, + "loss": 1.6096, + "step": 1730 + }, + { + "epoch": 2.9617021276595743, + "grad_norm": 0.42786726355552673, + "learning_rate": 0.0002, + "loss": 1.5686, + "step": 1740 + }, + { + "epoch": 2.978723404255319, + "grad_norm": 0.5598229765892029, + "learning_rate": 0.0002, + "loss": 1.5555, + "step": 1750 + }, + { + "epoch": 2.9957446808510637, + "grad_norm": 0.4779253602027893, + "learning_rate": 0.0002, + "loss": 1.5035, + "step": 1760 + }, + { + "epoch": 2.999148936170213, + "eval_loss": 1.8543579578399658, + "eval_runtime": 107.2363, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 0.606, + "step": 1762 + }, + { + "epoch": 3.0127659574468084, + "grad_norm": 0.48810940980911255, + "learning_rate": 0.0002, + "loss": 1.4767, + "step": 1770 + }, + { + "epoch": 3.029787234042553, + "grad_norm": 0.6194920539855957, + "learning_rate": 0.0002, + "loss": 1.5385, + "step": 1780 + }, + { + "epoch": 3.046808510638298, + "grad_norm": 0.5875462293624878, + "learning_rate": 0.0002, + "loss": 1.4012, + "step": 1790 + }, + { + "epoch": 3.0638297872340425, + "grad_norm": 0.5775138139724731, + "learning_rate": 0.0002, + "loss": 1.4727, + "step": 1800 + }, + { + "epoch": 3.0808510638297872, + "grad_norm": 0.5445981621742249, + "learning_rate": 0.0002, + "loss": 1.493, + "step": 1810 + }, + { + "epoch": 3.097872340425532, + "grad_norm": 0.6728862524032593, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 1820 + }, + { + "epoch": 3.1148936170212767, + "grad_norm": 0.6105490326881409, + "learning_rate": 0.0002, + "loss": 1.4303, + "step": 1830 + }, + { + "epoch": 3.1319148936170214, + "grad_norm": 0.5771165490150452, + "learning_rate": 0.0002, + "loss": 1.5214, + "step": 1840 + }, + { + "epoch": 3.148936170212766, + "grad_norm": 0.5778449773788452, + "learning_rate": 0.0002, + "loss": 1.4359, + "step": 1850 + }, + { + "epoch": 3.1659574468085108, + "grad_norm": 0.7141990661621094, + "learning_rate": 0.0002, + "loss": 1.4121, + "step": 1860 + }, + { + "epoch": 3.1829787234042555, + "grad_norm": 0.5882705450057983, + "learning_rate": 0.0002, + "loss": 1.4904, + "step": 1870 + }, + { + "epoch": 3.2, + "grad_norm": 0.5996195077896118, + "learning_rate": 0.0002, + "loss": 1.4941, + "step": 1880 + }, + { + "epoch": 3.217021276595745, + "grad_norm": 0.6121219396591187, + "learning_rate": 0.0002, + "loss": 1.4519, + "step": 1890 + }, + { + "epoch": 3.2340425531914896, + "grad_norm": 0.6402981281280518, + "learning_rate": 0.0002, + "loss": 1.4586, + "step": 1900 + }, + { + "epoch": 3.251063829787234, + "grad_norm": 0.6111783981323242, + "learning_rate": 0.0002, + "loss": 1.3766, + "step": 1910 + }, + { + "epoch": 3.2680851063829786, + "grad_norm": 0.6682435274124146, + "learning_rate": 0.0002, + "loss": 1.4863, + "step": 1920 + }, + { + "epoch": 3.2851063829787233, + "grad_norm": 0.6530760526657104, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 1930 + }, + { + "epoch": 3.302127659574468, + "grad_norm": 0.6481217741966248, + "learning_rate": 0.0002, + "loss": 1.4422, + "step": 1940 + }, + { + "epoch": 3.3191489361702127, + "grad_norm": 0.6270697116851807, + "learning_rate": 0.0002, + "loss": 1.5158, + "step": 1950 + }, + { + "epoch": 3.3361702127659574, + "grad_norm": 0.5924492478370667, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 1960 + }, + { + "epoch": 3.353191489361702, + "grad_norm": 0.5803806781768799, + "learning_rate": 0.0002, + "loss": 1.4578, + "step": 1970 + }, + { + "epoch": 3.370212765957447, + "grad_norm": 0.5754119157791138, + "learning_rate": 0.0002, + "loss": 1.4689, + "step": 1980 + }, + { + "epoch": 3.3872340425531915, + "grad_norm": 0.6717178821563721, + "learning_rate": 0.0002, + "loss": 1.4605, + "step": 1990 + }, + { + "epoch": 3.404255319148936, + "grad_norm": 0.5955582857131958, + "learning_rate": 0.0002, + "loss": 1.486, + "step": 2000 + }, + { + "epoch": 3.421276595744681, + "grad_norm": 0.6965329647064209, + "learning_rate": 0.0002, + "loss": 1.4445, + "step": 2010 + }, + { + "epoch": 3.4382978723404256, + "grad_norm": 0.6321573257446289, + "learning_rate": 0.0002, + "loss": 1.4543, + "step": 2020 + }, + { + "epoch": 3.4553191489361703, + "grad_norm": 0.5952608585357666, + "learning_rate": 0.0002, + "loss": 1.5383, + "step": 2030 + }, + { + "epoch": 3.472340425531915, + "grad_norm": 0.7718905806541443, + "learning_rate": 0.0002, + "loss": 1.4531, + "step": 2040 + }, + { + "epoch": 3.4893617021276597, + "grad_norm": 0.6850892305374146, + "learning_rate": 0.0002, + "loss": 1.4678, + "step": 2050 + }, + { + "epoch": 3.506382978723404, + "grad_norm": 0.5638895630836487, + "learning_rate": 0.0002, + "loss": 1.4956, + "step": 2060 + }, + { + "epoch": 3.523404255319149, + "grad_norm": 0.6148294806480408, + "learning_rate": 0.0002, + "loss": 1.4586, + "step": 2070 + }, + { + "epoch": 3.5404255319148934, + "grad_norm": 0.5895810723304749, + "learning_rate": 0.0002, + "loss": 1.4622, + "step": 2080 + }, + { + "epoch": 3.5574468085106385, + "grad_norm": 0.6377319693565369, + "learning_rate": 0.0002, + "loss": 1.4341, + "step": 2090 + }, + { + "epoch": 3.574468085106383, + "grad_norm": 0.6047691702842712, + "learning_rate": 0.0002, + "loss": 1.5056, + "step": 2100 + }, + { + "epoch": 3.5914893617021275, + "grad_norm": 0.6049593687057495, + "learning_rate": 0.0002, + "loss": 1.4748, + "step": 2110 + }, + { + "epoch": 3.608510638297872, + "grad_norm": 0.6358312368392944, + "learning_rate": 0.0002, + "loss": 1.391, + "step": 2120 + }, + { + "epoch": 3.625531914893617, + "grad_norm": 0.612119197845459, + "learning_rate": 0.0002, + "loss": 1.4419, + "step": 2130 + }, + { + "epoch": 3.6425531914893616, + "grad_norm": 0.6788054704666138, + "learning_rate": 0.0002, + "loss": 1.438, + "step": 2140 + }, + { + "epoch": 3.6595744680851063, + "grad_norm": 0.6191043853759766, + "learning_rate": 0.0002, + "loss": 1.4295, + "step": 2150 + }, + { + "epoch": 3.676595744680851, + "grad_norm": 0.6660051941871643, + "learning_rate": 0.0002, + "loss": 1.4383, + "step": 2160 + }, + { + "epoch": 3.6936170212765957, + "grad_norm": 0.652692973613739, + "learning_rate": 0.0002, + "loss": 1.4954, + "step": 2170 + }, + { + "epoch": 3.7106382978723405, + "grad_norm": 0.6123467087745667, + "learning_rate": 0.0002, + "loss": 1.5245, + "step": 2180 + }, + { + "epoch": 3.727659574468085, + "grad_norm": 0.640021562576294, + "learning_rate": 0.0002, + "loss": 1.4686, + "step": 2190 + }, + { + "epoch": 3.74468085106383, + "grad_norm": 0.6809179782867432, + "learning_rate": 0.0002, + "loss": 1.4277, + "step": 2200 + }, + { + "epoch": 3.7617021276595746, + "grad_norm": 0.5978420376777649, + "learning_rate": 0.0002, + "loss": 1.4705, + "step": 2210 + }, + { + "epoch": 3.7787234042553193, + "grad_norm": 0.7038803100585938, + "learning_rate": 0.0002, + "loss": 1.5559, + "step": 2220 + }, + { + "epoch": 3.795744680851064, + "grad_norm": 0.5324276089668274, + "learning_rate": 0.0002, + "loss": 1.4691, + "step": 2230 + }, + { + "epoch": 3.8127659574468087, + "grad_norm": 0.6264132857322693, + "learning_rate": 0.0002, + "loss": 1.4696, + "step": 2240 + }, + { + "epoch": 3.829787234042553, + "grad_norm": 0.6143888831138611, + "learning_rate": 0.0002, + "loss": 1.4856, + "step": 2250 + }, + { + "epoch": 3.846808510638298, + "grad_norm": 0.6338503360748291, + "learning_rate": 0.0002, + "loss": 1.535, + "step": 2260 + }, + { + "epoch": 3.8638297872340424, + "grad_norm": 0.556882381439209, + "learning_rate": 0.0002, + "loss": 1.456, + "step": 2270 + }, + { + "epoch": 3.8808510638297875, + "grad_norm": 0.6323680281639099, + "learning_rate": 0.0002, + "loss": 1.4701, + "step": 2280 + }, + { + "epoch": 3.8978723404255318, + "grad_norm": 0.7105869054794312, + "learning_rate": 0.0002, + "loss": 1.5333, + "step": 2290 + }, + { + "epoch": 3.9148936170212765, + "grad_norm": 0.825415849685669, + "learning_rate": 0.0002, + "loss": 1.4462, + "step": 2300 + }, + { + "epoch": 3.931914893617021, + "grad_norm": 0.6412091851234436, + "learning_rate": 0.0002, + "loss": 1.5023, + "step": 2310 + }, + { + "epoch": 3.948936170212766, + "grad_norm": 0.6286490559577942, + "learning_rate": 0.0002, + "loss": 1.3709, + "step": 2320 + }, + { + "epoch": 3.9659574468085106, + "grad_norm": 0.636021077632904, + "learning_rate": 0.0002, + "loss": 1.4693, + "step": 2330 + }, + { + "epoch": 3.9829787234042553, + "grad_norm": 0.6032362580299377, + "learning_rate": 0.0002, + "loss": 1.4265, + "step": 2340 + }, + { + "epoch": 4.0, + "grad_norm": 0.6497282385826111, + "learning_rate": 0.0002, + "loss": 1.377, + "step": 2350 + }, + { + "epoch": 4.0, + "eval_loss": 1.9081238508224487, + "eval_runtime": 106.6404, + "eval_samples_per_second": 4.829, + "eval_steps_per_second": 0.61, + "step": 2350 + }, + { + "epoch": 4.017021276595744, + "grad_norm": 0.6278848648071289, + "learning_rate": 0.0002, + "loss": 1.317, + "step": 2360 + }, + { + "epoch": 4.034042553191489, + "grad_norm": 0.8259812593460083, + "learning_rate": 0.0002, + "loss": 1.3229, + "step": 2370 + }, + { + "epoch": 4.051063829787234, + "grad_norm": 0.7269589304924011, + "learning_rate": 0.0002, + "loss": 1.2776, + "step": 2380 + }, + { + "epoch": 4.068085106382979, + "grad_norm": 0.7460662126541138, + "learning_rate": 0.0002, + "loss": 1.3668, + "step": 2390 + }, + { + "epoch": 4.085106382978723, + "grad_norm": 1.2362046241760254, + "learning_rate": 0.0002, + "loss": 1.3096, + "step": 2400 + }, + { + "epoch": 4.102127659574468, + "grad_norm": 0.7699568867683411, + "learning_rate": 0.0002, + "loss": 1.2906, + "step": 2410 + }, + { + "epoch": 4.1191489361702125, + "grad_norm": 0.8732489347457886, + "learning_rate": 0.0002, + "loss": 1.3005, + "step": 2420 + }, + { + "epoch": 4.136170212765958, + "grad_norm": 0.8331889510154724, + "learning_rate": 0.0002, + "loss": 1.2741, + "step": 2430 + }, + { + "epoch": 4.153191489361702, + "grad_norm": 0.6686427593231201, + "learning_rate": 0.0002, + "loss": 1.1861, + "step": 2440 + }, + { + "epoch": 4.170212765957447, + "grad_norm": 0.906380832195282, + "learning_rate": 0.0002, + "loss": 1.316, + "step": 2450 + }, + { + "epoch": 4.187234042553191, + "grad_norm": 0.7269753813743591, + "learning_rate": 0.0002, + "loss": 1.3134, + "step": 2460 + }, + { + "epoch": 4.2042553191489365, + "grad_norm": 0.8556067943572998, + "learning_rate": 0.0002, + "loss": 1.299, + "step": 2470 + }, + { + "epoch": 4.221276595744681, + "grad_norm": 0.7076917886734009, + "learning_rate": 0.0002, + "loss": 1.2935, + "step": 2480 + }, + { + "epoch": 4.238297872340426, + "grad_norm": 0.7596837282180786, + "learning_rate": 0.0002, + "loss": 1.2608, + "step": 2490 + }, + { + "epoch": 4.25531914893617, + "grad_norm": 0.7790552377700806, + "learning_rate": 0.0002, + "loss": 1.2747, + "step": 2500 + }, + { + "epoch": 4.272340425531915, + "grad_norm": 0.8205534219741821, + "learning_rate": 0.0002, + "loss": 1.3438, + "step": 2510 + }, + { + "epoch": 4.2893617021276595, + "grad_norm": 0.7892114520072937, + "learning_rate": 0.0002, + "loss": 1.3058, + "step": 2520 + }, + { + "epoch": 4.306382978723404, + "grad_norm": 0.8907270431518555, + "learning_rate": 0.0002, + "loss": 1.3662, + "step": 2530 + }, + { + "epoch": 4.323404255319149, + "grad_norm": 0.821794331073761, + "learning_rate": 0.0002, + "loss": 1.3168, + "step": 2540 + }, + { + "epoch": 4.340425531914893, + "grad_norm": 0.7305247783660889, + "learning_rate": 0.0002, + "loss": 1.2467, + "step": 2550 + }, + { + "epoch": 4.357446808510638, + "grad_norm": 0.8639982342720032, + "learning_rate": 0.0002, + "loss": 1.3446, + "step": 2560 + }, + { + "epoch": 4.374468085106383, + "grad_norm": 0.8883494138717651, + "learning_rate": 0.0002, + "loss": 1.3863, + "step": 2570 + }, + { + "epoch": 4.391489361702128, + "grad_norm": 0.7611730098724365, + "learning_rate": 0.0002, + "loss": 1.3693, + "step": 2580 + }, + { + "epoch": 4.408510638297872, + "grad_norm": 0.7793022394180298, + "learning_rate": 0.0002, + "loss": 1.2814, + "step": 2590 + }, + { + "epoch": 4.425531914893617, + "grad_norm": 0.979060173034668, + "learning_rate": 0.0002, + "loss": 1.3014, + "step": 2600 + }, + { + "epoch": 4.4425531914893615, + "grad_norm": 0.8320847749710083, + "learning_rate": 0.0002, + "loss": 1.3625, + "step": 2610 + }, + { + "epoch": 4.459574468085107, + "grad_norm": 0.7481992244720459, + "learning_rate": 0.0002, + "loss": 1.3362, + "step": 2620 + }, + { + "epoch": 4.476595744680851, + "grad_norm": 0.783770740032196, + "learning_rate": 0.0002, + "loss": 1.4037, + "step": 2630 + }, + { + "epoch": 4.493617021276596, + "grad_norm": 0.773295521736145, + "learning_rate": 0.0002, + "loss": 1.3049, + "step": 2640 + }, + { + "epoch": 4.51063829787234, + "grad_norm": 0.9206840991973877, + "learning_rate": 0.0002, + "loss": 1.2739, + "step": 2650 + }, + { + "epoch": 4.527659574468085, + "grad_norm": 0.8803266882896423, + "learning_rate": 0.0002, + "loss": 1.3248, + "step": 2660 + }, + { + "epoch": 4.54468085106383, + "grad_norm": 0.9315535426139832, + "learning_rate": 0.0002, + "loss": 1.2898, + "step": 2670 + }, + { + "epoch": 4.561702127659575, + "grad_norm": 0.8610678315162659, + "learning_rate": 0.0002, + "loss": 1.316, + "step": 2680 + }, + { + "epoch": 4.578723404255319, + "grad_norm": 0.7405551671981812, + "learning_rate": 0.0002, + "loss": 1.2633, + "step": 2690 + }, + { + "epoch": 4.595744680851064, + "grad_norm": 1.0238394737243652, + "learning_rate": 0.0002, + "loss": 1.3136, + "step": 2700 + }, + { + "epoch": 4.6127659574468085, + "grad_norm": 0.7814345955848694, + "learning_rate": 0.0002, + "loss": 1.4847, + "step": 2710 + }, + { + "epoch": 4.629787234042553, + "grad_norm": 0.8436329364776611, + "learning_rate": 0.0002, + "loss": 1.295, + "step": 2720 + }, + { + "epoch": 4.646808510638298, + "grad_norm": 0.727214515209198, + "learning_rate": 0.0002, + "loss": 1.3525, + "step": 2730 + }, + { + "epoch": 4.663829787234042, + "grad_norm": 0.8465878367424011, + "learning_rate": 0.0002, + "loss": 1.3878, + "step": 2740 + }, + { + "epoch": 4.680851063829787, + "grad_norm": 0.8218137621879578, + "learning_rate": 0.0002, + "loss": 1.278, + "step": 2750 + }, + { + "epoch": 4.697872340425532, + "grad_norm": 0.7900442481040955, + "learning_rate": 0.0002, + "loss": 1.3628, + "step": 2760 + }, + { + "epoch": 4.714893617021277, + "grad_norm": 0.8214074969291687, + "learning_rate": 0.0002, + "loss": 1.3494, + "step": 2770 + }, + { + "epoch": 4.731914893617021, + "grad_norm": 0.7509574890136719, + "learning_rate": 0.0002, + "loss": 1.3954, + "step": 2780 + }, + { + "epoch": 4.748936170212766, + "grad_norm": 0.7416139245033264, + "learning_rate": 0.0002, + "loss": 1.3693, + "step": 2790 + }, + { + "epoch": 4.76595744680851, + "grad_norm": 0.8629977107048035, + "learning_rate": 0.0002, + "loss": 1.3045, + "step": 2800 + }, + { + "epoch": 4.782978723404256, + "grad_norm": 0.8056505918502808, + "learning_rate": 0.0002, + "loss": 1.3164, + "step": 2810 + }, + { + "epoch": 4.8, + "grad_norm": 0.7705401182174683, + "learning_rate": 0.0002, + "loss": 1.3056, + "step": 2820 + }, + { + "epoch": 4.817021276595745, + "grad_norm": 1.0173288583755493, + "learning_rate": 0.0002, + "loss": 1.3771, + "step": 2830 + }, + { + "epoch": 4.834042553191489, + "grad_norm": 0.8375823497772217, + "learning_rate": 0.0002, + "loss": 1.3494, + "step": 2840 + }, + { + "epoch": 4.851063829787234, + "grad_norm": 0.857073187828064, + "learning_rate": 0.0002, + "loss": 1.3238, + "step": 2850 + }, + { + "epoch": 4.868085106382979, + "grad_norm": 0.8672189712524414, + "learning_rate": 0.0002, + "loss": 1.2964, + "step": 2860 + }, + { + "epoch": 4.885106382978723, + "grad_norm": 0.8599910140037537, + "learning_rate": 0.0002, + "loss": 1.3646, + "step": 2870 + }, + { + "epoch": 4.902127659574468, + "grad_norm": 0.8844674229621887, + "learning_rate": 0.0002, + "loss": 1.3575, + "step": 2880 + }, + { + "epoch": 4.919148936170213, + "grad_norm": 0.8246751427650452, + "learning_rate": 0.0002, + "loss": 1.285, + "step": 2890 + }, + { + "epoch": 4.9361702127659575, + "grad_norm": 0.8648163676261902, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 2900 + }, + { + "epoch": 4.953191489361702, + "grad_norm": 0.9477900266647339, + "learning_rate": 0.0002, + "loss": 1.2614, + "step": 2910 + }, + { + "epoch": 4.970212765957447, + "grad_norm": 0.8047965168952942, + "learning_rate": 0.0002, + "loss": 1.3519, + "step": 2920 + }, + { + "epoch": 4.987234042553191, + "grad_norm": 0.9872494339942932, + "learning_rate": 0.0002, + "loss": 1.3889, + "step": 2930 + }, + { + "epoch": 4.999148936170212, + "eval_loss": 1.9836769104003906, + "eval_runtime": 106.4655, + "eval_samples_per_second": 4.837, + "eval_steps_per_second": 0.611, + "step": 2937 + }, + { + "epoch": 5.004255319148936, + "grad_norm": 0.7292938828468323, + "learning_rate": 0.0002, + "loss": 1.2574, + "step": 2940 + }, + { + "epoch": 5.0212765957446805, + "grad_norm": 0.8610548973083496, + "learning_rate": 0.0002, + "loss": 1.1312, + "step": 2950 + }, + { + "epoch": 5.038297872340426, + "grad_norm": 0.8384576439857483, + "learning_rate": 0.0002, + "loss": 1.1105, + "step": 2960 + }, + { + "epoch": 5.05531914893617, + "grad_norm": 0.9746620059013367, + "learning_rate": 0.0002, + "loss": 1.1412, + "step": 2970 + }, + { + "epoch": 5.072340425531915, + "grad_norm": 0.8879048228263855, + "learning_rate": 0.0002, + "loss": 1.1687, + "step": 2980 + }, + { + "epoch": 5.089361702127659, + "grad_norm": 0.9006168246269226, + "learning_rate": 0.0002, + "loss": 1.1333, + "step": 2990 + }, + { + "epoch": 5.1063829787234045, + "grad_norm": 0.9770249128341675, + "learning_rate": 0.0002, + "loss": 1.1659, + "step": 3000 + }, + { + "epoch": 5.123404255319149, + "grad_norm": 1.267967939376831, + "learning_rate": 0.0002, + "loss": 1.1334, + "step": 3010 + }, + { + "epoch": 5.140425531914894, + "grad_norm": 0.9857587218284607, + "learning_rate": 0.0002, + "loss": 1.2095, + "step": 3020 + }, + { + "epoch": 5.157446808510638, + "grad_norm": 1.2938690185546875, + "learning_rate": 0.0002, + "loss": 1.0889, + "step": 3030 + }, + { + "epoch": 5.174468085106383, + "grad_norm": 0.8928244113922119, + "learning_rate": 0.0002, + "loss": 1.1645, + "step": 3040 + }, + { + "epoch": 5.191489361702128, + "grad_norm": 1.1087630987167358, + "learning_rate": 0.0002, + "loss": 1.1553, + "step": 3050 + }, + { + "epoch": 5.208510638297873, + "grad_norm": 0.9431360960006714, + "learning_rate": 0.0002, + "loss": 1.1416, + "step": 3060 + }, + { + "epoch": 5.225531914893617, + "grad_norm": 1.2048338651657104, + "learning_rate": 0.0002, + "loss": 1.1635, + "step": 3070 + }, + { + "epoch": 5.242553191489361, + "grad_norm": 1.0017054080963135, + "learning_rate": 0.0002, + "loss": 1.171, + "step": 3080 + }, + { + "epoch": 5.259574468085106, + "grad_norm": 1.2771434783935547, + "learning_rate": 0.0002, + "loss": 1.2212, + "step": 3090 + }, + { + "epoch": 5.276595744680851, + "grad_norm": 1.4307383298873901, + "learning_rate": 0.0002, + "loss": 1.1478, + "step": 3100 + }, + { + "epoch": 5.293617021276596, + "grad_norm": 1.2460752725601196, + "learning_rate": 0.0002, + "loss": 1.2132, + "step": 3110 + }, + { + "epoch": 5.31063829787234, + "grad_norm": 1.693974494934082, + "learning_rate": 0.0002, + "loss": 1.235, + "step": 3120 + }, + { + "epoch": 5.327659574468085, + "grad_norm": 0.9855408668518066, + "learning_rate": 0.0002, + "loss": 1.1961, + "step": 3130 + }, + { + "epoch": 5.3446808510638295, + "grad_norm": 1.307521104812622, + "learning_rate": 0.0002, + "loss": 1.2068, + "step": 3140 + }, + { + "epoch": 5.361702127659575, + "grad_norm": 0.957661509513855, + "learning_rate": 0.0002, + "loss": 1.2144, + "step": 3150 + }, + { + "epoch": 5.378723404255319, + "grad_norm": 0.870373010635376, + "learning_rate": 0.0002, + "loss": 1.1305, + "step": 3160 + }, + { + "epoch": 5.395744680851064, + "grad_norm": 0.9324309229850769, + "learning_rate": 0.0002, + "loss": 1.2196, + "step": 3170 + }, + { + "epoch": 5.412765957446808, + "grad_norm": 1.0142403841018677, + "learning_rate": 0.0002, + "loss": 1.1691, + "step": 3180 + }, + { + "epoch": 5.4297872340425535, + "grad_norm": 0.9759578704833984, + "learning_rate": 0.0002, + "loss": 1.1788, + "step": 3190 + }, + { + "epoch": 5.446808510638298, + "grad_norm": 0.9021993279457092, + "learning_rate": 0.0002, + "loss": 1.1321, + "step": 3200 + }, + { + "epoch": 5.463829787234043, + "grad_norm": 1.007728934288025, + "learning_rate": 0.0002, + "loss": 1.2222, + "step": 3210 + }, + { + "epoch": 5.480851063829787, + "grad_norm": 0.8969265222549438, + "learning_rate": 0.0002, + "loss": 1.1517, + "step": 3220 + }, + { + "epoch": 5.497872340425532, + "grad_norm": 0.9672483801841736, + "learning_rate": 0.0002, + "loss": 1.2061, + "step": 3230 + }, + { + "epoch": 5.514893617021277, + "grad_norm": 1.1417138576507568, + "learning_rate": 0.0002, + "loss": 1.1454, + "step": 3240 + }, + { + "epoch": 5.531914893617021, + "grad_norm": 0.9669530391693115, + "learning_rate": 0.0002, + "loss": 1.1871, + "step": 3250 + }, + { + "epoch": 5.548936170212766, + "grad_norm": 1.0161820650100708, + "learning_rate": 0.0002, + "loss": 1.1382, + "step": 3260 + }, + { + "epoch": 5.565957446808511, + "grad_norm": 0.9935774803161621, + "learning_rate": 0.0002, + "loss": 1.1708, + "step": 3270 + }, + { + "epoch": 5.582978723404255, + "grad_norm": 1.2572048902511597, + "learning_rate": 0.0002, + "loss": 1.1384, + "step": 3280 + }, + { + "epoch": 5.6, + "grad_norm": 0.9614662528038025, + "learning_rate": 0.0002, + "loss": 1.1711, + "step": 3290 + }, + { + "epoch": 5.617021276595745, + "grad_norm": 0.9835584163665771, + "learning_rate": 0.0002, + "loss": 1.219, + "step": 3300 + }, + { + "epoch": 5.634042553191489, + "grad_norm": 0.9387389421463013, + "learning_rate": 0.0002, + "loss": 1.2074, + "step": 3310 + }, + { + "epoch": 5.651063829787234, + "grad_norm": 0.9348428249359131, + "learning_rate": 0.0002, + "loss": 1.1148, + "step": 3320 + }, + { + "epoch": 5.6680851063829785, + "grad_norm": 0.9636440873146057, + "learning_rate": 0.0002, + "loss": 1.2378, + "step": 3330 + }, + { + "epoch": 5.685106382978724, + "grad_norm": 0.995894193649292, + "learning_rate": 0.0002, + "loss": 1.2068, + "step": 3340 + }, + { + "epoch": 5.702127659574468, + "grad_norm": 1.0357023477554321, + "learning_rate": 0.0002, + "loss": 1.1443, + "step": 3350 + }, + { + "epoch": 5.719148936170213, + "grad_norm": 1.0254428386688232, + "learning_rate": 0.0002, + "loss": 1.2209, + "step": 3360 + }, + { + "epoch": 5.736170212765957, + "grad_norm": 0.8993342518806458, + "learning_rate": 0.0002, + "loss": 1.1987, + "step": 3370 + }, + { + "epoch": 5.753191489361702, + "grad_norm": 0.9104585647583008, + "learning_rate": 0.0002, + "loss": 1.1527, + "step": 3380 + }, + { + "epoch": 5.770212765957447, + "grad_norm": 0.9555654525756836, + "learning_rate": 0.0002, + "loss": 1.2268, + "step": 3390 + }, + { + "epoch": 5.787234042553192, + "grad_norm": 0.920124351978302, + "learning_rate": 0.0002, + "loss": 1.193, + "step": 3400 + }, + { + "epoch": 5.804255319148936, + "grad_norm": 0.999706506729126, + "learning_rate": 0.0002, + "loss": 1.2263, + "step": 3410 + }, + { + "epoch": 5.821276595744681, + "grad_norm": 0.9292707443237305, + "learning_rate": 0.0002, + "loss": 1.1411, + "step": 3420 + }, + { + "epoch": 5.8382978723404255, + "grad_norm": 1.0074706077575684, + "learning_rate": 0.0002, + "loss": 1.1507, + "step": 3430 + }, + { + "epoch": 5.85531914893617, + "grad_norm": 1.0279479026794434, + "learning_rate": 0.0002, + "loss": 1.2709, + "step": 3440 + }, + { + "epoch": 5.872340425531915, + "grad_norm": 1.0026037693023682, + "learning_rate": 0.0002, + "loss": 1.1992, + "step": 3450 + }, + { + "epoch": 5.889361702127659, + "grad_norm": 1.0356525182724, + "learning_rate": 0.0002, + "loss": 1.1416, + "step": 3460 + }, + { + "epoch": 5.906382978723404, + "grad_norm": 1.1106643676757812, + "learning_rate": 0.0002, + "loss": 1.224, + "step": 3470 + }, + { + "epoch": 5.923404255319149, + "grad_norm": 0.9578408002853394, + "learning_rate": 0.0002, + "loss": 1.1955, + "step": 3480 + }, + { + "epoch": 5.940425531914894, + "grad_norm": 1.0225932598114014, + "learning_rate": 0.0002, + "loss": 1.2133, + "step": 3490 + }, + { + "epoch": 5.957446808510638, + "grad_norm": 0.9677667021751404, + "learning_rate": 0.0002, + "loss": 1.157, + "step": 3500 + }, + { + "epoch": 5.974468085106383, + "grad_norm": 1.0967241525650024, + "learning_rate": 0.0002, + "loss": 1.2196, + "step": 3510 + }, + { + "epoch": 5.991489361702127, + "grad_norm": 1.2497339248657227, + "learning_rate": 0.0002, + "loss": 1.1807, + "step": 3520 + }, + { + "epoch": 6.0, + "eval_loss": 2.0976572036743164, + "eval_runtime": 105.9679, + "eval_samples_per_second": 4.86, + "eval_steps_per_second": 0.613, + "step": 3525 + }, + { + "epoch": 6.008510638297873, + "grad_norm": 0.9660930037498474, + "learning_rate": 0.0002, + "loss": 1.0827, + "step": 3530 + }, + { + "epoch": 6.025531914893617, + "grad_norm": 0.9462300539016724, + "learning_rate": 0.0002, + "loss": 1.0043, + "step": 3540 + }, + { + "epoch": 6.042553191489362, + "grad_norm": 0.9312542676925659, + "learning_rate": 0.0002, + "loss": 1.0102, + "step": 3550 + }, + { + "epoch": 6.059574468085106, + "grad_norm": 1.3502222299575806, + "learning_rate": 0.0002, + "loss": 1.0356, + "step": 3560 + }, + { + "epoch": 6.076595744680851, + "grad_norm": 1.2838709354400635, + "learning_rate": 0.0002, + "loss": 0.9167, + "step": 3570 + }, + { + "epoch": 6.093617021276596, + "grad_norm": 1.1399385929107666, + "learning_rate": 0.0002, + "loss": 0.9381, + "step": 3580 + }, + { + "epoch": 6.110638297872341, + "grad_norm": 1.1763123273849487, + "learning_rate": 0.0002, + "loss": 0.9416, + "step": 3590 + }, + { + "epoch": 6.127659574468085, + "grad_norm": 1.113002061843872, + "learning_rate": 0.0002, + "loss": 0.9782, + "step": 3600 + }, + { + "epoch": 6.14468085106383, + "grad_norm": 1.0322953462600708, + "learning_rate": 0.0002, + "loss": 0.9521, + "step": 3610 + }, + { + "epoch": 6.1617021276595745, + "grad_norm": 1.2678894996643066, + "learning_rate": 0.0002, + "loss": 0.9114, + "step": 3620 + }, + { + "epoch": 6.178723404255319, + "grad_norm": 1.2370864152908325, + "learning_rate": 0.0002, + "loss": 0.9934, + "step": 3630 + }, + { + "epoch": 6.195744680851064, + "grad_norm": 1.1930763721466064, + "learning_rate": 0.0002, + "loss": 0.9753, + "step": 3640 + }, + { + "epoch": 6.212765957446808, + "grad_norm": 1.3608582019805908, + "learning_rate": 0.0002, + "loss": 0.9448, + "step": 3650 + }, + { + "epoch": 6.229787234042553, + "grad_norm": 1.2158547639846802, + "learning_rate": 0.0002, + "loss": 1.0201, + "step": 3660 + }, + { + "epoch": 6.246808510638298, + "grad_norm": 1.1505420207977295, + "learning_rate": 0.0002, + "loss": 0.9896, + "step": 3670 + }, + { + "epoch": 6.263829787234043, + "grad_norm": 1.3038114309310913, + "learning_rate": 0.0002, + "loss": 1.0088, + "step": 3680 + }, + { + "epoch": 6.280851063829787, + "grad_norm": 1.3900057077407837, + "learning_rate": 0.0002, + "loss": 1.0416, + "step": 3690 + }, + { + "epoch": 6.297872340425532, + "grad_norm": 1.196964144706726, + "learning_rate": 0.0002, + "loss": 0.9832, + "step": 3700 + }, + { + "epoch": 6.314893617021276, + "grad_norm": 1.205865740776062, + "learning_rate": 0.0002, + "loss": 1.0778, + "step": 3710 + }, + { + "epoch": 6.3319148936170215, + "grad_norm": 1.2710838317871094, + "learning_rate": 0.0002, + "loss": 1.0358, + "step": 3720 + }, + { + "epoch": 6.348936170212766, + "grad_norm": 1.285942554473877, + "learning_rate": 0.0002, + "loss": 1.0271, + "step": 3730 + }, + { + "epoch": 6.365957446808511, + "grad_norm": 1.1717636585235596, + "learning_rate": 0.0002, + "loss": 1.0164, + "step": 3740 + }, + { + "epoch": 6.382978723404255, + "grad_norm": 1.190883994102478, + "learning_rate": 0.0002, + "loss": 1.0557, + "step": 3750 + }, + { + "epoch": 6.4, + "grad_norm": 1.1623435020446777, + "learning_rate": 0.0002, + "loss": 1.0319, + "step": 3760 + }, + { + "epoch": 6.417021276595745, + "grad_norm": 1.2285547256469727, + "learning_rate": 0.0002, + "loss": 1.0633, + "step": 3770 + }, + { + "epoch": 6.43404255319149, + "grad_norm": 1.1142666339874268, + "learning_rate": 0.0002, + "loss": 1.0593, + "step": 3780 + }, + { + "epoch": 6.451063829787234, + "grad_norm": 1.333337664604187, + "learning_rate": 0.0002, + "loss": 1.0418, + "step": 3790 + }, + { + "epoch": 6.468085106382979, + "grad_norm": 1.350474238395691, + "learning_rate": 0.0002, + "loss": 1.0, + "step": 3800 + }, + { + "epoch": 6.485106382978723, + "grad_norm": 1.2439061403274536, + "learning_rate": 0.0002, + "loss": 1.1152, + "step": 3810 + }, + { + "epoch": 6.502127659574468, + "grad_norm": 1.2488664388656616, + "learning_rate": 0.0002, + "loss": 1.0915, + "step": 3820 + }, + { + "epoch": 6.519148936170213, + "grad_norm": 1.1990735530853271, + "learning_rate": 0.0002, + "loss": 1.0571, + "step": 3830 + }, + { + "epoch": 6.536170212765957, + "grad_norm": 1.5180301666259766, + "learning_rate": 0.0002, + "loss": 0.9895, + "step": 3840 + }, + { + "epoch": 6.553191489361702, + "grad_norm": 1.1273280382156372, + "learning_rate": 0.0002, + "loss": 0.9955, + "step": 3850 + }, + { + "epoch": 6.5702127659574465, + "grad_norm": 1.2778105735778809, + "learning_rate": 0.0002, + "loss": 1.0516, + "step": 3860 + }, + { + "epoch": 6.587234042553192, + "grad_norm": 1.1789685487747192, + "learning_rate": 0.0002, + "loss": 1.0039, + "step": 3870 + }, + { + "epoch": 6.604255319148936, + "grad_norm": 1.2061398029327393, + "learning_rate": 0.0002, + "loss": 1.0381, + "step": 3880 + }, + { + "epoch": 6.621276595744681, + "grad_norm": 1.104092001914978, + "learning_rate": 0.0002, + "loss": 1.0775, + "step": 3890 + }, + { + "epoch": 6.638297872340425, + "grad_norm": 1.2648544311523438, + "learning_rate": 0.0002, + "loss": 1.0591, + "step": 3900 + }, + { + "epoch": 6.6553191489361705, + "grad_norm": 1.2267687320709229, + "learning_rate": 0.0002, + "loss": 1.0535, + "step": 3910 + }, + { + "epoch": 6.672340425531915, + "grad_norm": 1.3252530097961426, + "learning_rate": 0.0002, + "loss": 1.0654, + "step": 3920 + }, + { + "epoch": 6.68936170212766, + "grad_norm": 1.284563660621643, + "learning_rate": 0.0002, + "loss": 1.0301, + "step": 3930 + }, + { + "epoch": 6.706382978723404, + "grad_norm": 1.293845534324646, + "learning_rate": 0.0002, + "loss": 1.102, + "step": 3940 + }, + { + "epoch": 6.723404255319149, + "grad_norm": 1.2290467023849487, + "learning_rate": 0.0002, + "loss": 1.1526, + "step": 3950 + }, + { + "epoch": 6.740425531914894, + "grad_norm": 1.1712737083435059, + "learning_rate": 0.0002, + "loss": 1.0474, + "step": 3960 + }, + { + "epoch": 6.757446808510638, + "grad_norm": 1.1728616952896118, + "learning_rate": 0.0002, + "loss": 1.0149, + "step": 3970 + }, + { + "epoch": 6.774468085106383, + "grad_norm": 1.154922604560852, + "learning_rate": 0.0002, + "loss": 1.0824, + "step": 3980 + }, + { + "epoch": 6.791489361702128, + "grad_norm": 1.4673690795898438, + "learning_rate": 0.0002, + "loss": 1.0961, + "step": 3990 + }, + { + "epoch": 6.808510638297872, + "grad_norm": 1.2338067293167114, + "learning_rate": 0.0002, + "loss": 0.9784, + "step": 4000 + }, + { + "epoch": 6.825531914893617, + "grad_norm": 1.0775316953659058, + "learning_rate": 0.0002, + "loss": 1.0975, + "step": 4010 + }, + { + "epoch": 6.842553191489362, + "grad_norm": 1.2518454790115356, + "learning_rate": 0.0002, + "loss": 1.0204, + "step": 4020 + }, + { + "epoch": 6.859574468085106, + "grad_norm": 1.3534432649612427, + "learning_rate": 0.0002, + "loss": 1.1425, + "step": 4030 + }, + { + "epoch": 6.876595744680851, + "grad_norm": 1.1217902898788452, + "learning_rate": 0.0002, + "loss": 1.1212, + "step": 4040 + }, + { + "epoch": 6.8936170212765955, + "grad_norm": 1.2672910690307617, + "learning_rate": 0.0002, + "loss": 1.0823, + "step": 4050 + }, + { + "epoch": 6.910638297872341, + "grad_norm": 1.3807674646377563, + "learning_rate": 0.0002, + "loss": 1.0817, + "step": 4060 + }, + { + "epoch": 6.927659574468085, + "grad_norm": 1.064530849456787, + "learning_rate": 0.0002, + "loss": 1.0576, + "step": 4070 + }, + { + "epoch": 6.94468085106383, + "grad_norm": 1.1286897659301758, + "learning_rate": 0.0002, + "loss": 1.0718, + "step": 4080 + }, + { + "epoch": 6.961702127659574, + "grad_norm": 1.3736463785171509, + "learning_rate": 0.0002, + "loss": 1.0574, + "step": 4090 + }, + { + "epoch": 6.9787234042553195, + "grad_norm": 1.3167431354522705, + "learning_rate": 0.0002, + "loss": 1.0621, + "step": 4100 + }, + { + "epoch": 6.995744680851064, + "grad_norm": 1.2784067392349243, + "learning_rate": 0.0002, + "loss": 0.9754, + "step": 4110 + }, + { + "epoch": 6.999148936170212, + "eval_loss": 2.260930299758911, + "eval_runtime": 106.0392, + "eval_samples_per_second": 4.857, + "eval_steps_per_second": 0.613, + "step": 4112 + } + ], + "logging_steps": 10, + "max_steps": 4696, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.903172917395456e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..40bc5c2205bcd1402ce337f3a218ef4840b3870a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4112/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:148f55f87c9f695bdf2cf6d54b37e690c06ff5da5e17bc3af6c1f44f90f45374 +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..36f3418fd92653c5bfbe46216e9a4aa2a5170f24 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e93223481785fa25d0a525488dabf358051dc151821695796004015038600a73 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e87889e3f167ee9ae328fa9504f812469ffaa28 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2b42701cb5222fb0915e60db8f749d08c9773a03511fdf0bec89333ae9801a8 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e672959575b952b90672989a204f834e1e36696a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44f37b79e9cf3ae9a8dbc8552b998f34a4ee300833552aaf83ce75e9622876d7 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c97413654a1ebcecbb748cc51e6361d31d098941 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35da6fe41d64ba89d40de63a966aab9f45d211dadcf513e2840f775048f0d92e +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..33ae2c89a45864378927bb9632d6486787136846 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/trainer_state.json @@ -0,0 +1,3380 @@ +{ + "best_metric": 1.8328146934509277, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175", + "epoch": 7.993191489361702, + "eval_steps": 10, + "global_step": 4696, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01702127659574468, + "grad_norm": 0.7596228122711182, + "learning_rate": 0.0002, + "loss": 2.5996, + "step": 10 + }, + { + "epoch": 0.03404255319148936, + "grad_norm": 0.4860903322696686, + "learning_rate": 0.0002, + "loss": 2.2941, + "step": 20 + }, + { + "epoch": 0.05106382978723404, + "grad_norm": 0.4953401982784271, + "learning_rate": 0.0002, + "loss": 2.0018, + "step": 30 + }, + { + "epoch": 0.06808510638297872, + "grad_norm": 0.5086901783943176, + "learning_rate": 0.0002, + "loss": 1.9318, + "step": 40 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 0.49050021171569824, + "learning_rate": 0.0002, + "loss": 1.8519, + "step": 50 + }, + { + "epoch": 0.10212765957446808, + "grad_norm": 0.4922358989715576, + "learning_rate": 0.0002, + "loss": 1.8786, + "step": 60 + }, + { + "epoch": 0.11914893617021277, + "grad_norm": 0.4621541202068329, + "learning_rate": 0.0002, + "loss": 1.8812, + "step": 70 + }, + { + "epoch": 0.13617021276595745, + "grad_norm": 0.4416729807853699, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 80 + }, + { + "epoch": 0.15319148936170213, + "grad_norm": 0.526258111000061, + "learning_rate": 0.0002, + "loss": 1.9298, + "step": 90 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 0.44022637605667114, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 100 + }, + { + "epoch": 0.18723404255319148, + "grad_norm": 0.4647711515426636, + "learning_rate": 0.0002, + "loss": 1.7681, + "step": 110 + }, + { + "epoch": 0.20425531914893616, + "grad_norm": 0.4136318564414978, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 120 + }, + { + "epoch": 0.22127659574468084, + "grad_norm": 0.39707672595977783, + "learning_rate": 0.0002, + "loss": 1.8491, + "step": 130 + }, + { + "epoch": 0.23829787234042554, + "grad_norm": 0.4478105306625366, + "learning_rate": 0.0002, + "loss": 1.8017, + "step": 140 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 0.4699741303920746, + "learning_rate": 0.0002, + "loss": 1.8067, + "step": 150 + }, + { + "epoch": 0.2723404255319149, + "grad_norm": 0.4568363130092621, + "learning_rate": 0.0002, + "loss": 1.8161, + "step": 160 + }, + { + "epoch": 0.28936170212765955, + "grad_norm": 0.45078757405281067, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 170 + }, + { + "epoch": 0.30638297872340425, + "grad_norm": 0.4127245843410492, + "learning_rate": 0.0002, + "loss": 1.8115, + "step": 180 + }, + { + "epoch": 0.32340425531914896, + "grad_norm": 0.4042493402957916, + "learning_rate": 0.0002, + "loss": 1.7707, + "step": 190 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 0.401487797498703, + "learning_rate": 0.0002, + "loss": 1.7807, + "step": 200 + }, + { + "epoch": 0.3574468085106383, + "grad_norm": 0.3959457576274872, + "learning_rate": 0.0002, + "loss": 1.8355, + "step": 210 + }, + { + "epoch": 0.37446808510638296, + "grad_norm": 0.39865636825561523, + "learning_rate": 0.0002, + "loss": 1.7798, + "step": 220 + }, + { + "epoch": 0.39148936170212767, + "grad_norm": 0.7225169539451599, + "learning_rate": 0.0002, + "loss": 1.9012, + "step": 230 + }, + { + "epoch": 0.4085106382978723, + "grad_norm": 0.412801593542099, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 240 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 0.40951448678970337, + "learning_rate": 0.0002, + "loss": 1.8523, + "step": 250 + }, + { + "epoch": 0.4425531914893617, + "grad_norm": 0.42788130044937134, + "learning_rate": 0.0002, + "loss": 1.7283, + "step": 260 + }, + { + "epoch": 0.4595744680851064, + "grad_norm": 0.41069576144218445, + "learning_rate": 0.0002, + "loss": 1.8051, + "step": 270 + }, + { + "epoch": 0.4765957446808511, + "grad_norm": 0.3745323717594147, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 280 + }, + { + "epoch": 0.49361702127659574, + "grad_norm": 0.3771323263645172, + "learning_rate": 0.0002, + "loss": 1.8484, + "step": 290 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 0.34368929266929626, + "learning_rate": 0.0002, + "loss": 1.7417, + "step": 300 + }, + { + "epoch": 0.5276595744680851, + "grad_norm": 0.4299296736717224, + "learning_rate": 0.0002, + "loss": 1.7394, + "step": 310 + }, + { + "epoch": 0.5446808510638298, + "grad_norm": 0.4133922755718231, + "learning_rate": 0.0002, + "loss": 1.8255, + "step": 320 + }, + { + "epoch": 0.5617021276595745, + "grad_norm": 0.3984859585762024, + "learning_rate": 0.0002, + "loss": 1.7043, + "step": 330 + }, + { + "epoch": 0.5787234042553191, + "grad_norm": 0.3822788894176483, + "learning_rate": 0.0002, + "loss": 1.7873, + "step": 340 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 0.4550061821937561, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 350 + }, + { + "epoch": 0.6127659574468085, + "grad_norm": 0.36571192741394043, + "learning_rate": 0.0002, + "loss": 1.7179, + "step": 360 + }, + { + "epoch": 0.6297872340425532, + "grad_norm": 0.32942914962768555, + "learning_rate": 0.0002, + "loss": 1.8196, + "step": 370 + }, + { + "epoch": 0.6468085106382979, + "grad_norm": 0.39299526810646057, + "learning_rate": 0.0002, + "loss": 1.7118, + "step": 380 + }, + { + "epoch": 0.6638297872340425, + "grad_norm": 0.3817657232284546, + "learning_rate": 0.0002, + "loss": 1.8179, + "step": 390 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 0.3650810122489929, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 400 + }, + { + "epoch": 0.6978723404255319, + "grad_norm": 0.3736686408519745, + "learning_rate": 0.0002, + "loss": 1.8026, + "step": 410 + }, + { + "epoch": 0.7148936170212766, + "grad_norm": 0.45680564641952515, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 420 + }, + { + "epoch": 0.7319148936170212, + "grad_norm": 0.4154510200023651, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 430 + }, + { + "epoch": 0.7489361702127659, + "grad_norm": 0.3701167106628418, + "learning_rate": 0.0002, + "loss": 1.7801, + "step": 440 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 0.3869531750679016, + "learning_rate": 0.0002, + "loss": 1.7689, + "step": 450 + }, + { + "epoch": 0.7829787234042553, + "grad_norm": 0.4391495883464813, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 460 + }, + { + "epoch": 0.8, + "grad_norm": 0.39652755856513977, + "learning_rate": 0.0002, + "loss": 1.7534, + "step": 470 + }, + { + "epoch": 0.8170212765957446, + "grad_norm": 0.4096752107143402, + "learning_rate": 0.0002, + "loss": 1.8076, + "step": 480 + }, + { + "epoch": 0.8340425531914893, + "grad_norm": 0.3857504427433014, + "learning_rate": 0.0002, + "loss": 1.796, + "step": 490 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 0.4105374217033386, + "learning_rate": 0.0002, + "loss": 1.7379, + "step": 500 + }, + { + "epoch": 0.8680851063829788, + "grad_norm": 0.3723328113555908, + "learning_rate": 0.0002, + "loss": 1.6391, + "step": 510 + }, + { + "epoch": 0.8851063829787233, + "grad_norm": 0.36099690198898315, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 520 + }, + { + "epoch": 0.902127659574468, + "grad_norm": 0.3715187907218933, + "learning_rate": 0.0002, + "loss": 1.7632, + "step": 530 + }, + { + "epoch": 0.9191489361702128, + "grad_norm": 0.4932813048362732, + "learning_rate": 0.0002, + "loss": 1.7004, + "step": 540 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 0.3493495285511017, + "learning_rate": 0.0002, + "loss": 1.679, + "step": 550 + }, + { + "epoch": 0.9531914893617022, + "grad_norm": 0.3598061800003052, + "learning_rate": 0.0002, + "loss": 1.7758, + "step": 560 + }, + { + "epoch": 0.9702127659574468, + "grad_norm": 0.3521560728549957, + "learning_rate": 0.0002, + "loss": 1.7686, + "step": 570 + }, + { + "epoch": 0.9872340425531915, + "grad_norm": 0.34150034189224243, + "learning_rate": 0.0002, + "loss": 1.7391, + "step": 580 + }, + { + "epoch": 0.9991489361702127, + "eval_loss": 1.8388911485671997, + "eval_runtime": 106.6788, + "eval_samples_per_second": 4.828, + "eval_steps_per_second": 0.609, + "step": 587 + }, + { + "epoch": 1.004255319148936, + "grad_norm": 0.33556103706359863, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 590 + }, + { + "epoch": 1.0212765957446808, + "grad_norm": 0.4333398640155792, + "learning_rate": 0.0002, + "loss": 1.8346, + "step": 600 + }, + { + "epoch": 1.0382978723404255, + "grad_norm": 0.38488736748695374, + "learning_rate": 0.0002, + "loss": 1.6656, + "step": 610 + }, + { + "epoch": 1.0553191489361702, + "grad_norm": 0.44454529881477356, + "learning_rate": 0.0002, + "loss": 1.7778, + "step": 620 + }, + { + "epoch": 1.0723404255319149, + "grad_norm": 0.3735603392124176, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 630 + }, + { + "epoch": 1.0893617021276596, + "grad_norm": 0.38912704586982727, + "learning_rate": 0.0002, + "loss": 1.74, + "step": 640 + }, + { + "epoch": 1.1063829787234043, + "grad_norm": 0.4411826431751251, + "learning_rate": 0.0002, + "loss": 1.607, + "step": 650 + }, + { + "epoch": 1.123404255319149, + "grad_norm": 0.4163050353527069, + "learning_rate": 0.0002, + "loss": 1.6901, + "step": 660 + }, + { + "epoch": 1.1404255319148937, + "grad_norm": 0.4187192916870117, + "learning_rate": 0.0002, + "loss": 1.6722, + "step": 670 + }, + { + "epoch": 1.1574468085106382, + "grad_norm": 0.3797093629837036, + "learning_rate": 0.0002, + "loss": 1.7061, + "step": 680 + }, + { + "epoch": 1.174468085106383, + "grad_norm": 0.4210026264190674, + "learning_rate": 0.0002, + "loss": 1.6409, + "step": 690 + }, + { + "epoch": 1.1914893617021276, + "grad_norm": 0.4701998829841614, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 700 + }, + { + "epoch": 1.2085106382978723, + "grad_norm": 0.6331578493118286, + "learning_rate": 0.0002, + "loss": 1.6773, + "step": 710 + }, + { + "epoch": 1.225531914893617, + "grad_norm": 0.41908255219459534, + "learning_rate": 0.0002, + "loss": 1.6776, + "step": 720 + }, + { + "epoch": 1.2425531914893617, + "grad_norm": 0.36158403754234314, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 730 + }, + { + "epoch": 1.2595744680851064, + "grad_norm": 0.387300580739975, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 740 + }, + { + "epoch": 1.2765957446808511, + "grad_norm": 0.38899728655815125, + "learning_rate": 0.0002, + "loss": 1.7089, + "step": 750 + }, + { + "epoch": 1.2936170212765958, + "grad_norm": 0.4549255073070526, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 760 + }, + { + "epoch": 1.3106382978723405, + "grad_norm": 0.4052349328994751, + "learning_rate": 0.0002, + "loss": 1.6754, + "step": 770 + }, + { + "epoch": 1.327659574468085, + "grad_norm": 0.38934215903282166, + "learning_rate": 0.0002, + "loss": 1.6483, + "step": 780 + }, + { + "epoch": 1.3446808510638297, + "grad_norm": 0.38688382506370544, + "learning_rate": 0.0002, + "loss": 1.7813, + "step": 790 + }, + { + "epoch": 1.3617021276595744, + "grad_norm": 0.3825705051422119, + "learning_rate": 0.0002, + "loss": 1.6374, + "step": 800 + }, + { + "epoch": 1.3787234042553191, + "grad_norm": 0.37331756949424744, + "learning_rate": 0.0002, + "loss": 1.655, + "step": 810 + }, + { + "epoch": 1.3957446808510638, + "grad_norm": 0.38826408982276917, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 820 + }, + { + "epoch": 1.4127659574468086, + "grad_norm": 0.4213569164276123, + "learning_rate": 0.0002, + "loss": 1.6107, + "step": 830 + }, + { + "epoch": 1.4297872340425533, + "grad_norm": 0.3976684808731079, + "learning_rate": 0.0002, + "loss": 1.6792, + "step": 840 + }, + { + "epoch": 1.4468085106382977, + "grad_norm": 0.390009343624115, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 850 + }, + { + "epoch": 1.4638297872340424, + "grad_norm": 0.4462052583694458, + "learning_rate": 0.0002, + "loss": 1.7489, + "step": 860 + }, + { + "epoch": 1.4808510638297872, + "grad_norm": 0.42129236459732056, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 870 + }, + { + "epoch": 1.4978723404255319, + "grad_norm": 0.41489893198013306, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 880 + }, + { + "epoch": 1.5148936170212766, + "grad_norm": 0.41451677680015564, + "learning_rate": 0.0002, + "loss": 1.7129, + "step": 890 + }, + { + "epoch": 1.5319148936170213, + "grad_norm": 0.4477299749851227, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 900 + }, + { + "epoch": 1.548936170212766, + "grad_norm": 0.38476648926734924, + "learning_rate": 0.0002, + "loss": 1.6543, + "step": 910 + }, + { + "epoch": 1.5659574468085107, + "grad_norm": 0.42755743861198425, + "learning_rate": 0.0002, + "loss": 1.7103, + "step": 920 + }, + { + "epoch": 1.5829787234042554, + "grad_norm": 0.39372023940086365, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 930 + }, + { + "epoch": 1.6, + "grad_norm": 0.42778754234313965, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 940 + }, + { + "epoch": 1.6170212765957448, + "grad_norm": 0.4217268228530884, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 950 + }, + { + "epoch": 1.6340425531914895, + "grad_norm": 0.40452107787132263, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 960 + }, + { + "epoch": 1.6510638297872342, + "grad_norm": 0.4259980022907257, + "learning_rate": 0.0002, + "loss": 1.6734, + "step": 970 + }, + { + "epoch": 1.6680851063829787, + "grad_norm": 0.4089849591255188, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 980 + }, + { + "epoch": 1.6851063829787234, + "grad_norm": 0.38276049494743347, + "learning_rate": 0.0002, + "loss": 1.6588, + "step": 990 + }, + { + "epoch": 1.702127659574468, + "grad_norm": 0.40361565351486206, + "learning_rate": 0.0002, + "loss": 1.6846, + "step": 1000 + }, + { + "epoch": 1.7191489361702128, + "grad_norm": 0.3537807762622833, + "learning_rate": 0.0002, + "loss": 1.698, + "step": 1010 + }, + { + "epoch": 1.7361702127659573, + "grad_norm": 0.40288347005844116, + "learning_rate": 0.0002, + "loss": 1.6001, + "step": 1020 + }, + { + "epoch": 1.753191489361702, + "grad_norm": 0.4003616273403168, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1030 + }, + { + "epoch": 1.7702127659574467, + "grad_norm": 0.3931669592857361, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 1040 + }, + { + "epoch": 1.7872340425531914, + "grad_norm": 0.4001635015010834, + "learning_rate": 0.0002, + "loss": 1.6814, + "step": 1050 + }, + { + "epoch": 1.804255319148936, + "grad_norm": 0.4139048457145691, + "learning_rate": 0.0002, + "loss": 1.6158, + "step": 1060 + }, + { + "epoch": 1.8212765957446808, + "grad_norm": 0.5044458508491516, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 1070 + }, + { + "epoch": 1.8382978723404255, + "grad_norm": 0.4827095568180084, + "learning_rate": 0.0002, + "loss": 1.7317, + "step": 1080 + }, + { + "epoch": 1.8553191489361702, + "grad_norm": 0.3750515282154083, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 1090 + }, + { + "epoch": 1.872340425531915, + "grad_norm": 0.4024597704410553, + "learning_rate": 0.0002, + "loss": 1.653, + "step": 1100 + }, + { + "epoch": 1.8893617021276596, + "grad_norm": 0.36747241020202637, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1110 + }, + { + "epoch": 1.9063829787234043, + "grad_norm": 0.41397711634635925, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 1120 + }, + { + "epoch": 1.923404255319149, + "grad_norm": 0.3960763216018677, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 1130 + }, + { + "epoch": 1.9404255319148938, + "grad_norm": 0.4533233344554901, + "learning_rate": 0.0002, + "loss": 1.6857, + "step": 1140 + }, + { + "epoch": 1.9574468085106385, + "grad_norm": 0.38433438539505005, + "learning_rate": 0.0002, + "loss": 1.7168, + "step": 1150 + }, + { + "epoch": 1.974468085106383, + "grad_norm": 0.3648812174797058, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 1160 + }, + { + "epoch": 1.9914893617021276, + "grad_norm": 0.3887176215648651, + "learning_rate": 0.0002, + "loss": 1.6521, + "step": 1170 + }, + { + "epoch": 2.0, + "eval_loss": 1.8328146934509277, + "eval_runtime": 107.2842, + "eval_samples_per_second": 4.8, + "eval_steps_per_second": 0.606, + "step": 1175 + }, + { + "epoch": 2.008510638297872, + "grad_norm": 0.40444880723953247, + "learning_rate": 0.0002, + "loss": 1.6184, + "step": 1180 + }, + { + "epoch": 2.025531914893617, + "grad_norm": 0.3997816741466522, + "learning_rate": 0.0002, + "loss": 1.5221, + "step": 1190 + }, + { + "epoch": 2.0425531914893615, + "grad_norm": 0.4516718089580536, + "learning_rate": 0.0002, + "loss": 1.6022, + "step": 1200 + }, + { + "epoch": 2.0595744680851062, + "grad_norm": 0.6645553708076477, + "learning_rate": 0.0002, + "loss": 1.5458, + "step": 1210 + }, + { + "epoch": 2.076595744680851, + "grad_norm": 0.4181990921497345, + "learning_rate": 0.0002, + "loss": 1.5985, + "step": 1220 + }, + { + "epoch": 2.0936170212765957, + "grad_norm": 0.45681431889533997, + "learning_rate": 0.0002, + "loss": 1.5768, + "step": 1230 + }, + { + "epoch": 2.1106382978723404, + "grad_norm": 0.48914700746536255, + "learning_rate": 0.0002, + "loss": 1.5598, + "step": 1240 + }, + { + "epoch": 2.127659574468085, + "grad_norm": 0.43265485763549805, + "learning_rate": 0.0002, + "loss": 1.6031, + "step": 1250 + }, + { + "epoch": 2.1446808510638298, + "grad_norm": 0.4641207754611969, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 1260 + }, + { + "epoch": 2.1617021276595745, + "grad_norm": 0.4840783476829529, + "learning_rate": 0.0002, + "loss": 1.5686, + "step": 1270 + }, + { + "epoch": 2.178723404255319, + "grad_norm": 0.4974595308303833, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 1280 + }, + { + "epoch": 2.195744680851064, + "grad_norm": 0.5133475661277771, + "learning_rate": 0.0002, + "loss": 1.5512, + "step": 1290 + }, + { + "epoch": 2.2127659574468086, + "grad_norm": 0.5030052065849304, + "learning_rate": 0.0002, + "loss": 1.6467, + "step": 1300 + }, + { + "epoch": 2.2297872340425533, + "grad_norm": 0.46602481603622437, + "learning_rate": 0.0002, + "loss": 1.6845, + "step": 1310 + }, + { + "epoch": 2.246808510638298, + "grad_norm": 0.43662378191947937, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 1320 + }, + { + "epoch": 2.2638297872340427, + "grad_norm": 0.5137454867362976, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 1330 + }, + { + "epoch": 2.2808510638297874, + "grad_norm": 0.4750335216522217, + "learning_rate": 0.0002, + "loss": 1.5702, + "step": 1340 + }, + { + "epoch": 2.297872340425532, + "grad_norm": 0.43691426515579224, + "learning_rate": 0.0002, + "loss": 1.6479, + "step": 1350 + }, + { + "epoch": 2.3148936170212764, + "grad_norm": 0.49752047657966614, + "learning_rate": 0.0002, + "loss": 1.5561, + "step": 1360 + }, + { + "epoch": 2.331914893617021, + "grad_norm": 0.45101815462112427, + "learning_rate": 0.0002, + "loss": 1.5789, + "step": 1370 + }, + { + "epoch": 2.348936170212766, + "grad_norm": 0.4427817761898041, + "learning_rate": 0.0002, + "loss": 1.5858, + "step": 1380 + }, + { + "epoch": 2.3659574468085105, + "grad_norm": 0.4802311062812805, + "learning_rate": 0.0002, + "loss": 1.5537, + "step": 1390 + }, + { + "epoch": 2.382978723404255, + "grad_norm": 0.4512513279914856, + "learning_rate": 0.0002, + "loss": 1.5846, + "step": 1400 + }, + { + "epoch": 2.4, + "grad_norm": 0.4878857135772705, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 1410 + }, + { + "epoch": 2.4170212765957446, + "grad_norm": 0.4741315543651581, + "learning_rate": 0.0002, + "loss": 1.5781, + "step": 1420 + }, + { + "epoch": 2.4340425531914893, + "grad_norm": 0.4770931601524353, + "learning_rate": 0.0002, + "loss": 1.5595, + "step": 1430 + }, + { + "epoch": 2.451063829787234, + "grad_norm": 0.5124667286872864, + "learning_rate": 0.0002, + "loss": 1.5336, + "step": 1440 + }, + { + "epoch": 2.4680851063829787, + "grad_norm": 0.45264801383018494, + "learning_rate": 0.0002, + "loss": 1.5811, + "step": 1450 + }, + { + "epoch": 2.4851063829787234, + "grad_norm": 0.5456924438476562, + "learning_rate": 0.0002, + "loss": 1.604, + "step": 1460 + }, + { + "epoch": 2.502127659574468, + "grad_norm": 0.44656285643577576, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 1470 + }, + { + "epoch": 2.519148936170213, + "grad_norm": 0.5939419865608215, + "learning_rate": 0.0002, + "loss": 1.6336, + "step": 1480 + }, + { + "epoch": 2.5361702127659576, + "grad_norm": 0.47853362560272217, + "learning_rate": 0.0002, + "loss": 1.5481, + "step": 1490 + }, + { + "epoch": 2.5531914893617023, + "grad_norm": 0.47643396258354187, + "learning_rate": 0.0002, + "loss": 1.6543, + "step": 1500 + }, + { + "epoch": 2.570212765957447, + "grad_norm": 0.4939501881599426, + "learning_rate": 0.0002, + "loss": 1.5085, + "step": 1510 + }, + { + "epoch": 2.5872340425531917, + "grad_norm": 0.502055287361145, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 1520 + }, + { + "epoch": 2.604255319148936, + "grad_norm": 0.463250994682312, + "learning_rate": 0.0002, + "loss": 1.6519, + "step": 1530 + }, + { + "epoch": 2.621276595744681, + "grad_norm": 0.4761098623275757, + "learning_rate": 0.0002, + "loss": 1.5698, + "step": 1540 + }, + { + "epoch": 2.6382978723404253, + "grad_norm": 0.4687299132347107, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 1550 + }, + { + "epoch": 2.65531914893617, + "grad_norm": 0.5536078810691833, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 1560 + }, + { + "epoch": 2.6723404255319148, + "grad_norm": 0.581320583820343, + "learning_rate": 0.0002, + "loss": 1.586, + "step": 1570 + }, + { + "epoch": 2.6893617021276595, + "grad_norm": 0.45952868461608887, + "learning_rate": 0.0002, + "loss": 1.508, + "step": 1580 + }, + { + "epoch": 2.706382978723404, + "grad_norm": 0.4602586328983307, + "learning_rate": 0.0002, + "loss": 1.5905, + "step": 1590 + }, + { + "epoch": 2.723404255319149, + "grad_norm": 0.5276554226875305, + "learning_rate": 0.0002, + "loss": 1.6008, + "step": 1600 + }, + { + "epoch": 2.7404255319148936, + "grad_norm": 0.5750249624252319, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 1610 + }, + { + "epoch": 2.7574468085106383, + "grad_norm": 0.468723863363266, + "learning_rate": 0.0002, + "loss": 1.5506, + "step": 1620 + }, + { + "epoch": 2.774468085106383, + "grad_norm": 0.44649943709373474, + "learning_rate": 0.0002, + "loss": 1.4499, + "step": 1630 + }, + { + "epoch": 2.7914893617021277, + "grad_norm": 0.5097237825393677, + "learning_rate": 0.0002, + "loss": 1.5106, + "step": 1640 + }, + { + "epoch": 2.8085106382978724, + "grad_norm": 0.46384191513061523, + "learning_rate": 0.0002, + "loss": 1.5948, + "step": 1650 + }, + { + "epoch": 2.825531914893617, + "grad_norm": 0.4885474443435669, + "learning_rate": 0.0002, + "loss": 1.5828, + "step": 1660 + }, + { + "epoch": 2.842553191489362, + "grad_norm": 0.45621681213378906, + "learning_rate": 0.0002, + "loss": 1.5675, + "step": 1670 + }, + { + "epoch": 2.8595744680851065, + "grad_norm": 0.4797150194644928, + "learning_rate": 0.0002, + "loss": 1.6042, + "step": 1680 + }, + { + "epoch": 2.876595744680851, + "grad_norm": 0.5142032504081726, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 1690 + }, + { + "epoch": 2.8936170212765955, + "grad_norm": 0.48939862847328186, + "learning_rate": 0.0002, + "loss": 1.5984, + "step": 1700 + }, + { + "epoch": 2.9106382978723406, + "grad_norm": 0.4575578272342682, + "learning_rate": 0.0002, + "loss": 1.6333, + "step": 1710 + }, + { + "epoch": 2.927659574468085, + "grad_norm": 0.5589063763618469, + "learning_rate": 0.0002, + "loss": 1.5396, + "step": 1720 + }, + { + "epoch": 2.94468085106383, + "grad_norm": 0.48508813977241516, + "learning_rate": 0.0002, + "loss": 1.6096, + "step": 1730 + }, + { + "epoch": 2.9617021276595743, + "grad_norm": 0.42786726355552673, + "learning_rate": 0.0002, + "loss": 1.5686, + "step": 1740 + }, + { + "epoch": 2.978723404255319, + "grad_norm": 0.5598229765892029, + "learning_rate": 0.0002, + "loss": 1.5555, + "step": 1750 + }, + { + "epoch": 2.9957446808510637, + "grad_norm": 0.4779253602027893, + "learning_rate": 0.0002, + "loss": 1.5035, + "step": 1760 + }, + { + "epoch": 2.999148936170213, + "eval_loss": 1.8543579578399658, + "eval_runtime": 107.2363, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 0.606, + "step": 1762 + }, + { + "epoch": 3.0127659574468084, + "grad_norm": 0.48810940980911255, + "learning_rate": 0.0002, + "loss": 1.4767, + "step": 1770 + }, + { + "epoch": 3.029787234042553, + "grad_norm": 0.6194920539855957, + "learning_rate": 0.0002, + "loss": 1.5385, + "step": 1780 + }, + { + "epoch": 3.046808510638298, + "grad_norm": 0.5875462293624878, + "learning_rate": 0.0002, + "loss": 1.4012, + "step": 1790 + }, + { + "epoch": 3.0638297872340425, + "grad_norm": 0.5775138139724731, + "learning_rate": 0.0002, + "loss": 1.4727, + "step": 1800 + }, + { + "epoch": 3.0808510638297872, + "grad_norm": 0.5445981621742249, + "learning_rate": 0.0002, + "loss": 1.493, + "step": 1810 + }, + { + "epoch": 3.097872340425532, + "grad_norm": 0.6728862524032593, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 1820 + }, + { + "epoch": 3.1148936170212767, + "grad_norm": 0.6105490326881409, + "learning_rate": 0.0002, + "loss": 1.4303, + "step": 1830 + }, + { + "epoch": 3.1319148936170214, + "grad_norm": 0.5771165490150452, + "learning_rate": 0.0002, + "loss": 1.5214, + "step": 1840 + }, + { + "epoch": 3.148936170212766, + "grad_norm": 0.5778449773788452, + "learning_rate": 0.0002, + "loss": 1.4359, + "step": 1850 + }, + { + "epoch": 3.1659574468085108, + "grad_norm": 0.7141990661621094, + "learning_rate": 0.0002, + "loss": 1.4121, + "step": 1860 + }, + { + "epoch": 3.1829787234042555, + "grad_norm": 0.5882705450057983, + "learning_rate": 0.0002, + "loss": 1.4904, + "step": 1870 + }, + { + "epoch": 3.2, + "grad_norm": 0.5996195077896118, + "learning_rate": 0.0002, + "loss": 1.4941, + "step": 1880 + }, + { + "epoch": 3.217021276595745, + "grad_norm": 0.6121219396591187, + "learning_rate": 0.0002, + "loss": 1.4519, + "step": 1890 + }, + { + "epoch": 3.2340425531914896, + "grad_norm": 0.6402981281280518, + "learning_rate": 0.0002, + "loss": 1.4586, + "step": 1900 + }, + { + "epoch": 3.251063829787234, + "grad_norm": 0.6111783981323242, + "learning_rate": 0.0002, + "loss": 1.3766, + "step": 1910 + }, + { + "epoch": 3.2680851063829786, + "grad_norm": 0.6682435274124146, + "learning_rate": 0.0002, + "loss": 1.4863, + "step": 1920 + }, + { + "epoch": 3.2851063829787233, + "grad_norm": 0.6530760526657104, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 1930 + }, + { + "epoch": 3.302127659574468, + "grad_norm": 0.6481217741966248, + "learning_rate": 0.0002, + "loss": 1.4422, + "step": 1940 + }, + { + "epoch": 3.3191489361702127, + "grad_norm": 0.6270697116851807, + "learning_rate": 0.0002, + "loss": 1.5158, + "step": 1950 + }, + { + "epoch": 3.3361702127659574, + "grad_norm": 0.5924492478370667, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 1960 + }, + { + "epoch": 3.353191489361702, + "grad_norm": 0.5803806781768799, + "learning_rate": 0.0002, + "loss": 1.4578, + "step": 1970 + }, + { + "epoch": 3.370212765957447, + "grad_norm": 0.5754119157791138, + "learning_rate": 0.0002, + "loss": 1.4689, + "step": 1980 + }, + { + "epoch": 3.3872340425531915, + "grad_norm": 0.6717178821563721, + "learning_rate": 0.0002, + "loss": 1.4605, + "step": 1990 + }, + { + "epoch": 3.404255319148936, + "grad_norm": 0.5955582857131958, + "learning_rate": 0.0002, + "loss": 1.486, + "step": 2000 + }, + { + "epoch": 3.421276595744681, + "grad_norm": 0.6965329647064209, + "learning_rate": 0.0002, + "loss": 1.4445, + "step": 2010 + }, + { + "epoch": 3.4382978723404256, + "grad_norm": 0.6321573257446289, + "learning_rate": 0.0002, + "loss": 1.4543, + "step": 2020 + }, + { + "epoch": 3.4553191489361703, + "grad_norm": 0.5952608585357666, + "learning_rate": 0.0002, + "loss": 1.5383, + "step": 2030 + }, + { + "epoch": 3.472340425531915, + "grad_norm": 0.7718905806541443, + "learning_rate": 0.0002, + "loss": 1.4531, + "step": 2040 + }, + { + "epoch": 3.4893617021276597, + "grad_norm": 0.6850892305374146, + "learning_rate": 0.0002, + "loss": 1.4678, + "step": 2050 + }, + { + "epoch": 3.506382978723404, + "grad_norm": 0.5638895630836487, + "learning_rate": 0.0002, + "loss": 1.4956, + "step": 2060 + }, + { + "epoch": 3.523404255319149, + "grad_norm": 0.6148294806480408, + "learning_rate": 0.0002, + "loss": 1.4586, + "step": 2070 + }, + { + "epoch": 3.5404255319148934, + "grad_norm": 0.5895810723304749, + "learning_rate": 0.0002, + "loss": 1.4622, + "step": 2080 + }, + { + "epoch": 3.5574468085106385, + "grad_norm": 0.6377319693565369, + "learning_rate": 0.0002, + "loss": 1.4341, + "step": 2090 + }, + { + "epoch": 3.574468085106383, + "grad_norm": 0.6047691702842712, + "learning_rate": 0.0002, + "loss": 1.5056, + "step": 2100 + }, + { + "epoch": 3.5914893617021275, + "grad_norm": 0.6049593687057495, + "learning_rate": 0.0002, + "loss": 1.4748, + "step": 2110 + }, + { + "epoch": 3.608510638297872, + "grad_norm": 0.6358312368392944, + "learning_rate": 0.0002, + "loss": 1.391, + "step": 2120 + }, + { + "epoch": 3.625531914893617, + "grad_norm": 0.612119197845459, + "learning_rate": 0.0002, + "loss": 1.4419, + "step": 2130 + }, + { + "epoch": 3.6425531914893616, + "grad_norm": 0.6788054704666138, + "learning_rate": 0.0002, + "loss": 1.438, + "step": 2140 + }, + { + "epoch": 3.6595744680851063, + "grad_norm": 0.6191043853759766, + "learning_rate": 0.0002, + "loss": 1.4295, + "step": 2150 + }, + { + "epoch": 3.676595744680851, + "grad_norm": 0.6660051941871643, + "learning_rate": 0.0002, + "loss": 1.4383, + "step": 2160 + }, + { + "epoch": 3.6936170212765957, + "grad_norm": 0.652692973613739, + "learning_rate": 0.0002, + "loss": 1.4954, + "step": 2170 + }, + { + "epoch": 3.7106382978723405, + "grad_norm": 0.6123467087745667, + "learning_rate": 0.0002, + "loss": 1.5245, + "step": 2180 + }, + { + "epoch": 3.727659574468085, + "grad_norm": 0.640021562576294, + "learning_rate": 0.0002, + "loss": 1.4686, + "step": 2190 + }, + { + "epoch": 3.74468085106383, + "grad_norm": 0.6809179782867432, + "learning_rate": 0.0002, + "loss": 1.4277, + "step": 2200 + }, + { + "epoch": 3.7617021276595746, + "grad_norm": 0.5978420376777649, + "learning_rate": 0.0002, + "loss": 1.4705, + "step": 2210 + }, + { + "epoch": 3.7787234042553193, + "grad_norm": 0.7038803100585938, + "learning_rate": 0.0002, + "loss": 1.5559, + "step": 2220 + }, + { + "epoch": 3.795744680851064, + "grad_norm": 0.5324276089668274, + "learning_rate": 0.0002, + "loss": 1.4691, + "step": 2230 + }, + { + "epoch": 3.8127659574468087, + "grad_norm": 0.6264132857322693, + "learning_rate": 0.0002, + "loss": 1.4696, + "step": 2240 + }, + { + "epoch": 3.829787234042553, + "grad_norm": 0.6143888831138611, + "learning_rate": 0.0002, + "loss": 1.4856, + "step": 2250 + }, + { + "epoch": 3.846808510638298, + "grad_norm": 0.6338503360748291, + "learning_rate": 0.0002, + "loss": 1.535, + "step": 2260 + }, + { + "epoch": 3.8638297872340424, + "grad_norm": 0.556882381439209, + "learning_rate": 0.0002, + "loss": 1.456, + "step": 2270 + }, + { + "epoch": 3.8808510638297875, + "grad_norm": 0.6323680281639099, + "learning_rate": 0.0002, + "loss": 1.4701, + "step": 2280 + }, + { + "epoch": 3.8978723404255318, + "grad_norm": 0.7105869054794312, + "learning_rate": 0.0002, + "loss": 1.5333, + "step": 2290 + }, + { + "epoch": 3.9148936170212765, + "grad_norm": 0.825415849685669, + "learning_rate": 0.0002, + "loss": 1.4462, + "step": 2300 + }, + { + "epoch": 3.931914893617021, + "grad_norm": 0.6412091851234436, + "learning_rate": 0.0002, + "loss": 1.5023, + "step": 2310 + }, + { + "epoch": 3.948936170212766, + "grad_norm": 0.6286490559577942, + "learning_rate": 0.0002, + "loss": 1.3709, + "step": 2320 + }, + { + "epoch": 3.9659574468085106, + "grad_norm": 0.636021077632904, + "learning_rate": 0.0002, + "loss": 1.4693, + "step": 2330 + }, + { + "epoch": 3.9829787234042553, + "grad_norm": 0.6032362580299377, + "learning_rate": 0.0002, + "loss": 1.4265, + "step": 2340 + }, + { + "epoch": 4.0, + "grad_norm": 0.6497282385826111, + "learning_rate": 0.0002, + "loss": 1.377, + "step": 2350 + }, + { + "epoch": 4.0, + "eval_loss": 1.9081238508224487, + "eval_runtime": 106.6404, + "eval_samples_per_second": 4.829, + "eval_steps_per_second": 0.61, + "step": 2350 + }, + { + "epoch": 4.017021276595744, + "grad_norm": 0.6278848648071289, + "learning_rate": 0.0002, + "loss": 1.317, + "step": 2360 + }, + { + "epoch": 4.034042553191489, + "grad_norm": 0.8259812593460083, + "learning_rate": 0.0002, + "loss": 1.3229, + "step": 2370 + }, + { + "epoch": 4.051063829787234, + "grad_norm": 0.7269589304924011, + "learning_rate": 0.0002, + "loss": 1.2776, + "step": 2380 + }, + { + "epoch": 4.068085106382979, + "grad_norm": 0.7460662126541138, + "learning_rate": 0.0002, + "loss": 1.3668, + "step": 2390 + }, + { + "epoch": 4.085106382978723, + "grad_norm": 1.2362046241760254, + "learning_rate": 0.0002, + "loss": 1.3096, + "step": 2400 + }, + { + "epoch": 4.102127659574468, + "grad_norm": 0.7699568867683411, + "learning_rate": 0.0002, + "loss": 1.2906, + "step": 2410 + }, + { + "epoch": 4.1191489361702125, + "grad_norm": 0.8732489347457886, + "learning_rate": 0.0002, + "loss": 1.3005, + "step": 2420 + }, + { + "epoch": 4.136170212765958, + "grad_norm": 0.8331889510154724, + "learning_rate": 0.0002, + "loss": 1.2741, + "step": 2430 + }, + { + "epoch": 4.153191489361702, + "grad_norm": 0.6686427593231201, + "learning_rate": 0.0002, + "loss": 1.1861, + "step": 2440 + }, + { + "epoch": 4.170212765957447, + "grad_norm": 0.906380832195282, + "learning_rate": 0.0002, + "loss": 1.316, + "step": 2450 + }, + { + "epoch": 4.187234042553191, + "grad_norm": 0.7269753813743591, + "learning_rate": 0.0002, + "loss": 1.3134, + "step": 2460 + }, + { + "epoch": 4.2042553191489365, + "grad_norm": 0.8556067943572998, + "learning_rate": 0.0002, + "loss": 1.299, + "step": 2470 + }, + { + "epoch": 4.221276595744681, + "grad_norm": 0.7076917886734009, + "learning_rate": 0.0002, + "loss": 1.2935, + "step": 2480 + }, + { + "epoch": 4.238297872340426, + "grad_norm": 0.7596837282180786, + "learning_rate": 0.0002, + "loss": 1.2608, + "step": 2490 + }, + { + "epoch": 4.25531914893617, + "grad_norm": 0.7790552377700806, + "learning_rate": 0.0002, + "loss": 1.2747, + "step": 2500 + }, + { + "epoch": 4.272340425531915, + "grad_norm": 0.8205534219741821, + "learning_rate": 0.0002, + "loss": 1.3438, + "step": 2510 + }, + { + "epoch": 4.2893617021276595, + "grad_norm": 0.7892114520072937, + "learning_rate": 0.0002, + "loss": 1.3058, + "step": 2520 + }, + { + "epoch": 4.306382978723404, + "grad_norm": 0.8907270431518555, + "learning_rate": 0.0002, + "loss": 1.3662, + "step": 2530 + }, + { + "epoch": 4.323404255319149, + "grad_norm": 0.821794331073761, + "learning_rate": 0.0002, + "loss": 1.3168, + "step": 2540 + }, + { + "epoch": 4.340425531914893, + "grad_norm": 0.7305247783660889, + "learning_rate": 0.0002, + "loss": 1.2467, + "step": 2550 + }, + { + "epoch": 4.357446808510638, + "grad_norm": 0.8639982342720032, + "learning_rate": 0.0002, + "loss": 1.3446, + "step": 2560 + }, + { + "epoch": 4.374468085106383, + "grad_norm": 0.8883494138717651, + "learning_rate": 0.0002, + "loss": 1.3863, + "step": 2570 + }, + { + "epoch": 4.391489361702128, + "grad_norm": 0.7611730098724365, + "learning_rate": 0.0002, + "loss": 1.3693, + "step": 2580 + }, + { + "epoch": 4.408510638297872, + "grad_norm": 0.7793022394180298, + "learning_rate": 0.0002, + "loss": 1.2814, + "step": 2590 + }, + { + "epoch": 4.425531914893617, + "grad_norm": 0.979060173034668, + "learning_rate": 0.0002, + "loss": 1.3014, + "step": 2600 + }, + { + "epoch": 4.4425531914893615, + "grad_norm": 0.8320847749710083, + "learning_rate": 0.0002, + "loss": 1.3625, + "step": 2610 + }, + { + "epoch": 4.459574468085107, + "grad_norm": 0.7481992244720459, + "learning_rate": 0.0002, + "loss": 1.3362, + "step": 2620 + }, + { + "epoch": 4.476595744680851, + "grad_norm": 0.783770740032196, + "learning_rate": 0.0002, + "loss": 1.4037, + "step": 2630 + }, + { + "epoch": 4.493617021276596, + "grad_norm": 0.773295521736145, + "learning_rate": 0.0002, + "loss": 1.3049, + "step": 2640 + }, + { + "epoch": 4.51063829787234, + "grad_norm": 0.9206840991973877, + "learning_rate": 0.0002, + "loss": 1.2739, + "step": 2650 + }, + { + "epoch": 4.527659574468085, + "grad_norm": 0.8803266882896423, + "learning_rate": 0.0002, + "loss": 1.3248, + "step": 2660 + }, + { + "epoch": 4.54468085106383, + "grad_norm": 0.9315535426139832, + "learning_rate": 0.0002, + "loss": 1.2898, + "step": 2670 + }, + { + "epoch": 4.561702127659575, + "grad_norm": 0.8610678315162659, + "learning_rate": 0.0002, + "loss": 1.316, + "step": 2680 + }, + { + "epoch": 4.578723404255319, + "grad_norm": 0.7405551671981812, + "learning_rate": 0.0002, + "loss": 1.2633, + "step": 2690 + }, + { + "epoch": 4.595744680851064, + "grad_norm": 1.0238394737243652, + "learning_rate": 0.0002, + "loss": 1.3136, + "step": 2700 + }, + { + "epoch": 4.6127659574468085, + "grad_norm": 0.7814345955848694, + "learning_rate": 0.0002, + "loss": 1.4847, + "step": 2710 + }, + { + "epoch": 4.629787234042553, + "grad_norm": 0.8436329364776611, + "learning_rate": 0.0002, + "loss": 1.295, + "step": 2720 + }, + { + "epoch": 4.646808510638298, + "grad_norm": 0.727214515209198, + "learning_rate": 0.0002, + "loss": 1.3525, + "step": 2730 + }, + { + "epoch": 4.663829787234042, + "grad_norm": 0.8465878367424011, + "learning_rate": 0.0002, + "loss": 1.3878, + "step": 2740 + }, + { + "epoch": 4.680851063829787, + "grad_norm": 0.8218137621879578, + "learning_rate": 0.0002, + "loss": 1.278, + "step": 2750 + }, + { + "epoch": 4.697872340425532, + "grad_norm": 0.7900442481040955, + "learning_rate": 0.0002, + "loss": 1.3628, + "step": 2760 + }, + { + "epoch": 4.714893617021277, + "grad_norm": 0.8214074969291687, + "learning_rate": 0.0002, + "loss": 1.3494, + "step": 2770 + }, + { + "epoch": 4.731914893617021, + "grad_norm": 0.7509574890136719, + "learning_rate": 0.0002, + "loss": 1.3954, + "step": 2780 + }, + { + "epoch": 4.748936170212766, + "grad_norm": 0.7416139245033264, + "learning_rate": 0.0002, + "loss": 1.3693, + "step": 2790 + }, + { + "epoch": 4.76595744680851, + "grad_norm": 0.8629977107048035, + "learning_rate": 0.0002, + "loss": 1.3045, + "step": 2800 + }, + { + "epoch": 4.782978723404256, + "grad_norm": 0.8056505918502808, + "learning_rate": 0.0002, + "loss": 1.3164, + "step": 2810 + }, + { + "epoch": 4.8, + "grad_norm": 0.7705401182174683, + "learning_rate": 0.0002, + "loss": 1.3056, + "step": 2820 + }, + { + "epoch": 4.817021276595745, + "grad_norm": 1.0173288583755493, + "learning_rate": 0.0002, + "loss": 1.3771, + "step": 2830 + }, + { + "epoch": 4.834042553191489, + "grad_norm": 0.8375823497772217, + "learning_rate": 0.0002, + "loss": 1.3494, + "step": 2840 + }, + { + "epoch": 4.851063829787234, + "grad_norm": 0.857073187828064, + "learning_rate": 0.0002, + "loss": 1.3238, + "step": 2850 + }, + { + "epoch": 4.868085106382979, + "grad_norm": 0.8672189712524414, + "learning_rate": 0.0002, + "loss": 1.2964, + "step": 2860 + }, + { + "epoch": 4.885106382978723, + "grad_norm": 0.8599910140037537, + "learning_rate": 0.0002, + "loss": 1.3646, + "step": 2870 + }, + { + "epoch": 4.902127659574468, + "grad_norm": 0.8844674229621887, + "learning_rate": 0.0002, + "loss": 1.3575, + "step": 2880 + }, + { + "epoch": 4.919148936170213, + "grad_norm": 0.8246751427650452, + "learning_rate": 0.0002, + "loss": 1.285, + "step": 2890 + }, + { + "epoch": 4.9361702127659575, + "grad_norm": 0.8648163676261902, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 2900 + }, + { + "epoch": 4.953191489361702, + "grad_norm": 0.9477900266647339, + "learning_rate": 0.0002, + "loss": 1.2614, + "step": 2910 + }, + { + "epoch": 4.970212765957447, + "grad_norm": 0.8047965168952942, + "learning_rate": 0.0002, + "loss": 1.3519, + "step": 2920 + }, + { + "epoch": 4.987234042553191, + "grad_norm": 0.9872494339942932, + "learning_rate": 0.0002, + "loss": 1.3889, + "step": 2930 + }, + { + "epoch": 4.999148936170212, + "eval_loss": 1.9836769104003906, + "eval_runtime": 106.4655, + "eval_samples_per_second": 4.837, + "eval_steps_per_second": 0.611, + "step": 2937 + }, + { + "epoch": 5.004255319148936, + "grad_norm": 0.7292938828468323, + "learning_rate": 0.0002, + "loss": 1.2574, + "step": 2940 + }, + { + "epoch": 5.0212765957446805, + "grad_norm": 0.8610548973083496, + "learning_rate": 0.0002, + "loss": 1.1312, + "step": 2950 + }, + { + "epoch": 5.038297872340426, + "grad_norm": 0.8384576439857483, + "learning_rate": 0.0002, + "loss": 1.1105, + "step": 2960 + }, + { + "epoch": 5.05531914893617, + "grad_norm": 0.9746620059013367, + "learning_rate": 0.0002, + "loss": 1.1412, + "step": 2970 + }, + { + "epoch": 5.072340425531915, + "grad_norm": 0.8879048228263855, + "learning_rate": 0.0002, + "loss": 1.1687, + "step": 2980 + }, + { + "epoch": 5.089361702127659, + "grad_norm": 0.9006168246269226, + "learning_rate": 0.0002, + "loss": 1.1333, + "step": 2990 + }, + { + "epoch": 5.1063829787234045, + "grad_norm": 0.9770249128341675, + "learning_rate": 0.0002, + "loss": 1.1659, + "step": 3000 + }, + { + "epoch": 5.123404255319149, + "grad_norm": 1.267967939376831, + "learning_rate": 0.0002, + "loss": 1.1334, + "step": 3010 + }, + { + "epoch": 5.140425531914894, + "grad_norm": 0.9857587218284607, + "learning_rate": 0.0002, + "loss": 1.2095, + "step": 3020 + }, + { + "epoch": 5.157446808510638, + "grad_norm": 1.2938690185546875, + "learning_rate": 0.0002, + "loss": 1.0889, + "step": 3030 + }, + { + "epoch": 5.174468085106383, + "grad_norm": 0.8928244113922119, + "learning_rate": 0.0002, + "loss": 1.1645, + "step": 3040 + }, + { + "epoch": 5.191489361702128, + "grad_norm": 1.1087630987167358, + "learning_rate": 0.0002, + "loss": 1.1553, + "step": 3050 + }, + { + "epoch": 5.208510638297873, + "grad_norm": 0.9431360960006714, + "learning_rate": 0.0002, + "loss": 1.1416, + "step": 3060 + }, + { + "epoch": 5.225531914893617, + "grad_norm": 1.2048338651657104, + "learning_rate": 0.0002, + "loss": 1.1635, + "step": 3070 + }, + { + "epoch": 5.242553191489361, + "grad_norm": 1.0017054080963135, + "learning_rate": 0.0002, + "loss": 1.171, + "step": 3080 + }, + { + "epoch": 5.259574468085106, + "grad_norm": 1.2771434783935547, + "learning_rate": 0.0002, + "loss": 1.2212, + "step": 3090 + }, + { + "epoch": 5.276595744680851, + "grad_norm": 1.4307383298873901, + "learning_rate": 0.0002, + "loss": 1.1478, + "step": 3100 + }, + { + "epoch": 5.293617021276596, + "grad_norm": 1.2460752725601196, + "learning_rate": 0.0002, + "loss": 1.2132, + "step": 3110 + }, + { + "epoch": 5.31063829787234, + "grad_norm": 1.693974494934082, + "learning_rate": 0.0002, + "loss": 1.235, + "step": 3120 + }, + { + "epoch": 5.327659574468085, + "grad_norm": 0.9855408668518066, + "learning_rate": 0.0002, + "loss": 1.1961, + "step": 3130 + }, + { + "epoch": 5.3446808510638295, + "grad_norm": 1.307521104812622, + "learning_rate": 0.0002, + "loss": 1.2068, + "step": 3140 + }, + { + "epoch": 5.361702127659575, + "grad_norm": 0.957661509513855, + "learning_rate": 0.0002, + "loss": 1.2144, + "step": 3150 + }, + { + "epoch": 5.378723404255319, + "grad_norm": 0.870373010635376, + "learning_rate": 0.0002, + "loss": 1.1305, + "step": 3160 + }, + { + "epoch": 5.395744680851064, + "grad_norm": 0.9324309229850769, + "learning_rate": 0.0002, + "loss": 1.2196, + "step": 3170 + }, + { + "epoch": 5.412765957446808, + "grad_norm": 1.0142403841018677, + "learning_rate": 0.0002, + "loss": 1.1691, + "step": 3180 + }, + { + "epoch": 5.4297872340425535, + "grad_norm": 0.9759578704833984, + "learning_rate": 0.0002, + "loss": 1.1788, + "step": 3190 + }, + { + "epoch": 5.446808510638298, + "grad_norm": 0.9021993279457092, + "learning_rate": 0.0002, + "loss": 1.1321, + "step": 3200 + }, + { + "epoch": 5.463829787234043, + "grad_norm": 1.007728934288025, + "learning_rate": 0.0002, + "loss": 1.2222, + "step": 3210 + }, + { + "epoch": 5.480851063829787, + "grad_norm": 0.8969265222549438, + "learning_rate": 0.0002, + "loss": 1.1517, + "step": 3220 + }, + { + "epoch": 5.497872340425532, + "grad_norm": 0.9672483801841736, + "learning_rate": 0.0002, + "loss": 1.2061, + "step": 3230 + }, + { + "epoch": 5.514893617021277, + "grad_norm": 1.1417138576507568, + "learning_rate": 0.0002, + "loss": 1.1454, + "step": 3240 + }, + { + "epoch": 5.531914893617021, + "grad_norm": 0.9669530391693115, + "learning_rate": 0.0002, + "loss": 1.1871, + "step": 3250 + }, + { + "epoch": 5.548936170212766, + "grad_norm": 1.0161820650100708, + "learning_rate": 0.0002, + "loss": 1.1382, + "step": 3260 + }, + { + "epoch": 5.565957446808511, + "grad_norm": 0.9935774803161621, + "learning_rate": 0.0002, + "loss": 1.1708, + "step": 3270 + }, + { + "epoch": 5.582978723404255, + "grad_norm": 1.2572048902511597, + "learning_rate": 0.0002, + "loss": 1.1384, + "step": 3280 + }, + { + "epoch": 5.6, + "grad_norm": 0.9614662528038025, + "learning_rate": 0.0002, + "loss": 1.1711, + "step": 3290 + }, + { + "epoch": 5.617021276595745, + "grad_norm": 0.9835584163665771, + "learning_rate": 0.0002, + "loss": 1.219, + "step": 3300 + }, + { + "epoch": 5.634042553191489, + "grad_norm": 0.9387389421463013, + "learning_rate": 0.0002, + "loss": 1.2074, + "step": 3310 + }, + { + "epoch": 5.651063829787234, + "grad_norm": 0.9348428249359131, + "learning_rate": 0.0002, + "loss": 1.1148, + "step": 3320 + }, + { + "epoch": 5.6680851063829785, + "grad_norm": 0.9636440873146057, + "learning_rate": 0.0002, + "loss": 1.2378, + "step": 3330 + }, + { + "epoch": 5.685106382978724, + "grad_norm": 0.995894193649292, + "learning_rate": 0.0002, + "loss": 1.2068, + "step": 3340 + }, + { + "epoch": 5.702127659574468, + "grad_norm": 1.0357023477554321, + "learning_rate": 0.0002, + "loss": 1.1443, + "step": 3350 + }, + { + "epoch": 5.719148936170213, + "grad_norm": 1.0254428386688232, + "learning_rate": 0.0002, + "loss": 1.2209, + "step": 3360 + }, + { + "epoch": 5.736170212765957, + "grad_norm": 0.8993342518806458, + "learning_rate": 0.0002, + "loss": 1.1987, + "step": 3370 + }, + { + "epoch": 5.753191489361702, + "grad_norm": 0.9104585647583008, + "learning_rate": 0.0002, + "loss": 1.1527, + "step": 3380 + }, + { + "epoch": 5.770212765957447, + "grad_norm": 0.9555654525756836, + "learning_rate": 0.0002, + "loss": 1.2268, + "step": 3390 + }, + { + "epoch": 5.787234042553192, + "grad_norm": 0.920124351978302, + "learning_rate": 0.0002, + "loss": 1.193, + "step": 3400 + }, + { + "epoch": 5.804255319148936, + "grad_norm": 0.999706506729126, + "learning_rate": 0.0002, + "loss": 1.2263, + "step": 3410 + }, + { + "epoch": 5.821276595744681, + "grad_norm": 0.9292707443237305, + "learning_rate": 0.0002, + "loss": 1.1411, + "step": 3420 + }, + { + "epoch": 5.8382978723404255, + "grad_norm": 1.0074706077575684, + "learning_rate": 0.0002, + "loss": 1.1507, + "step": 3430 + }, + { + "epoch": 5.85531914893617, + "grad_norm": 1.0279479026794434, + "learning_rate": 0.0002, + "loss": 1.2709, + "step": 3440 + }, + { + "epoch": 5.872340425531915, + "grad_norm": 1.0026037693023682, + "learning_rate": 0.0002, + "loss": 1.1992, + "step": 3450 + }, + { + "epoch": 5.889361702127659, + "grad_norm": 1.0356525182724, + "learning_rate": 0.0002, + "loss": 1.1416, + "step": 3460 + }, + { + "epoch": 5.906382978723404, + "grad_norm": 1.1106643676757812, + "learning_rate": 0.0002, + "loss": 1.224, + "step": 3470 + }, + { + "epoch": 5.923404255319149, + "grad_norm": 0.9578408002853394, + "learning_rate": 0.0002, + "loss": 1.1955, + "step": 3480 + }, + { + "epoch": 5.940425531914894, + "grad_norm": 1.0225932598114014, + "learning_rate": 0.0002, + "loss": 1.2133, + "step": 3490 + }, + { + "epoch": 5.957446808510638, + "grad_norm": 0.9677667021751404, + "learning_rate": 0.0002, + "loss": 1.157, + "step": 3500 + }, + { + "epoch": 5.974468085106383, + "grad_norm": 1.0967241525650024, + "learning_rate": 0.0002, + "loss": 1.2196, + "step": 3510 + }, + { + "epoch": 5.991489361702127, + "grad_norm": 1.2497339248657227, + "learning_rate": 0.0002, + "loss": 1.1807, + "step": 3520 + }, + { + "epoch": 6.0, + "eval_loss": 2.0976572036743164, + "eval_runtime": 105.9679, + "eval_samples_per_second": 4.86, + "eval_steps_per_second": 0.613, + "step": 3525 + }, + { + "epoch": 6.008510638297873, + "grad_norm": 0.9660930037498474, + "learning_rate": 0.0002, + "loss": 1.0827, + "step": 3530 + }, + { + "epoch": 6.025531914893617, + "grad_norm": 0.9462300539016724, + "learning_rate": 0.0002, + "loss": 1.0043, + "step": 3540 + }, + { + "epoch": 6.042553191489362, + "grad_norm": 0.9312542676925659, + "learning_rate": 0.0002, + "loss": 1.0102, + "step": 3550 + }, + { + "epoch": 6.059574468085106, + "grad_norm": 1.3502222299575806, + "learning_rate": 0.0002, + "loss": 1.0356, + "step": 3560 + }, + { + "epoch": 6.076595744680851, + "grad_norm": 1.2838709354400635, + "learning_rate": 0.0002, + "loss": 0.9167, + "step": 3570 + }, + { + "epoch": 6.093617021276596, + "grad_norm": 1.1399385929107666, + "learning_rate": 0.0002, + "loss": 0.9381, + "step": 3580 + }, + { + "epoch": 6.110638297872341, + "grad_norm": 1.1763123273849487, + "learning_rate": 0.0002, + "loss": 0.9416, + "step": 3590 + }, + { + "epoch": 6.127659574468085, + "grad_norm": 1.113002061843872, + "learning_rate": 0.0002, + "loss": 0.9782, + "step": 3600 + }, + { + "epoch": 6.14468085106383, + "grad_norm": 1.0322953462600708, + "learning_rate": 0.0002, + "loss": 0.9521, + "step": 3610 + }, + { + "epoch": 6.1617021276595745, + "grad_norm": 1.2678894996643066, + "learning_rate": 0.0002, + "loss": 0.9114, + "step": 3620 + }, + { + "epoch": 6.178723404255319, + "grad_norm": 1.2370864152908325, + "learning_rate": 0.0002, + "loss": 0.9934, + "step": 3630 + }, + { + "epoch": 6.195744680851064, + "grad_norm": 1.1930763721466064, + "learning_rate": 0.0002, + "loss": 0.9753, + "step": 3640 + }, + { + "epoch": 6.212765957446808, + "grad_norm": 1.3608582019805908, + "learning_rate": 0.0002, + "loss": 0.9448, + "step": 3650 + }, + { + "epoch": 6.229787234042553, + "grad_norm": 1.2158547639846802, + "learning_rate": 0.0002, + "loss": 1.0201, + "step": 3660 + }, + { + "epoch": 6.246808510638298, + "grad_norm": 1.1505420207977295, + "learning_rate": 0.0002, + "loss": 0.9896, + "step": 3670 + }, + { + "epoch": 6.263829787234043, + "grad_norm": 1.3038114309310913, + "learning_rate": 0.0002, + "loss": 1.0088, + "step": 3680 + }, + { + "epoch": 6.280851063829787, + "grad_norm": 1.3900057077407837, + "learning_rate": 0.0002, + "loss": 1.0416, + "step": 3690 + }, + { + "epoch": 6.297872340425532, + "grad_norm": 1.196964144706726, + "learning_rate": 0.0002, + "loss": 0.9832, + "step": 3700 + }, + { + "epoch": 6.314893617021276, + "grad_norm": 1.205865740776062, + "learning_rate": 0.0002, + "loss": 1.0778, + "step": 3710 + }, + { + "epoch": 6.3319148936170215, + "grad_norm": 1.2710838317871094, + "learning_rate": 0.0002, + "loss": 1.0358, + "step": 3720 + }, + { + "epoch": 6.348936170212766, + "grad_norm": 1.285942554473877, + "learning_rate": 0.0002, + "loss": 1.0271, + "step": 3730 + }, + { + "epoch": 6.365957446808511, + "grad_norm": 1.1717636585235596, + "learning_rate": 0.0002, + "loss": 1.0164, + "step": 3740 + }, + { + "epoch": 6.382978723404255, + "grad_norm": 1.190883994102478, + "learning_rate": 0.0002, + "loss": 1.0557, + "step": 3750 + }, + { + "epoch": 6.4, + "grad_norm": 1.1623435020446777, + "learning_rate": 0.0002, + "loss": 1.0319, + "step": 3760 + }, + { + "epoch": 6.417021276595745, + "grad_norm": 1.2285547256469727, + "learning_rate": 0.0002, + "loss": 1.0633, + "step": 3770 + }, + { + "epoch": 6.43404255319149, + "grad_norm": 1.1142666339874268, + "learning_rate": 0.0002, + "loss": 1.0593, + "step": 3780 + }, + { + "epoch": 6.451063829787234, + "grad_norm": 1.333337664604187, + "learning_rate": 0.0002, + "loss": 1.0418, + "step": 3790 + }, + { + "epoch": 6.468085106382979, + "grad_norm": 1.350474238395691, + "learning_rate": 0.0002, + "loss": 1.0, + "step": 3800 + }, + { + "epoch": 6.485106382978723, + "grad_norm": 1.2439061403274536, + "learning_rate": 0.0002, + "loss": 1.1152, + "step": 3810 + }, + { + "epoch": 6.502127659574468, + "grad_norm": 1.2488664388656616, + "learning_rate": 0.0002, + "loss": 1.0915, + "step": 3820 + }, + { + "epoch": 6.519148936170213, + "grad_norm": 1.1990735530853271, + "learning_rate": 0.0002, + "loss": 1.0571, + "step": 3830 + }, + { + "epoch": 6.536170212765957, + "grad_norm": 1.5180301666259766, + "learning_rate": 0.0002, + "loss": 0.9895, + "step": 3840 + }, + { + "epoch": 6.553191489361702, + "grad_norm": 1.1273280382156372, + "learning_rate": 0.0002, + "loss": 0.9955, + "step": 3850 + }, + { + "epoch": 6.5702127659574465, + "grad_norm": 1.2778105735778809, + "learning_rate": 0.0002, + "loss": 1.0516, + "step": 3860 + }, + { + "epoch": 6.587234042553192, + "grad_norm": 1.1789685487747192, + "learning_rate": 0.0002, + "loss": 1.0039, + "step": 3870 + }, + { + "epoch": 6.604255319148936, + "grad_norm": 1.2061398029327393, + "learning_rate": 0.0002, + "loss": 1.0381, + "step": 3880 + }, + { + "epoch": 6.621276595744681, + "grad_norm": 1.104092001914978, + "learning_rate": 0.0002, + "loss": 1.0775, + "step": 3890 + }, + { + "epoch": 6.638297872340425, + "grad_norm": 1.2648544311523438, + "learning_rate": 0.0002, + "loss": 1.0591, + "step": 3900 + }, + { + "epoch": 6.6553191489361705, + "grad_norm": 1.2267687320709229, + "learning_rate": 0.0002, + "loss": 1.0535, + "step": 3910 + }, + { + "epoch": 6.672340425531915, + "grad_norm": 1.3252530097961426, + "learning_rate": 0.0002, + "loss": 1.0654, + "step": 3920 + }, + { + "epoch": 6.68936170212766, + "grad_norm": 1.284563660621643, + "learning_rate": 0.0002, + "loss": 1.0301, + "step": 3930 + }, + { + "epoch": 6.706382978723404, + "grad_norm": 1.293845534324646, + "learning_rate": 0.0002, + "loss": 1.102, + "step": 3940 + }, + { + "epoch": 6.723404255319149, + "grad_norm": 1.2290467023849487, + "learning_rate": 0.0002, + "loss": 1.1526, + "step": 3950 + }, + { + "epoch": 6.740425531914894, + "grad_norm": 1.1712737083435059, + "learning_rate": 0.0002, + "loss": 1.0474, + "step": 3960 + }, + { + "epoch": 6.757446808510638, + "grad_norm": 1.1728616952896118, + "learning_rate": 0.0002, + "loss": 1.0149, + "step": 3970 + }, + { + "epoch": 6.774468085106383, + "grad_norm": 1.154922604560852, + "learning_rate": 0.0002, + "loss": 1.0824, + "step": 3980 + }, + { + "epoch": 6.791489361702128, + "grad_norm": 1.4673690795898438, + "learning_rate": 0.0002, + "loss": 1.0961, + "step": 3990 + }, + { + "epoch": 6.808510638297872, + "grad_norm": 1.2338067293167114, + "learning_rate": 0.0002, + "loss": 0.9784, + "step": 4000 + }, + { + "epoch": 6.825531914893617, + "grad_norm": 1.0775316953659058, + "learning_rate": 0.0002, + "loss": 1.0975, + "step": 4010 + }, + { + "epoch": 6.842553191489362, + "grad_norm": 1.2518454790115356, + "learning_rate": 0.0002, + "loss": 1.0204, + "step": 4020 + }, + { + "epoch": 6.859574468085106, + "grad_norm": 1.3534432649612427, + "learning_rate": 0.0002, + "loss": 1.1425, + "step": 4030 + }, + { + "epoch": 6.876595744680851, + "grad_norm": 1.1217902898788452, + "learning_rate": 0.0002, + "loss": 1.1212, + "step": 4040 + }, + { + "epoch": 6.8936170212765955, + "grad_norm": 1.2672910690307617, + "learning_rate": 0.0002, + "loss": 1.0823, + "step": 4050 + }, + { + "epoch": 6.910638297872341, + "grad_norm": 1.3807674646377563, + "learning_rate": 0.0002, + "loss": 1.0817, + "step": 4060 + }, + { + "epoch": 6.927659574468085, + "grad_norm": 1.064530849456787, + "learning_rate": 0.0002, + "loss": 1.0576, + "step": 4070 + }, + { + "epoch": 6.94468085106383, + "grad_norm": 1.1286897659301758, + "learning_rate": 0.0002, + "loss": 1.0718, + "step": 4080 + }, + { + "epoch": 6.961702127659574, + "grad_norm": 1.3736463785171509, + "learning_rate": 0.0002, + "loss": 1.0574, + "step": 4090 + }, + { + "epoch": 6.9787234042553195, + "grad_norm": 1.3167431354522705, + "learning_rate": 0.0002, + "loss": 1.0621, + "step": 4100 + }, + { + "epoch": 6.995744680851064, + "grad_norm": 1.2784067392349243, + "learning_rate": 0.0002, + "loss": 0.9754, + "step": 4110 + }, + { + "epoch": 6.999148936170212, + "eval_loss": 2.260930299758911, + "eval_runtime": 106.0392, + "eval_samples_per_second": 4.857, + "eval_steps_per_second": 0.613, + "step": 4112 + }, + { + "epoch": 7.012765957446809, + "grad_norm": 1.1155035495758057, + "learning_rate": 0.0002, + "loss": 0.8211, + "step": 4120 + }, + { + "epoch": 7.029787234042553, + "grad_norm": 1.4007865190505981, + "learning_rate": 0.0002, + "loss": 0.8606, + "step": 4130 + }, + { + "epoch": 7.046808510638298, + "grad_norm": 1.4097480773925781, + "learning_rate": 0.0002, + "loss": 0.8303, + "step": 4140 + }, + { + "epoch": 7.0638297872340425, + "grad_norm": 1.5067437887191772, + "learning_rate": 0.0002, + "loss": 0.8095, + "step": 4150 + }, + { + "epoch": 7.080851063829787, + "grad_norm": 1.8971672058105469, + "learning_rate": 0.0002, + "loss": 0.8314, + "step": 4160 + }, + { + "epoch": 7.097872340425532, + "grad_norm": 1.257439136505127, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 4170 + }, + { + "epoch": 7.114893617021276, + "grad_norm": 1.3088364601135254, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 4180 + }, + { + "epoch": 7.131914893617021, + "grad_norm": 1.224184274673462, + "learning_rate": 0.0002, + "loss": 0.8555, + "step": 4190 + }, + { + "epoch": 7.148936170212766, + "grad_norm": 1.5408329963684082, + "learning_rate": 0.0002, + "loss": 0.8493, + "step": 4200 + }, + { + "epoch": 7.165957446808511, + "grad_norm": 1.6859279870986938, + "learning_rate": 0.0002, + "loss": 0.8345, + "step": 4210 + }, + { + "epoch": 7.182978723404255, + "grad_norm": 1.4212250709533691, + "learning_rate": 0.0002, + "loss": 0.8519, + "step": 4220 + }, + { + "epoch": 7.2, + "grad_norm": 1.5859991312026978, + "learning_rate": 0.0002, + "loss": 0.9346, + "step": 4230 + }, + { + "epoch": 7.217021276595744, + "grad_norm": 1.4653054475784302, + "learning_rate": 0.0002, + "loss": 0.8674, + "step": 4240 + }, + { + "epoch": 7.23404255319149, + "grad_norm": 1.567806363105774, + "learning_rate": 0.0002, + "loss": 0.913, + "step": 4250 + }, + { + "epoch": 7.251063829787234, + "grad_norm": 1.470809817314148, + "learning_rate": 0.0002, + "loss": 0.9355, + "step": 4260 + }, + { + "epoch": 7.268085106382979, + "grad_norm": 1.326292634010315, + "learning_rate": 0.0002, + "loss": 0.8575, + "step": 4270 + }, + { + "epoch": 7.285106382978723, + "grad_norm": 1.4706473350524902, + "learning_rate": 0.0002, + "loss": 0.8593, + "step": 4280 + }, + { + "epoch": 7.302127659574468, + "grad_norm": 1.9928194284439087, + "learning_rate": 0.0002, + "loss": 0.8788, + "step": 4290 + }, + { + "epoch": 7.319148936170213, + "grad_norm": 1.2895413637161255, + "learning_rate": 0.0002, + "loss": 0.8759, + "step": 4300 + }, + { + "epoch": 7.336170212765958, + "grad_norm": 1.5898326635360718, + "learning_rate": 0.0002, + "loss": 0.8887, + "step": 4310 + }, + { + "epoch": 7.353191489361702, + "grad_norm": 1.4953527450561523, + "learning_rate": 0.0002, + "loss": 0.8632, + "step": 4320 + }, + { + "epoch": 7.370212765957447, + "grad_norm": 1.465372085571289, + "learning_rate": 0.0002, + "loss": 0.8744, + "step": 4330 + }, + { + "epoch": 7.3872340425531915, + "grad_norm": 1.5092062950134277, + "learning_rate": 0.0002, + "loss": 0.8155, + "step": 4340 + }, + { + "epoch": 7.404255319148936, + "grad_norm": 1.3567780256271362, + "learning_rate": 0.0002, + "loss": 0.9551, + "step": 4350 + }, + { + "epoch": 7.421276595744681, + "grad_norm": 1.5023396015167236, + "learning_rate": 0.0002, + "loss": 0.8667, + "step": 4360 + }, + { + "epoch": 7.438297872340425, + "grad_norm": 1.6369168758392334, + "learning_rate": 0.0002, + "loss": 0.8515, + "step": 4370 + }, + { + "epoch": 7.45531914893617, + "grad_norm": 1.4093835353851318, + "learning_rate": 0.0002, + "loss": 0.9313, + "step": 4380 + }, + { + "epoch": 7.472340425531915, + "grad_norm": 1.2725355625152588, + "learning_rate": 0.0002, + "loss": 0.861, + "step": 4390 + }, + { + "epoch": 7.48936170212766, + "grad_norm": 1.455870509147644, + "learning_rate": 0.0002, + "loss": 0.9065, + "step": 4400 + }, + { + "epoch": 7.506382978723404, + "grad_norm": 1.2592545747756958, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 4410 + }, + { + "epoch": 7.523404255319149, + "grad_norm": 1.614005208015442, + "learning_rate": 0.0002, + "loss": 0.9038, + "step": 4420 + }, + { + "epoch": 7.540425531914893, + "grad_norm": 1.4367144107818604, + "learning_rate": 0.0002, + "loss": 0.9177, + "step": 4430 + }, + { + "epoch": 7.5574468085106385, + "grad_norm": 1.3691469430923462, + "learning_rate": 0.0002, + "loss": 0.8685, + "step": 4440 + }, + { + "epoch": 7.574468085106383, + "grad_norm": 1.6138449907302856, + "learning_rate": 0.0002, + "loss": 0.8757, + "step": 4450 + }, + { + "epoch": 7.591489361702128, + "grad_norm": 1.3140075206756592, + "learning_rate": 0.0002, + "loss": 0.8597, + "step": 4460 + }, + { + "epoch": 7.608510638297872, + "grad_norm": 1.482589602470398, + "learning_rate": 0.0002, + "loss": 0.9237, + "step": 4470 + }, + { + "epoch": 7.625531914893617, + "grad_norm": 1.404107928276062, + "learning_rate": 0.0002, + "loss": 0.9249, + "step": 4480 + }, + { + "epoch": 7.642553191489362, + "grad_norm": 1.6977661848068237, + "learning_rate": 0.0002, + "loss": 0.9213, + "step": 4490 + }, + { + "epoch": 7.659574468085106, + "grad_norm": 1.4678088426589966, + "learning_rate": 0.0002, + "loss": 0.8681, + "step": 4500 + }, + { + "epoch": 7.676595744680851, + "grad_norm": 1.7297770977020264, + "learning_rate": 0.0002, + "loss": 0.9467, + "step": 4510 + }, + { + "epoch": 7.693617021276596, + "grad_norm": 1.5900875329971313, + "learning_rate": 0.0002, + "loss": 0.93, + "step": 4520 + }, + { + "epoch": 7.7106382978723405, + "grad_norm": 1.620308756828308, + "learning_rate": 0.0002, + "loss": 0.9499, + "step": 4530 + }, + { + "epoch": 7.727659574468085, + "grad_norm": 1.4710882902145386, + "learning_rate": 0.0002, + "loss": 1.002, + "step": 4540 + }, + { + "epoch": 7.74468085106383, + "grad_norm": 1.51741361618042, + "learning_rate": 0.0002, + "loss": 0.9126, + "step": 4550 + }, + { + "epoch": 7.761702127659574, + "grad_norm": 1.5683188438415527, + "learning_rate": 0.0002, + "loss": 0.9209, + "step": 4560 + }, + { + "epoch": 7.778723404255319, + "grad_norm": 1.387294888496399, + "learning_rate": 0.0002, + "loss": 0.9852, + "step": 4570 + }, + { + "epoch": 7.7957446808510635, + "grad_norm": 1.3634133338928223, + "learning_rate": 0.0002, + "loss": 0.9205, + "step": 4580 + }, + { + "epoch": 7.812765957446809, + "grad_norm": 1.469403624534607, + "learning_rate": 0.0002, + "loss": 0.9959, + "step": 4590 + }, + { + "epoch": 7.829787234042553, + "grad_norm": 1.5683388710021973, + "learning_rate": 0.0002, + "loss": 0.8934, + "step": 4600 + }, + { + "epoch": 7.846808510638298, + "grad_norm": 1.3234552145004272, + "learning_rate": 0.0002, + "loss": 0.8806, + "step": 4610 + }, + { + "epoch": 7.863829787234042, + "grad_norm": 1.2532844543457031, + "learning_rate": 0.0002, + "loss": 0.9353, + "step": 4620 + }, + { + "epoch": 7.8808510638297875, + "grad_norm": 1.3591208457946777, + "learning_rate": 0.0002, + "loss": 0.8865, + "step": 4630 + }, + { + "epoch": 7.897872340425532, + "grad_norm": 1.366128921508789, + "learning_rate": 0.0002, + "loss": 0.9419, + "step": 4640 + }, + { + "epoch": 7.914893617021277, + "grad_norm": 1.3230071067810059, + "learning_rate": 0.0002, + "loss": 0.9076, + "step": 4650 + }, + { + "epoch": 7.931914893617021, + "grad_norm": 1.3713736534118652, + "learning_rate": 0.0002, + "loss": 0.9076, + "step": 4660 + }, + { + "epoch": 7.948936170212766, + "grad_norm": 1.4915863275527954, + "learning_rate": 0.0002, + "loss": 0.9455, + "step": 4670 + }, + { + "epoch": 7.965957446808511, + "grad_norm": 1.1782197952270508, + "learning_rate": 0.0002, + "loss": 0.8768, + "step": 4680 + }, + { + "epoch": 7.982978723404255, + "grad_norm": 1.3456854820251465, + "learning_rate": 0.0002, + "loss": 0.93, + "step": 4690 + }, + { + "epoch": 7.993191489361702, + "eval_loss": 2.3770549297332764, + "eval_runtime": 107.186, + "eval_samples_per_second": 4.805, + "eval_steps_per_second": 0.606, + "step": 4696 + } + ], + "logging_steps": 10, + "max_steps": 4696, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.1732036523012915e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..40bc5c2205bcd1402ce337f3a218ef4840b3870a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-4696/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:148f55f87c9f695bdf2cf6d54b37e690c06ff5da5e17bc3af6c1f44f90f45374 +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..55092d52da6531de5855a48ed8ad88204a5d091a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0885db5696862a9f827451ce08861ba6cdff536f0f3a4d966ec82aec1e800dc +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe633c5d1761e456ffe47f0dc1cc65b21cb453e0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7a6d38ff5a5b1fa5b61b5508b341b97d606f07ea9501b3e5adb65ef4d908e0d +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ec1ece502212a9b86365e5de8e74225bc664e64b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71da6bc7745b58e4d783cbb59d348e9c645073417e2d3e6ea126f870f57390b4 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..66c612c8e58eb69e457a4531b840f27005888ab9 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3e2f8662e12de311299efa373704fcb1a782f088ff56f7e82d05161c310deac +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1cd75f679df1135372f5605c5e22bb5cdf97ed05 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/trainer_state.json @@ -0,0 +1,447 @@ +{ + "best_metric": 1.8388911485671997, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587", + "epoch": 0.9991489361702127, + "eval_steps": 10, + "global_step": 587, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01702127659574468, + "grad_norm": 0.7596228122711182, + "learning_rate": 0.0002, + "loss": 2.5996, + "step": 10 + }, + { + "epoch": 0.03404255319148936, + "grad_norm": 0.4860903322696686, + "learning_rate": 0.0002, + "loss": 2.2941, + "step": 20 + }, + { + "epoch": 0.05106382978723404, + "grad_norm": 0.4953401982784271, + "learning_rate": 0.0002, + "loss": 2.0018, + "step": 30 + }, + { + "epoch": 0.06808510638297872, + "grad_norm": 0.5086901783943176, + "learning_rate": 0.0002, + "loss": 1.9318, + "step": 40 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 0.49050021171569824, + "learning_rate": 0.0002, + "loss": 1.8519, + "step": 50 + }, + { + "epoch": 0.10212765957446808, + "grad_norm": 0.4922358989715576, + "learning_rate": 0.0002, + "loss": 1.8786, + "step": 60 + }, + { + "epoch": 0.11914893617021277, + "grad_norm": 0.4621541202068329, + "learning_rate": 0.0002, + "loss": 1.8812, + "step": 70 + }, + { + "epoch": 0.13617021276595745, + "grad_norm": 0.4416729807853699, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 80 + }, + { + "epoch": 0.15319148936170213, + "grad_norm": 0.526258111000061, + "learning_rate": 0.0002, + "loss": 1.9298, + "step": 90 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 0.44022637605667114, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 100 + }, + { + "epoch": 0.18723404255319148, + "grad_norm": 0.4647711515426636, + "learning_rate": 0.0002, + "loss": 1.7681, + "step": 110 + }, + { + "epoch": 0.20425531914893616, + "grad_norm": 0.4136318564414978, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 120 + }, + { + "epoch": 0.22127659574468084, + "grad_norm": 0.39707672595977783, + "learning_rate": 0.0002, + "loss": 1.8491, + "step": 130 + }, + { + "epoch": 0.23829787234042554, + "grad_norm": 0.4478105306625366, + "learning_rate": 0.0002, + "loss": 1.8017, + "step": 140 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 0.4699741303920746, + "learning_rate": 0.0002, + "loss": 1.8067, + "step": 150 + }, + { + "epoch": 0.2723404255319149, + "grad_norm": 0.4568363130092621, + "learning_rate": 0.0002, + "loss": 1.8161, + "step": 160 + }, + { + "epoch": 0.28936170212765955, + "grad_norm": 0.45078757405281067, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 170 + }, + { + "epoch": 0.30638297872340425, + "grad_norm": 0.4127245843410492, + "learning_rate": 0.0002, + "loss": 1.8115, + "step": 180 + }, + { + "epoch": 0.32340425531914896, + "grad_norm": 0.4042493402957916, + "learning_rate": 0.0002, + "loss": 1.7707, + "step": 190 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 0.401487797498703, + "learning_rate": 0.0002, + "loss": 1.7807, + "step": 200 + }, + { + "epoch": 0.3574468085106383, + "grad_norm": 0.3959457576274872, + "learning_rate": 0.0002, + "loss": 1.8355, + "step": 210 + }, + { + "epoch": 0.37446808510638296, + "grad_norm": 0.39865636825561523, + "learning_rate": 0.0002, + "loss": 1.7798, + "step": 220 + }, + { + "epoch": 0.39148936170212767, + "grad_norm": 0.7225169539451599, + "learning_rate": 0.0002, + "loss": 1.9012, + "step": 230 + }, + { + "epoch": 0.4085106382978723, + "grad_norm": 0.412801593542099, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 240 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 0.40951448678970337, + "learning_rate": 0.0002, + "loss": 1.8523, + "step": 250 + }, + { + "epoch": 0.4425531914893617, + "grad_norm": 0.42788130044937134, + "learning_rate": 0.0002, + "loss": 1.7283, + "step": 260 + }, + { + "epoch": 0.4595744680851064, + "grad_norm": 0.41069576144218445, + "learning_rate": 0.0002, + "loss": 1.8051, + "step": 270 + }, + { + "epoch": 0.4765957446808511, + "grad_norm": 0.3745323717594147, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 280 + }, + { + "epoch": 0.49361702127659574, + "grad_norm": 0.3771323263645172, + "learning_rate": 0.0002, + "loss": 1.8484, + "step": 290 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 0.34368929266929626, + "learning_rate": 0.0002, + "loss": 1.7417, + "step": 300 + }, + { + "epoch": 0.5276595744680851, + "grad_norm": 0.4299296736717224, + "learning_rate": 0.0002, + "loss": 1.7394, + "step": 310 + }, + { + "epoch": 0.5446808510638298, + "grad_norm": 0.4133922755718231, + "learning_rate": 0.0002, + "loss": 1.8255, + "step": 320 + }, + { + "epoch": 0.5617021276595745, + "grad_norm": 0.3984859585762024, + "learning_rate": 0.0002, + "loss": 1.7043, + "step": 330 + }, + { + "epoch": 0.5787234042553191, + "grad_norm": 0.3822788894176483, + "learning_rate": 0.0002, + "loss": 1.7873, + "step": 340 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 0.4550061821937561, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 350 + }, + { + "epoch": 0.6127659574468085, + "grad_norm": 0.36571192741394043, + "learning_rate": 0.0002, + "loss": 1.7179, + "step": 360 + }, + { + "epoch": 0.6297872340425532, + "grad_norm": 0.32942914962768555, + "learning_rate": 0.0002, + "loss": 1.8196, + "step": 370 + }, + { + "epoch": 0.6468085106382979, + "grad_norm": 0.39299526810646057, + "learning_rate": 0.0002, + "loss": 1.7118, + "step": 380 + }, + { + "epoch": 0.6638297872340425, + "grad_norm": 0.3817657232284546, + "learning_rate": 0.0002, + "loss": 1.8179, + "step": 390 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 0.3650810122489929, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 400 + }, + { + "epoch": 0.6978723404255319, + "grad_norm": 0.3736686408519745, + "learning_rate": 0.0002, + "loss": 1.8026, + "step": 410 + }, + { + "epoch": 0.7148936170212766, + "grad_norm": 0.45680564641952515, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 420 + }, + { + "epoch": 0.7319148936170212, + "grad_norm": 0.4154510200023651, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 430 + }, + { + "epoch": 0.7489361702127659, + "grad_norm": 0.3701167106628418, + "learning_rate": 0.0002, + "loss": 1.7801, + "step": 440 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 0.3869531750679016, + "learning_rate": 0.0002, + "loss": 1.7689, + "step": 450 + }, + { + "epoch": 0.7829787234042553, + "grad_norm": 0.4391495883464813, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 460 + }, + { + "epoch": 0.8, + "grad_norm": 0.39652755856513977, + "learning_rate": 0.0002, + "loss": 1.7534, + "step": 470 + }, + { + "epoch": 0.8170212765957446, + "grad_norm": 0.4096752107143402, + "learning_rate": 0.0002, + "loss": 1.8076, + "step": 480 + }, + { + "epoch": 0.8340425531914893, + "grad_norm": 0.3857504427433014, + "learning_rate": 0.0002, + "loss": 1.796, + "step": 490 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 0.4105374217033386, + "learning_rate": 0.0002, + "loss": 1.7379, + "step": 500 + }, + { + "epoch": 0.8680851063829788, + "grad_norm": 0.3723328113555908, + "learning_rate": 0.0002, + "loss": 1.6391, + "step": 510 + }, + { + "epoch": 0.8851063829787233, + "grad_norm": 0.36099690198898315, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 520 + }, + { + "epoch": 0.902127659574468, + "grad_norm": 0.3715187907218933, + "learning_rate": 0.0002, + "loss": 1.7632, + "step": 530 + }, + { + "epoch": 0.9191489361702128, + "grad_norm": 0.4932813048362732, + "learning_rate": 0.0002, + "loss": 1.7004, + "step": 540 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 0.3493495285511017, + "learning_rate": 0.0002, + "loss": 1.679, + "step": 550 + }, + { + "epoch": 0.9531914893617022, + "grad_norm": 0.3598061800003052, + "learning_rate": 0.0002, + "loss": 1.7758, + "step": 560 + }, + { + "epoch": 0.9702127659574468, + "grad_norm": 0.3521560728549957, + "learning_rate": 0.0002, + "loss": 1.7686, + "step": 570 + }, + { + "epoch": 0.9872340425531915, + "grad_norm": 0.34150034189224243, + "learning_rate": 0.0002, + "loss": 1.7391, + "step": 580 + }, + { + "epoch": 0.9991489361702127, + "eval_loss": 1.8388911485671997, + "eval_runtime": 106.6788, + "eval_samples_per_second": 4.828, + "eval_steps_per_second": 0.609, + "step": 587 + } + ], + "logging_steps": 10, + "max_steps": 4696, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.71881845342208e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..40bc5c2205bcd1402ce337f3a218ef4840b3870a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:148f55f87c9f695bdf2cf6d54b37e690c06ff5da5e17bc3af6c1f44f90f45374 +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..40bc5c2205bcd1402ce337f3a218ef4840b3870a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:148f55f87c9f695bdf2cf6d54b37e690c06ff5da5e17bc3af6c1f44f90f45374 +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/training_log.jsonl b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..347fd0c1216cae40b6e5470a7123d7c81aeaaa4a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/training_log.jsonl @@ -0,0 +1,8 @@ +{"epoch": 0.9991489361702127, "step": 587, "epoch_duration": 2050.0382311344147, "total_accumulated_duration": 2050.0382311344147, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 9688.99365234375}, "avg_memory_reserved": {"GPU_0": 10406.0}, "peak_memory_reserved": {"GPU_0": 10406.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5996, "grad_norm": 0.7596228122711182, "learning_rate": 0.0002, "epoch": 0.01702127659574468, "step": 10}, {"loss": 2.2941, "grad_norm": 0.4860903322696686, "learning_rate": 0.0002, "epoch": 0.03404255319148936, "step": 20}, {"loss": 2.0018, "grad_norm": 0.4953401982784271, "learning_rate": 0.0002, "epoch": 0.05106382978723404, "step": 30}, {"loss": 1.9318, "grad_norm": 0.5086901783943176, "learning_rate": 0.0002, "epoch": 0.06808510638297872, "step": 40}, {"loss": 1.8519, "grad_norm": 0.49050021171569824, "learning_rate": 0.0002, "epoch": 0.0851063829787234, "step": 50}, {"loss": 1.8786, "grad_norm": 0.4922358989715576, "learning_rate": 0.0002, "epoch": 0.10212765957446808, "step": 60}, {"loss": 1.8812, "grad_norm": 0.4621541202068329, "learning_rate": 0.0002, "epoch": 0.11914893617021277, "step": 70}, {"loss": 1.8004, "grad_norm": 0.4416729807853699, "learning_rate": 0.0002, "epoch": 0.13617021276595745, "step": 80}, {"loss": 1.9298, "grad_norm": 0.526258111000061, "learning_rate": 0.0002, "epoch": 0.15319148936170213, "step": 90}, {"loss": 1.8339, "grad_norm": 0.44022637605667114, "learning_rate": 0.0002, "epoch": 0.1702127659574468, "step": 100}, {"loss": 1.7681, "grad_norm": 0.4647711515426636, "learning_rate": 0.0002, "epoch": 0.18723404255319148, "step": 110}, {"loss": 1.6953, "grad_norm": 0.4136318564414978, "learning_rate": 0.0002, "epoch": 0.20425531914893616, "step": 120}, {"loss": 1.8491, "grad_norm": 0.39707672595977783, "learning_rate": 0.0002, "epoch": 0.22127659574468084, "step": 130}, {"loss": 1.8017, "grad_norm": 0.4478105306625366, "learning_rate": 0.0002, "epoch": 0.23829787234042554, "step": 140}, {"loss": 1.8067, "grad_norm": 0.4699741303920746, "learning_rate": 0.0002, "epoch": 0.2553191489361702, "step": 150}, {"loss": 1.8161, "grad_norm": 0.4568363130092621, "learning_rate": 0.0002, "epoch": 0.2723404255319149, "step": 160}, {"loss": 1.7491, "grad_norm": 0.45078757405281067, "learning_rate": 0.0002, "epoch": 0.28936170212765955, "step": 170}, {"loss": 1.8115, "grad_norm": 0.4127245843410492, "learning_rate": 0.0002, "epoch": 0.30638297872340425, "step": 180}, {"loss": 1.7707, "grad_norm": 0.4042493402957916, "learning_rate": 0.0002, "epoch": 0.32340425531914896, "step": 190}, {"loss": 1.7807, "grad_norm": 0.401487797498703, "learning_rate": 0.0002, "epoch": 0.3404255319148936, "step": 200}, {"loss": 1.8355, "grad_norm": 0.3959457576274872, "learning_rate": 0.0002, "epoch": 0.3574468085106383, "step": 210}, {"loss": 1.7798, "grad_norm": 0.39865636825561523, "learning_rate": 0.0002, "epoch": 0.37446808510638296, "step": 220}, {"loss": 1.9012, "grad_norm": 0.7225169539451599, "learning_rate": 0.0002, "epoch": 0.39148936170212767, "step": 230}, {"loss": 1.7212, "grad_norm": 0.412801593542099, "learning_rate": 0.0002, "epoch": 0.4085106382978723, "step": 240}, {"loss": 1.8523, "grad_norm": 0.40951448678970337, "learning_rate": 0.0002, "epoch": 0.425531914893617, "step": 250}, {"loss": 1.7283, "grad_norm": 0.42788130044937134, "learning_rate": 0.0002, "epoch": 0.4425531914893617, "step": 260}, {"loss": 1.8051, "grad_norm": 0.41069576144218445, "learning_rate": 0.0002, "epoch": 0.4595744680851064, "step": 270}, {"loss": 1.7861, "grad_norm": 0.3745323717594147, "learning_rate": 0.0002, "epoch": 0.4765957446808511, "step": 280}, {"loss": 1.8484, "grad_norm": 0.3771323263645172, "learning_rate": 0.0002, "epoch": 0.49361702127659574, "step": 290}, {"loss": 1.7417, "grad_norm": 0.34368929266929626, "learning_rate": 0.0002, "epoch": 0.5106382978723404, "step": 300}, {"loss": 1.7394, "grad_norm": 0.4299296736717224, "learning_rate": 0.0002, "epoch": 0.5276595744680851, "step": 310}, {"loss": 1.8255, "grad_norm": 0.4133922755718231, "learning_rate": 0.0002, "epoch": 0.5446808510638298, "step": 320}, {"loss": 1.7043, "grad_norm": 0.3984859585762024, "learning_rate": 0.0002, "epoch": 0.5617021276595745, "step": 330}, {"loss": 1.7873, "grad_norm": 0.3822788894176483, "learning_rate": 0.0002, "epoch": 0.5787234042553191, "step": 340}, {"loss": 1.8082, "grad_norm": 0.4550061821937561, "learning_rate": 0.0002, "epoch": 0.5957446808510638, "step": 350}, {"loss": 1.7179, "grad_norm": 0.36571192741394043, "learning_rate": 0.0002, "epoch": 0.6127659574468085, "step": 360}, {"loss": 1.8196, "grad_norm": 0.32942914962768555, "learning_rate": 0.0002, "epoch": 0.6297872340425532, "step": 370}, {"loss": 1.7118, "grad_norm": 0.39299526810646057, "learning_rate": 0.0002, "epoch": 0.6468085106382979, "step": 380}, {"loss": 1.8179, "grad_norm": 0.3817657232284546, "learning_rate": 0.0002, "epoch": 0.6638297872340425, "step": 390}, {"loss": 1.8174, "grad_norm": 0.3650810122489929, "learning_rate": 0.0002, "epoch": 0.6808510638297872, "step": 400}, {"loss": 1.8026, "grad_norm": 0.3736686408519745, "learning_rate": 0.0002, "epoch": 0.6978723404255319, "step": 410}, {"loss": 1.802, "grad_norm": 0.45680564641952515, "learning_rate": 0.0002, "epoch": 0.7148936170212766, "step": 420}, {"loss": 1.7844, "grad_norm": 0.4154510200023651, "learning_rate": 0.0002, "epoch": 0.7319148936170212, "step": 430}, {"loss": 1.7801, "grad_norm": 0.3701167106628418, "learning_rate": 0.0002, "epoch": 0.7489361702127659, "step": 440}, {"loss": 1.7689, "grad_norm": 0.3869531750679016, "learning_rate": 0.0002, "epoch": 0.7659574468085106, "step": 450}, {"loss": 1.7671, "grad_norm": 0.4391495883464813, "learning_rate": 0.0002, "epoch": 0.7829787234042553, "step": 460}, {"loss": 1.7534, "grad_norm": 0.39652755856513977, "learning_rate": 0.0002, "epoch": 0.8, "step": 470}, {"loss": 1.8076, "grad_norm": 0.4096752107143402, "learning_rate": 0.0002, "epoch": 0.8170212765957446, "step": 480}, {"loss": 1.796, "grad_norm": 0.3857504427433014, "learning_rate": 0.0002, "epoch": 0.8340425531914893, "step": 490}, {"loss": 1.7379, "grad_norm": 0.4105374217033386, "learning_rate": 0.0002, "epoch": 0.851063829787234, "step": 500}, {"loss": 1.6391, "grad_norm": 0.3723328113555908, "learning_rate": 0.0002, "epoch": 0.8680851063829788, "step": 510}, {"loss": 1.7122, "grad_norm": 0.36099690198898315, "learning_rate": 0.0002, "epoch": 0.8851063829787233, "step": 520}, {"loss": 1.7632, "grad_norm": 0.3715187907218933, "learning_rate": 0.0002, "epoch": 0.902127659574468, "step": 530}, {"loss": 1.7004, "grad_norm": 0.4932813048362732, "learning_rate": 0.0002, "epoch": 0.9191489361702128, "step": 540}, {"loss": 1.679, "grad_norm": 0.3493495285511017, "learning_rate": 0.0002, "epoch": 0.9361702127659575, "step": 550}, {"loss": 1.7758, "grad_norm": 0.3598061800003052, "learning_rate": 0.0002, "epoch": 0.9531914893617022, "step": 560}, {"loss": 1.7686, "grad_norm": 0.3521560728549957, "learning_rate": 0.0002, "epoch": 0.9702127659574468, "step": 570}, {"loss": 1.7391, "grad_norm": 0.34150034189224243, "learning_rate": 0.0002, "epoch": 0.9872340425531915, "step": 580}]} +{"epoch": 2.0, "step": 1175, "epoch_duration": 2054.381093263626, "total_accumulated_duration": 4104.419324398041, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-587", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5996, "grad_norm": 0.7596228122711182, "learning_rate": 0.0002, "epoch": 0.01702127659574468, "step": 10}, {"loss": 2.2941, "grad_norm": 0.4860903322696686, "learning_rate": 0.0002, "epoch": 0.03404255319148936, "step": 20}, {"loss": 2.0018, "grad_norm": 0.4953401982784271, "learning_rate": 0.0002, "epoch": 0.05106382978723404, "step": 30}, {"loss": 1.9318, "grad_norm": 0.5086901783943176, "learning_rate": 0.0002, "epoch": 0.06808510638297872, "step": 40}, {"loss": 1.8519, "grad_norm": 0.49050021171569824, "learning_rate": 0.0002, "epoch": 0.0851063829787234, "step": 50}, {"loss": 1.8786, "grad_norm": 0.4922358989715576, "learning_rate": 0.0002, "epoch": 0.10212765957446808, "step": 60}, {"loss": 1.8812, "grad_norm": 0.4621541202068329, "learning_rate": 0.0002, "epoch": 0.11914893617021277, "step": 70}, {"loss": 1.8004, "grad_norm": 0.4416729807853699, "learning_rate": 0.0002, "epoch": 0.13617021276595745, "step": 80}, {"loss": 1.9298, "grad_norm": 0.526258111000061, "learning_rate": 0.0002, "epoch": 0.15319148936170213, "step": 90}, {"loss": 1.8339, "grad_norm": 0.44022637605667114, "learning_rate": 0.0002, "epoch": 0.1702127659574468, "step": 100}, {"loss": 1.7681, "grad_norm": 0.4647711515426636, "learning_rate": 0.0002, "epoch": 0.18723404255319148, "step": 110}, {"loss": 1.6953, "grad_norm": 0.4136318564414978, "learning_rate": 0.0002, "epoch": 0.20425531914893616, "step": 120}, {"loss": 1.8491, "grad_norm": 0.39707672595977783, "learning_rate": 0.0002, "epoch": 0.22127659574468084, "step": 130}, {"loss": 1.8017, "grad_norm": 0.4478105306625366, "learning_rate": 0.0002, "epoch": 0.23829787234042554, "step": 140}, {"loss": 1.8067, "grad_norm": 0.4699741303920746, "learning_rate": 0.0002, "epoch": 0.2553191489361702, "step": 150}, {"loss": 1.8161, "grad_norm": 0.4568363130092621, "learning_rate": 0.0002, "epoch": 0.2723404255319149, "step": 160}, {"loss": 1.7491, "grad_norm": 0.45078757405281067, "learning_rate": 0.0002, "epoch": 0.28936170212765955, "step": 170}, {"loss": 1.8115, "grad_norm": 0.4127245843410492, "learning_rate": 0.0002, "epoch": 0.30638297872340425, "step": 180}, {"loss": 1.7707, "grad_norm": 0.4042493402957916, "learning_rate": 0.0002, "epoch": 0.32340425531914896, "step": 190}, {"loss": 1.7807, "grad_norm": 0.401487797498703, "learning_rate": 0.0002, "epoch": 0.3404255319148936, "step": 200}, {"loss": 1.8355, "grad_norm": 0.3959457576274872, "learning_rate": 0.0002, "epoch": 0.3574468085106383, "step": 210}, {"loss": 1.7798, "grad_norm": 0.39865636825561523, "learning_rate": 0.0002, "epoch": 0.37446808510638296, "step": 220}, {"loss": 1.9012, "grad_norm": 0.7225169539451599, "learning_rate": 0.0002, "epoch": 0.39148936170212767, "step": 230}, {"loss": 1.7212, "grad_norm": 0.412801593542099, "learning_rate": 0.0002, "epoch": 0.4085106382978723, "step": 240}, {"loss": 1.8523, "grad_norm": 0.40951448678970337, "learning_rate": 0.0002, "epoch": 0.425531914893617, "step": 250}, {"loss": 1.7283, "grad_norm": 0.42788130044937134, "learning_rate": 0.0002, "epoch": 0.4425531914893617, "step": 260}, {"loss": 1.8051, "grad_norm": 0.41069576144218445, "learning_rate": 0.0002, "epoch": 0.4595744680851064, "step": 270}, {"loss": 1.7861, "grad_norm": 0.3745323717594147, "learning_rate": 0.0002, "epoch": 0.4765957446808511, "step": 280}, {"loss": 1.8484, "grad_norm": 0.3771323263645172, "learning_rate": 0.0002, "epoch": 0.49361702127659574, "step": 290}, {"loss": 1.7417, "grad_norm": 0.34368929266929626, "learning_rate": 0.0002, "epoch": 0.5106382978723404, "step": 300}, {"loss": 1.7394, "grad_norm": 0.4299296736717224, "learning_rate": 0.0002, "epoch": 0.5276595744680851, "step": 310}, {"loss": 1.8255, "grad_norm": 0.4133922755718231, "learning_rate": 0.0002, "epoch": 0.5446808510638298, "step": 320}, {"loss": 1.7043, "grad_norm": 0.3984859585762024, "learning_rate": 0.0002, "epoch": 0.5617021276595745, "step": 330}, {"loss": 1.7873, "grad_norm": 0.3822788894176483, "learning_rate": 0.0002, "epoch": 0.5787234042553191, "step": 340}, {"loss": 1.8082, "grad_norm": 0.4550061821937561, "learning_rate": 0.0002, "epoch": 0.5957446808510638, "step": 350}, {"loss": 1.7179, "grad_norm": 0.36571192741394043, "learning_rate": 0.0002, "epoch": 0.6127659574468085, "step": 360}, {"loss": 1.8196, "grad_norm": 0.32942914962768555, "learning_rate": 0.0002, "epoch": 0.6297872340425532, "step": 370}, {"loss": 1.7118, "grad_norm": 0.39299526810646057, "learning_rate": 0.0002, "epoch": 0.6468085106382979, "step": 380}, {"loss": 1.8179, "grad_norm": 0.3817657232284546, "learning_rate": 0.0002, "epoch": 0.6638297872340425, "step": 390}, {"loss": 1.8174, "grad_norm": 0.3650810122489929, "learning_rate": 0.0002, "epoch": 0.6808510638297872, "step": 400}, {"loss": 1.8026, "grad_norm": 0.3736686408519745, "learning_rate": 0.0002, "epoch": 0.6978723404255319, "step": 410}, {"loss": 1.802, "grad_norm": 0.45680564641952515, "learning_rate": 0.0002, "epoch": 0.7148936170212766, "step": 420}, {"loss": 1.7844, "grad_norm": 0.4154510200023651, "learning_rate": 0.0002, "epoch": 0.7319148936170212, "step": 430}, {"loss": 1.7801, "grad_norm": 0.3701167106628418, "learning_rate": 0.0002, "epoch": 0.7489361702127659, "step": 440}, {"loss": 1.7689, "grad_norm": 0.3869531750679016, "learning_rate": 0.0002, "epoch": 0.7659574468085106, "step": 450}, {"loss": 1.7671, "grad_norm": 0.4391495883464813, "learning_rate": 0.0002, "epoch": 0.7829787234042553, "step": 460}, {"loss": 1.7534, "grad_norm": 0.39652755856513977, "learning_rate": 0.0002, "epoch": 0.8, "step": 470}, {"loss": 1.8076, "grad_norm": 0.4096752107143402, "learning_rate": 0.0002, "epoch": 0.8170212765957446, "step": 480}, {"loss": 1.796, "grad_norm": 0.3857504427433014, "learning_rate": 0.0002, "epoch": 0.8340425531914893, "step": 490}, {"loss": 1.7379, "grad_norm": 0.4105374217033386, "learning_rate": 0.0002, "epoch": 0.851063829787234, "step": 500}, {"loss": 1.6391, "grad_norm": 0.3723328113555908, "learning_rate": 0.0002, "epoch": 0.8680851063829788, "step": 510}, {"loss": 1.7122, "grad_norm": 0.36099690198898315, "learning_rate": 0.0002, "epoch": 0.8851063829787233, "step": 520}, {"loss": 1.7632, "grad_norm": 0.3715187907218933, "learning_rate": 0.0002, "epoch": 0.902127659574468, "step": 530}, {"loss": 1.7004, "grad_norm": 0.4932813048362732, "learning_rate": 0.0002, "epoch": 0.9191489361702128, "step": 540}, {"loss": 1.679, "grad_norm": 0.3493495285511017, "learning_rate": 0.0002, "epoch": 0.9361702127659575, "step": 550}, {"loss": 1.7758, "grad_norm": 0.3598061800003052, "learning_rate": 0.0002, "epoch": 0.9531914893617022, "step": 560}, {"loss": 1.7686, "grad_norm": 0.3521560728549957, "learning_rate": 0.0002, "epoch": 0.9702127659574468, "step": 570}, {"loss": 1.7391, "grad_norm": 0.34150034189224243, "learning_rate": 0.0002, "epoch": 0.9872340425531915, "step": 580}, {"eval_loss": 1.8388911485671997, "eval_runtime": 106.6788, "eval_samples_per_second": 4.828, "eval_steps_per_second": 0.609, "epoch": 0.9991489361702127, "step": 587}, {"loss": 1.6115, "grad_norm": 0.33556103706359863, "learning_rate": 0.0002, "epoch": 1.004255319148936, "step": 590}, {"loss": 1.8346, "grad_norm": 0.4333398640155792, "learning_rate": 0.0002, "epoch": 1.0212765957446808, "step": 600}, {"loss": 1.6656, "grad_norm": 0.38488736748695374, "learning_rate": 0.0002, "epoch": 1.0382978723404255, "step": 610}, {"loss": 1.7778, "grad_norm": 0.44454529881477356, "learning_rate": 0.0002, "epoch": 1.0553191489361702, "step": 620}, {"loss": 1.7143, "grad_norm": 0.3735603392124176, "learning_rate": 0.0002, "epoch": 1.0723404255319149, "step": 630}, {"loss": 1.74, "grad_norm": 0.38912704586982727, "learning_rate": 0.0002, "epoch": 1.0893617021276596, "step": 640}, {"loss": 1.607, "grad_norm": 0.4411826431751251, "learning_rate": 0.0002, "epoch": 1.1063829787234043, "step": 650}, {"loss": 1.6901, "grad_norm": 0.4163050353527069, "learning_rate": 0.0002, "epoch": 1.123404255319149, "step": 660}, {"loss": 1.6722, "grad_norm": 0.4187192916870117, "learning_rate": 0.0002, "epoch": 1.1404255319148937, "step": 670}, {"loss": 1.7061, "grad_norm": 0.3797093629837036, "learning_rate": 0.0002, "epoch": 1.1574468085106382, "step": 680}, {"loss": 1.6409, "grad_norm": 0.4210026264190674, "learning_rate": 0.0002, "epoch": 1.174468085106383, "step": 690}, {"loss": 1.6343, "grad_norm": 0.4701998829841614, "learning_rate": 0.0002, "epoch": 1.1914893617021276, "step": 700}, {"loss": 1.6773, "grad_norm": 0.6331578493118286, "learning_rate": 0.0002, "epoch": 1.2085106382978723, "step": 710}, {"loss": 1.6776, "grad_norm": 0.41908255219459534, "learning_rate": 0.0002, "epoch": 1.225531914893617, "step": 720}, {"loss": 1.6468, "grad_norm": 0.36158403754234314, "learning_rate": 0.0002, "epoch": 1.2425531914893617, "step": 730}, {"loss": 1.746, "grad_norm": 0.387300580739975, "learning_rate": 0.0002, "epoch": 1.2595744680851064, "step": 740}, {"loss": 1.7089, "grad_norm": 0.38899728655815125, "learning_rate": 0.0002, "epoch": 1.2765957446808511, "step": 750}, {"loss": 1.6376, "grad_norm": 0.4549255073070526, "learning_rate": 0.0002, "epoch": 1.2936170212765958, "step": 760}, {"loss": 1.6754, "grad_norm": 0.4052349328994751, "learning_rate": 0.0002, "epoch": 1.3106382978723405, "step": 770}, {"loss": 1.6483, "grad_norm": 0.38934215903282166, "learning_rate": 0.0002, "epoch": 1.327659574468085, "step": 780}, {"loss": 1.7813, "grad_norm": 0.38688382506370544, "learning_rate": 0.0002, "epoch": 1.3446808510638297, "step": 790}, {"loss": 1.6374, "grad_norm": 0.3825705051422119, "learning_rate": 0.0002, "epoch": 1.3617021276595744, "step": 800}, {"loss": 1.655, "grad_norm": 0.37331756949424744, "learning_rate": 0.0002, "epoch": 1.3787234042553191, "step": 810}, {"loss": 1.6321, "grad_norm": 0.38826408982276917, "learning_rate": 0.0002, "epoch": 1.3957446808510638, "step": 820}, {"loss": 1.6107, "grad_norm": 0.4213569164276123, "learning_rate": 0.0002, "epoch": 1.4127659574468086, "step": 830}, {"loss": 1.6792, "grad_norm": 0.3976684808731079, "learning_rate": 0.0002, "epoch": 1.4297872340425533, "step": 840}, {"loss": 1.7029, "grad_norm": 0.390009343624115, "learning_rate": 0.0002, "epoch": 1.4468085106382977, "step": 850}, {"loss": 1.7489, "grad_norm": 0.4462052583694458, "learning_rate": 0.0002, "epoch": 1.4638297872340424, "step": 860}, {"loss": 1.7513, "grad_norm": 0.42129236459732056, "learning_rate": 0.0002, "epoch": 1.4808510638297872, "step": 870}, {"loss": 1.6009, "grad_norm": 0.41489893198013306, "learning_rate": 0.0002, "epoch": 1.4978723404255319, "step": 880}, {"loss": 1.7129, "grad_norm": 0.41451677680015564, "learning_rate": 0.0002, "epoch": 1.5148936170212766, "step": 890}, {"loss": 1.6427, "grad_norm": 0.4477299749851227, "learning_rate": 0.0002, "epoch": 1.5319148936170213, "step": 900}, {"loss": 1.6543, "grad_norm": 0.38476648926734924, "learning_rate": 0.0002, "epoch": 1.548936170212766, "step": 910}, {"loss": 1.7103, "grad_norm": 0.42755743861198425, "learning_rate": 0.0002, "epoch": 1.5659574468085107, "step": 920}, {"loss": 1.7014, "grad_norm": 0.39372023940086365, "learning_rate": 0.0002, "epoch": 1.5829787234042554, "step": 930}, {"loss": 1.7042, "grad_norm": 0.42778754234313965, "learning_rate": 0.0002, "epoch": 1.6, "step": 940}, {"loss": 1.7488, "grad_norm": 0.4217268228530884, "learning_rate": 0.0002, "epoch": 1.6170212765957448, "step": 950}, {"loss": 1.6556, "grad_norm": 0.40452107787132263, "learning_rate": 0.0002, "epoch": 1.6340425531914895, "step": 960}, {"loss": 1.6734, "grad_norm": 0.4259980022907257, "learning_rate": 0.0002, "epoch": 1.6510638297872342, "step": 970}, {"loss": 1.6464, "grad_norm": 0.4089849591255188, "learning_rate": 0.0002, "epoch": 1.6680851063829787, "step": 980}, {"loss": 1.6588, "grad_norm": 0.38276049494743347, "learning_rate": 0.0002, "epoch": 1.6851063829787234, "step": 990}, {"loss": 1.6846, "grad_norm": 0.40361565351486206, "learning_rate": 0.0002, "epoch": 1.702127659574468, "step": 1000}, {"loss": 1.698, "grad_norm": 0.3537807762622833, "learning_rate": 0.0002, "epoch": 1.7191489361702128, "step": 1010}, {"loss": 1.6001, "grad_norm": 0.40288347005844116, "learning_rate": 0.0002, "epoch": 1.7361702127659573, "step": 1020}, {"loss": 1.6375, "grad_norm": 0.4003616273403168, "learning_rate": 0.0002, "epoch": 1.753191489361702, "step": 1030}, {"loss": 1.6067, "grad_norm": 0.3931669592857361, "learning_rate": 0.0002, "epoch": 1.7702127659574467, "step": 1040}, {"loss": 1.6814, "grad_norm": 0.4001635015010834, "learning_rate": 0.0002, "epoch": 1.7872340425531914, "step": 1050}, {"loss": 1.6158, "grad_norm": 0.4139048457145691, "learning_rate": 0.0002, "epoch": 1.804255319148936, "step": 1060}, {"loss": 1.7513, "grad_norm": 0.5044458508491516, "learning_rate": 0.0002, "epoch": 1.8212765957446808, "step": 1070}, {"loss": 1.7317, "grad_norm": 0.4827095568180084, "learning_rate": 0.0002, "epoch": 1.8382978723404255, "step": 1080}, {"loss": 1.6586, "grad_norm": 0.3750515282154083, "learning_rate": 0.0002, "epoch": 1.8553191489361702, "step": 1090}, {"loss": 1.653, "grad_norm": 0.4024597704410553, "learning_rate": 0.0002, "epoch": 1.872340425531915, "step": 1100}, {"loss": 1.7138, "grad_norm": 0.36747241020202637, "learning_rate": 0.0002, "epoch": 1.8893617021276596, "step": 1110}, {"loss": 1.652, "grad_norm": 0.41397711634635925, "learning_rate": 0.0002, "epoch": 1.9063829787234043, "step": 1120}, {"loss": 1.7071, "grad_norm": 0.3960763216018677, "learning_rate": 0.0002, "epoch": 1.923404255319149, "step": 1130}, {"loss": 1.6857, "grad_norm": 0.4533233344554901, "learning_rate": 0.0002, "epoch": 1.9404255319148938, "step": 1140}, {"loss": 1.7168, "grad_norm": 0.38433438539505005, "learning_rate": 0.0002, "epoch": 1.9574468085106385, "step": 1150}, {"loss": 1.7444, "grad_norm": 0.3648812174797058, "learning_rate": 0.0002, "epoch": 1.974468085106383, "step": 1160}, {"loss": 1.6521, "grad_norm": 0.3887176215648651, "learning_rate": 0.0002, "epoch": 1.9914893617021276, "step": 1170}]} +{"epoch": 2.999148936170213, "step": 1762, "epoch_duration": 2055.5786373615265, "total_accumulated_duration": 6159.997961759567, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5996, "grad_norm": 0.7596228122711182, "learning_rate": 0.0002, "epoch": 0.01702127659574468, "step": 10}, {"loss": 2.2941, "grad_norm": 0.4860903322696686, "learning_rate": 0.0002, "epoch": 0.03404255319148936, "step": 20}, {"loss": 2.0018, "grad_norm": 0.4953401982784271, "learning_rate": 0.0002, "epoch": 0.05106382978723404, "step": 30}, {"loss": 1.9318, "grad_norm": 0.5086901783943176, "learning_rate": 0.0002, "epoch": 0.06808510638297872, "step": 40}, {"loss": 1.8519, "grad_norm": 0.49050021171569824, "learning_rate": 0.0002, "epoch": 0.0851063829787234, "step": 50}, {"loss": 1.8786, "grad_norm": 0.4922358989715576, "learning_rate": 0.0002, "epoch": 0.10212765957446808, "step": 60}, {"loss": 1.8812, "grad_norm": 0.4621541202068329, "learning_rate": 0.0002, "epoch": 0.11914893617021277, "step": 70}, {"loss": 1.8004, "grad_norm": 0.4416729807853699, "learning_rate": 0.0002, "epoch": 0.13617021276595745, "step": 80}, {"loss": 1.9298, "grad_norm": 0.526258111000061, "learning_rate": 0.0002, "epoch": 0.15319148936170213, "step": 90}, {"loss": 1.8339, "grad_norm": 0.44022637605667114, "learning_rate": 0.0002, "epoch": 0.1702127659574468, "step": 100}, {"loss": 1.7681, "grad_norm": 0.4647711515426636, "learning_rate": 0.0002, "epoch": 0.18723404255319148, "step": 110}, {"loss": 1.6953, "grad_norm": 0.4136318564414978, "learning_rate": 0.0002, "epoch": 0.20425531914893616, "step": 120}, {"loss": 1.8491, "grad_norm": 0.39707672595977783, "learning_rate": 0.0002, "epoch": 0.22127659574468084, "step": 130}, {"loss": 1.8017, "grad_norm": 0.4478105306625366, "learning_rate": 0.0002, "epoch": 0.23829787234042554, "step": 140}, {"loss": 1.8067, "grad_norm": 0.4699741303920746, "learning_rate": 0.0002, "epoch": 0.2553191489361702, "step": 150}, {"loss": 1.8161, "grad_norm": 0.4568363130092621, "learning_rate": 0.0002, "epoch": 0.2723404255319149, "step": 160}, {"loss": 1.7491, "grad_norm": 0.45078757405281067, "learning_rate": 0.0002, "epoch": 0.28936170212765955, "step": 170}, {"loss": 1.8115, "grad_norm": 0.4127245843410492, "learning_rate": 0.0002, "epoch": 0.30638297872340425, "step": 180}, {"loss": 1.7707, "grad_norm": 0.4042493402957916, "learning_rate": 0.0002, "epoch": 0.32340425531914896, "step": 190}, {"loss": 1.7807, "grad_norm": 0.401487797498703, "learning_rate": 0.0002, "epoch": 0.3404255319148936, "step": 200}, {"loss": 1.8355, "grad_norm": 0.3959457576274872, "learning_rate": 0.0002, "epoch": 0.3574468085106383, "step": 210}, {"loss": 1.7798, "grad_norm": 0.39865636825561523, "learning_rate": 0.0002, "epoch": 0.37446808510638296, "step": 220}, {"loss": 1.9012, "grad_norm": 0.7225169539451599, "learning_rate": 0.0002, "epoch": 0.39148936170212767, "step": 230}, {"loss": 1.7212, "grad_norm": 0.412801593542099, "learning_rate": 0.0002, "epoch": 0.4085106382978723, "step": 240}, {"loss": 1.8523, "grad_norm": 0.40951448678970337, "learning_rate": 0.0002, "epoch": 0.425531914893617, "step": 250}, {"loss": 1.7283, "grad_norm": 0.42788130044937134, "learning_rate": 0.0002, "epoch": 0.4425531914893617, "step": 260}, {"loss": 1.8051, "grad_norm": 0.41069576144218445, "learning_rate": 0.0002, "epoch": 0.4595744680851064, "step": 270}, {"loss": 1.7861, "grad_norm": 0.3745323717594147, "learning_rate": 0.0002, "epoch": 0.4765957446808511, "step": 280}, {"loss": 1.8484, "grad_norm": 0.3771323263645172, "learning_rate": 0.0002, "epoch": 0.49361702127659574, "step": 290}, {"loss": 1.7417, "grad_norm": 0.34368929266929626, "learning_rate": 0.0002, "epoch": 0.5106382978723404, "step": 300}, {"loss": 1.7394, "grad_norm": 0.4299296736717224, "learning_rate": 0.0002, "epoch": 0.5276595744680851, "step": 310}, {"loss": 1.8255, "grad_norm": 0.4133922755718231, "learning_rate": 0.0002, "epoch": 0.5446808510638298, "step": 320}, {"loss": 1.7043, "grad_norm": 0.3984859585762024, "learning_rate": 0.0002, "epoch": 0.5617021276595745, "step": 330}, {"loss": 1.7873, "grad_norm": 0.3822788894176483, "learning_rate": 0.0002, "epoch": 0.5787234042553191, "step": 340}, {"loss": 1.8082, "grad_norm": 0.4550061821937561, "learning_rate": 0.0002, "epoch": 0.5957446808510638, "step": 350}, {"loss": 1.7179, "grad_norm": 0.36571192741394043, "learning_rate": 0.0002, "epoch": 0.6127659574468085, "step": 360}, {"loss": 1.8196, "grad_norm": 0.32942914962768555, "learning_rate": 0.0002, "epoch": 0.6297872340425532, "step": 370}, {"loss": 1.7118, "grad_norm": 0.39299526810646057, "learning_rate": 0.0002, "epoch": 0.6468085106382979, "step": 380}, {"loss": 1.8179, "grad_norm": 0.3817657232284546, "learning_rate": 0.0002, "epoch": 0.6638297872340425, "step": 390}, {"loss": 1.8174, "grad_norm": 0.3650810122489929, "learning_rate": 0.0002, "epoch": 0.6808510638297872, "step": 400}, {"loss": 1.8026, "grad_norm": 0.3736686408519745, "learning_rate": 0.0002, "epoch": 0.6978723404255319, "step": 410}, {"loss": 1.802, "grad_norm": 0.45680564641952515, "learning_rate": 0.0002, "epoch": 0.7148936170212766, "step": 420}, {"loss": 1.7844, "grad_norm": 0.4154510200023651, "learning_rate": 0.0002, "epoch": 0.7319148936170212, "step": 430}, {"loss": 1.7801, "grad_norm": 0.3701167106628418, "learning_rate": 0.0002, "epoch": 0.7489361702127659, "step": 440}, {"loss": 1.7689, "grad_norm": 0.3869531750679016, "learning_rate": 0.0002, "epoch": 0.7659574468085106, "step": 450}, {"loss": 1.7671, "grad_norm": 0.4391495883464813, "learning_rate": 0.0002, "epoch": 0.7829787234042553, "step": 460}, {"loss": 1.7534, "grad_norm": 0.39652755856513977, "learning_rate": 0.0002, "epoch": 0.8, "step": 470}, {"loss": 1.8076, "grad_norm": 0.4096752107143402, "learning_rate": 0.0002, "epoch": 0.8170212765957446, "step": 480}, {"loss": 1.796, "grad_norm": 0.3857504427433014, "learning_rate": 0.0002, "epoch": 0.8340425531914893, "step": 490}, {"loss": 1.7379, "grad_norm": 0.4105374217033386, "learning_rate": 0.0002, "epoch": 0.851063829787234, "step": 500}, {"loss": 1.6391, "grad_norm": 0.3723328113555908, "learning_rate": 0.0002, "epoch": 0.8680851063829788, "step": 510}, {"loss": 1.7122, "grad_norm": 0.36099690198898315, "learning_rate": 0.0002, "epoch": 0.8851063829787233, "step": 520}, {"loss": 1.7632, "grad_norm": 0.3715187907218933, "learning_rate": 0.0002, "epoch": 0.902127659574468, "step": 530}, {"loss": 1.7004, "grad_norm": 0.4932813048362732, "learning_rate": 0.0002, "epoch": 0.9191489361702128, "step": 540}, {"loss": 1.679, "grad_norm": 0.3493495285511017, "learning_rate": 0.0002, "epoch": 0.9361702127659575, "step": 550}, {"loss": 1.7758, "grad_norm": 0.3598061800003052, "learning_rate": 0.0002, "epoch": 0.9531914893617022, "step": 560}, {"loss": 1.7686, "grad_norm": 0.3521560728549957, "learning_rate": 0.0002, "epoch": 0.9702127659574468, "step": 570}, {"loss": 1.7391, "grad_norm": 0.34150034189224243, "learning_rate": 0.0002, "epoch": 0.9872340425531915, "step": 580}, {"eval_loss": 1.8388911485671997, "eval_runtime": 106.6788, "eval_samples_per_second": 4.828, "eval_steps_per_second": 0.609, "epoch": 0.9991489361702127, "step": 587}, {"loss": 1.6115, "grad_norm": 0.33556103706359863, "learning_rate": 0.0002, "epoch": 1.004255319148936, "step": 590}, {"loss": 1.8346, "grad_norm": 0.4333398640155792, "learning_rate": 0.0002, "epoch": 1.0212765957446808, "step": 600}, {"loss": 1.6656, "grad_norm": 0.38488736748695374, "learning_rate": 0.0002, "epoch": 1.0382978723404255, "step": 610}, {"loss": 1.7778, "grad_norm": 0.44454529881477356, "learning_rate": 0.0002, "epoch": 1.0553191489361702, "step": 620}, {"loss": 1.7143, "grad_norm": 0.3735603392124176, "learning_rate": 0.0002, "epoch": 1.0723404255319149, "step": 630}, {"loss": 1.74, "grad_norm": 0.38912704586982727, "learning_rate": 0.0002, "epoch": 1.0893617021276596, "step": 640}, {"loss": 1.607, "grad_norm": 0.4411826431751251, "learning_rate": 0.0002, "epoch": 1.1063829787234043, "step": 650}, {"loss": 1.6901, "grad_norm": 0.4163050353527069, "learning_rate": 0.0002, "epoch": 1.123404255319149, "step": 660}, {"loss": 1.6722, "grad_norm": 0.4187192916870117, "learning_rate": 0.0002, "epoch": 1.1404255319148937, "step": 670}, {"loss": 1.7061, "grad_norm": 0.3797093629837036, "learning_rate": 0.0002, "epoch": 1.1574468085106382, "step": 680}, {"loss": 1.6409, "grad_norm": 0.4210026264190674, "learning_rate": 0.0002, "epoch": 1.174468085106383, "step": 690}, {"loss": 1.6343, "grad_norm": 0.4701998829841614, "learning_rate": 0.0002, "epoch": 1.1914893617021276, "step": 700}, {"loss": 1.6773, "grad_norm": 0.6331578493118286, "learning_rate": 0.0002, "epoch": 1.2085106382978723, "step": 710}, {"loss": 1.6776, "grad_norm": 0.41908255219459534, "learning_rate": 0.0002, "epoch": 1.225531914893617, "step": 720}, {"loss": 1.6468, "grad_norm": 0.36158403754234314, "learning_rate": 0.0002, "epoch": 1.2425531914893617, "step": 730}, {"loss": 1.746, "grad_norm": 0.387300580739975, "learning_rate": 0.0002, "epoch": 1.2595744680851064, "step": 740}, {"loss": 1.7089, "grad_norm": 0.38899728655815125, "learning_rate": 0.0002, "epoch": 1.2765957446808511, "step": 750}, {"loss": 1.6376, "grad_norm": 0.4549255073070526, "learning_rate": 0.0002, "epoch": 1.2936170212765958, "step": 760}, {"loss": 1.6754, "grad_norm": 0.4052349328994751, "learning_rate": 0.0002, "epoch": 1.3106382978723405, "step": 770}, {"loss": 1.6483, "grad_norm": 0.38934215903282166, "learning_rate": 0.0002, "epoch": 1.327659574468085, "step": 780}, {"loss": 1.7813, "grad_norm": 0.38688382506370544, "learning_rate": 0.0002, "epoch": 1.3446808510638297, "step": 790}, {"loss": 1.6374, "grad_norm": 0.3825705051422119, "learning_rate": 0.0002, "epoch": 1.3617021276595744, "step": 800}, {"loss": 1.655, "grad_norm": 0.37331756949424744, "learning_rate": 0.0002, "epoch": 1.3787234042553191, "step": 810}, {"loss": 1.6321, "grad_norm": 0.38826408982276917, "learning_rate": 0.0002, "epoch": 1.3957446808510638, "step": 820}, {"loss": 1.6107, "grad_norm": 0.4213569164276123, "learning_rate": 0.0002, "epoch": 1.4127659574468086, "step": 830}, {"loss": 1.6792, "grad_norm": 0.3976684808731079, "learning_rate": 0.0002, "epoch": 1.4297872340425533, "step": 840}, {"loss": 1.7029, "grad_norm": 0.390009343624115, "learning_rate": 0.0002, "epoch": 1.4468085106382977, "step": 850}, {"loss": 1.7489, "grad_norm": 0.4462052583694458, "learning_rate": 0.0002, "epoch": 1.4638297872340424, "step": 860}, {"loss": 1.7513, "grad_norm": 0.42129236459732056, "learning_rate": 0.0002, "epoch": 1.4808510638297872, "step": 870}, {"loss": 1.6009, "grad_norm": 0.41489893198013306, "learning_rate": 0.0002, "epoch": 1.4978723404255319, "step": 880}, {"loss": 1.7129, "grad_norm": 0.41451677680015564, "learning_rate": 0.0002, "epoch": 1.5148936170212766, "step": 890}, {"loss": 1.6427, "grad_norm": 0.4477299749851227, "learning_rate": 0.0002, "epoch": 1.5319148936170213, "step": 900}, {"loss": 1.6543, "grad_norm": 0.38476648926734924, "learning_rate": 0.0002, "epoch": 1.548936170212766, "step": 910}, {"loss": 1.7103, "grad_norm": 0.42755743861198425, "learning_rate": 0.0002, "epoch": 1.5659574468085107, "step": 920}, {"loss": 1.7014, "grad_norm": 0.39372023940086365, "learning_rate": 0.0002, "epoch": 1.5829787234042554, "step": 930}, {"loss": 1.7042, "grad_norm": 0.42778754234313965, "learning_rate": 0.0002, "epoch": 1.6, "step": 940}, {"loss": 1.7488, "grad_norm": 0.4217268228530884, "learning_rate": 0.0002, "epoch": 1.6170212765957448, "step": 950}, {"loss": 1.6556, "grad_norm": 0.40452107787132263, "learning_rate": 0.0002, "epoch": 1.6340425531914895, "step": 960}, {"loss": 1.6734, "grad_norm": 0.4259980022907257, "learning_rate": 0.0002, "epoch": 1.6510638297872342, "step": 970}, {"loss": 1.6464, "grad_norm": 0.4089849591255188, "learning_rate": 0.0002, "epoch": 1.6680851063829787, "step": 980}, {"loss": 1.6588, "grad_norm": 0.38276049494743347, "learning_rate": 0.0002, "epoch": 1.6851063829787234, "step": 990}, {"loss": 1.6846, "grad_norm": 0.40361565351486206, "learning_rate": 0.0002, "epoch": 1.702127659574468, "step": 1000}, {"loss": 1.698, "grad_norm": 0.3537807762622833, "learning_rate": 0.0002, "epoch": 1.7191489361702128, "step": 1010}, {"loss": 1.6001, "grad_norm": 0.40288347005844116, "learning_rate": 0.0002, "epoch": 1.7361702127659573, "step": 1020}, {"loss": 1.6375, "grad_norm": 0.4003616273403168, "learning_rate": 0.0002, "epoch": 1.753191489361702, "step": 1030}, {"loss": 1.6067, "grad_norm": 0.3931669592857361, "learning_rate": 0.0002, "epoch": 1.7702127659574467, "step": 1040}, {"loss": 1.6814, "grad_norm": 0.4001635015010834, "learning_rate": 0.0002, "epoch": 1.7872340425531914, "step": 1050}, {"loss": 1.6158, "grad_norm": 0.4139048457145691, "learning_rate": 0.0002, "epoch": 1.804255319148936, "step": 1060}, {"loss": 1.7513, "grad_norm": 0.5044458508491516, "learning_rate": 0.0002, "epoch": 1.8212765957446808, "step": 1070}, {"loss": 1.7317, "grad_norm": 0.4827095568180084, "learning_rate": 0.0002, "epoch": 1.8382978723404255, "step": 1080}, {"loss": 1.6586, "grad_norm": 0.3750515282154083, "learning_rate": 0.0002, "epoch": 1.8553191489361702, "step": 1090}, {"loss": 1.653, "grad_norm": 0.4024597704410553, "learning_rate": 0.0002, "epoch": 1.872340425531915, "step": 1100}, {"loss": 1.7138, "grad_norm": 0.36747241020202637, "learning_rate": 0.0002, "epoch": 1.8893617021276596, "step": 1110}, {"loss": 1.652, "grad_norm": 0.41397711634635925, "learning_rate": 0.0002, "epoch": 1.9063829787234043, "step": 1120}, {"loss": 1.7071, "grad_norm": 0.3960763216018677, "learning_rate": 0.0002, "epoch": 1.923404255319149, "step": 1130}, {"loss": 1.6857, "grad_norm": 0.4533233344554901, "learning_rate": 0.0002, "epoch": 1.9404255319148938, "step": 1140}, {"loss": 1.7168, "grad_norm": 0.38433438539505005, "learning_rate": 0.0002, "epoch": 1.9574468085106385, "step": 1150}, {"loss": 1.7444, "grad_norm": 0.3648812174797058, "learning_rate": 0.0002, "epoch": 1.974468085106383, "step": 1160}, {"loss": 1.6521, "grad_norm": 0.3887176215648651, "learning_rate": 0.0002, "epoch": 1.9914893617021276, "step": 1170}, {"eval_loss": 1.8328146934509277, "eval_runtime": 107.2842, "eval_samples_per_second": 4.8, "eval_steps_per_second": 0.606, "epoch": 2.0, "step": 1175}, {"loss": 1.6184, "grad_norm": 0.40444880723953247, "learning_rate": 0.0002, "epoch": 2.008510638297872, "step": 1180}, {"loss": 1.5221, "grad_norm": 0.3997816741466522, "learning_rate": 0.0002, "epoch": 2.025531914893617, "step": 1190}, {"loss": 1.6022, "grad_norm": 0.4516718089580536, "learning_rate": 0.0002, "epoch": 2.0425531914893615, "step": 1200}, {"loss": 1.5458, "grad_norm": 0.6645553708076477, "learning_rate": 0.0002, "epoch": 2.0595744680851062, "step": 1210}, {"loss": 1.5985, "grad_norm": 0.4181990921497345, "learning_rate": 0.0002, "epoch": 2.076595744680851, "step": 1220}, {"loss": 1.5768, "grad_norm": 0.45681431889533997, "learning_rate": 0.0002, "epoch": 2.0936170212765957, "step": 1230}, {"loss": 1.5598, "grad_norm": 0.48914700746536255, "learning_rate": 0.0002, "epoch": 2.1106382978723404, "step": 1240}, {"loss": 1.6031, "grad_norm": 0.43265485763549805, "learning_rate": 0.0002, "epoch": 2.127659574468085, "step": 1250}, {"loss": 1.5812, "grad_norm": 0.4641207754611969, "learning_rate": 0.0002, "epoch": 2.1446808510638298, "step": 1260}, {"loss": 1.5686, "grad_norm": 0.4840783476829529, "learning_rate": 0.0002, "epoch": 2.1617021276595745, "step": 1270}, {"loss": 1.5969, "grad_norm": 0.4974595308303833, "learning_rate": 0.0002, "epoch": 2.178723404255319, "step": 1280}, {"loss": 1.5512, "grad_norm": 0.5133475661277771, "learning_rate": 0.0002, "epoch": 2.195744680851064, "step": 1290}, {"loss": 1.6467, "grad_norm": 0.5030052065849304, "learning_rate": 0.0002, "epoch": 2.2127659574468086, "step": 1300}, {"loss": 1.6845, "grad_norm": 0.46602481603622437, "learning_rate": 0.0002, "epoch": 2.2297872340425533, "step": 1310}, {"loss": 1.6132, "grad_norm": 0.43662378191947937, "learning_rate": 0.0002, "epoch": 2.246808510638298, "step": 1320}, {"loss": 1.6223, "grad_norm": 0.5137454867362976, "learning_rate": 0.0002, "epoch": 2.2638297872340427, "step": 1330}, {"loss": 1.5702, "grad_norm": 0.4750335216522217, "learning_rate": 0.0002, "epoch": 2.2808510638297874, "step": 1340}, {"loss": 1.6479, "grad_norm": 0.43691426515579224, "learning_rate": 0.0002, "epoch": 2.297872340425532, "step": 1350}, {"loss": 1.5561, "grad_norm": 0.49752047657966614, "learning_rate": 0.0002, "epoch": 2.3148936170212764, "step": 1360}, {"loss": 1.5789, "grad_norm": 0.45101815462112427, "learning_rate": 0.0002, "epoch": 2.331914893617021, "step": 1370}, {"loss": 1.5858, "grad_norm": 0.4427817761898041, "learning_rate": 0.0002, "epoch": 2.348936170212766, "step": 1380}, {"loss": 1.5537, "grad_norm": 0.4802311062812805, "learning_rate": 0.0002, "epoch": 2.3659574468085105, "step": 1390}, {"loss": 1.5846, "grad_norm": 0.4512513279914856, "learning_rate": 0.0002, "epoch": 2.382978723404255, "step": 1400}, {"loss": 1.627, "grad_norm": 0.4878857135772705, "learning_rate": 0.0002, "epoch": 2.4, "step": 1410}, {"loss": 1.5781, "grad_norm": 0.4741315543651581, "learning_rate": 0.0002, "epoch": 2.4170212765957446, "step": 1420}, {"loss": 1.5595, "grad_norm": 0.4770931601524353, "learning_rate": 0.0002, "epoch": 2.4340425531914893, "step": 1430}, {"loss": 1.5336, "grad_norm": 0.5124667286872864, "learning_rate": 0.0002, "epoch": 2.451063829787234, "step": 1440}, {"loss": 1.5811, "grad_norm": 0.45264801383018494, "learning_rate": 0.0002, "epoch": 2.4680851063829787, "step": 1450}, {"loss": 1.604, "grad_norm": 0.5456924438476562, "learning_rate": 0.0002, "epoch": 2.4851063829787234, "step": 1460}, {"loss": 1.6063, "grad_norm": 0.44656285643577576, "learning_rate": 0.0002, "epoch": 2.502127659574468, "step": 1470}, {"loss": 1.6336, "grad_norm": 0.5939419865608215, "learning_rate": 0.0002, "epoch": 2.519148936170213, "step": 1480}, {"loss": 1.5481, "grad_norm": 0.47853362560272217, "learning_rate": 0.0002, "epoch": 2.5361702127659576, "step": 1490}, {"loss": 1.6543, "grad_norm": 0.47643396258354187, "learning_rate": 0.0002, "epoch": 2.5531914893617023, "step": 1500}, {"loss": 1.5085, "grad_norm": 0.4939501881599426, "learning_rate": 0.0002, "epoch": 2.570212765957447, "step": 1510}, {"loss": 1.5659, "grad_norm": 0.502055287361145, "learning_rate": 0.0002, "epoch": 2.5872340425531917, "step": 1520}, {"loss": 1.6519, "grad_norm": 0.463250994682312, "learning_rate": 0.0002, "epoch": 2.604255319148936, "step": 1530}, {"loss": 1.5698, "grad_norm": 0.4761098623275757, "learning_rate": 0.0002, "epoch": 2.621276595744681, "step": 1540}, {"loss": 1.6044, "grad_norm": 0.4687299132347107, "learning_rate": 0.0002, "epoch": 2.6382978723404253, "step": 1550}, {"loss": 1.632, "grad_norm": 0.5536078810691833, "learning_rate": 0.0002, "epoch": 2.65531914893617, "step": 1560}, {"loss": 1.586, "grad_norm": 0.581320583820343, "learning_rate": 0.0002, "epoch": 2.6723404255319148, "step": 1570}, {"loss": 1.508, "grad_norm": 0.45952868461608887, "learning_rate": 0.0002, "epoch": 2.6893617021276595, "step": 1580}, {"loss": 1.5905, "grad_norm": 0.4602586328983307, "learning_rate": 0.0002, "epoch": 2.706382978723404, "step": 1590}, {"loss": 1.6008, "grad_norm": 0.5276554226875305, "learning_rate": 0.0002, "epoch": 2.723404255319149, "step": 1600}, {"loss": 1.7216, "grad_norm": 0.5750249624252319, "learning_rate": 0.0002, "epoch": 2.7404255319148936, "step": 1610}, {"loss": 1.5506, "grad_norm": 0.468723863363266, "learning_rate": 0.0002, "epoch": 2.7574468085106383, "step": 1620}, {"loss": 1.4499, "grad_norm": 0.44649943709373474, "learning_rate": 0.0002, "epoch": 2.774468085106383, "step": 1630}, {"loss": 1.5106, "grad_norm": 0.5097237825393677, "learning_rate": 0.0002, "epoch": 2.7914893617021277, "step": 1640}, {"loss": 1.5948, "grad_norm": 0.46384191513061523, "learning_rate": 0.0002, "epoch": 2.8085106382978724, "step": 1650}, {"loss": 1.5828, "grad_norm": 0.4885474443435669, "learning_rate": 0.0002, "epoch": 2.825531914893617, "step": 1660}, {"loss": 1.5675, "grad_norm": 0.45621681213378906, "learning_rate": 0.0002, "epoch": 2.842553191489362, "step": 1670}, {"loss": 1.6042, "grad_norm": 0.4797150194644928, "learning_rate": 0.0002, "epoch": 2.8595744680851065, "step": 1680}, {"loss": 1.5601, "grad_norm": 0.5142032504081726, "learning_rate": 0.0002, "epoch": 2.876595744680851, "step": 1690}, {"loss": 1.5984, "grad_norm": 0.48939862847328186, "learning_rate": 0.0002, "epoch": 2.8936170212765955, "step": 1700}, {"loss": 1.6333, "grad_norm": 0.4575578272342682, "learning_rate": 0.0002, "epoch": 2.9106382978723406, "step": 1710}, {"loss": 1.5396, "grad_norm": 0.5589063763618469, "learning_rate": 0.0002, "epoch": 2.927659574468085, "step": 1720}, {"loss": 1.6096, "grad_norm": 0.48508813977241516, "learning_rate": 0.0002, "epoch": 2.94468085106383, "step": 1730}, {"loss": 1.5686, "grad_norm": 0.42786726355552673, "learning_rate": 0.0002, "epoch": 2.9617021276595743, "step": 1740}, {"loss": 1.5555, "grad_norm": 0.5598229765892029, "learning_rate": 0.0002, "epoch": 2.978723404255319, "step": 1750}, {"loss": 1.5035, "grad_norm": 0.4779253602027893, "learning_rate": 0.0002, "epoch": 2.9957446808510637, "step": 1760}]} +{"epoch": 4.0, "step": 2350, "epoch_duration": 2044.753186225891, "total_accumulated_duration": 8204.751147985458, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5996, "grad_norm": 0.7596228122711182, "learning_rate": 0.0002, "epoch": 0.01702127659574468, "step": 10}, {"loss": 2.2941, "grad_norm": 0.4860903322696686, "learning_rate": 0.0002, "epoch": 0.03404255319148936, "step": 20}, {"loss": 2.0018, "grad_norm": 0.4953401982784271, "learning_rate": 0.0002, "epoch": 0.05106382978723404, "step": 30}, {"loss": 1.9318, "grad_norm": 0.5086901783943176, "learning_rate": 0.0002, "epoch": 0.06808510638297872, "step": 40}, {"loss": 1.8519, "grad_norm": 0.49050021171569824, "learning_rate": 0.0002, "epoch": 0.0851063829787234, "step": 50}, {"loss": 1.8786, "grad_norm": 0.4922358989715576, "learning_rate": 0.0002, "epoch": 0.10212765957446808, "step": 60}, {"loss": 1.8812, "grad_norm": 0.4621541202068329, "learning_rate": 0.0002, "epoch": 0.11914893617021277, "step": 70}, {"loss": 1.8004, "grad_norm": 0.4416729807853699, "learning_rate": 0.0002, "epoch": 0.13617021276595745, "step": 80}, {"loss": 1.9298, "grad_norm": 0.526258111000061, "learning_rate": 0.0002, "epoch": 0.15319148936170213, "step": 90}, {"loss": 1.8339, "grad_norm": 0.44022637605667114, "learning_rate": 0.0002, "epoch": 0.1702127659574468, "step": 100}, {"loss": 1.7681, "grad_norm": 0.4647711515426636, "learning_rate": 0.0002, "epoch": 0.18723404255319148, "step": 110}, {"loss": 1.6953, "grad_norm": 0.4136318564414978, "learning_rate": 0.0002, "epoch": 0.20425531914893616, "step": 120}, {"loss": 1.8491, "grad_norm": 0.39707672595977783, "learning_rate": 0.0002, "epoch": 0.22127659574468084, "step": 130}, {"loss": 1.8017, "grad_norm": 0.4478105306625366, "learning_rate": 0.0002, "epoch": 0.23829787234042554, "step": 140}, {"loss": 1.8067, "grad_norm": 0.4699741303920746, "learning_rate": 0.0002, "epoch": 0.2553191489361702, "step": 150}, {"loss": 1.8161, "grad_norm": 0.4568363130092621, "learning_rate": 0.0002, "epoch": 0.2723404255319149, "step": 160}, {"loss": 1.7491, "grad_norm": 0.45078757405281067, "learning_rate": 0.0002, "epoch": 0.28936170212765955, "step": 170}, {"loss": 1.8115, "grad_norm": 0.4127245843410492, "learning_rate": 0.0002, "epoch": 0.30638297872340425, "step": 180}, {"loss": 1.7707, "grad_norm": 0.4042493402957916, "learning_rate": 0.0002, "epoch": 0.32340425531914896, "step": 190}, {"loss": 1.7807, "grad_norm": 0.401487797498703, "learning_rate": 0.0002, "epoch": 0.3404255319148936, "step": 200}, {"loss": 1.8355, "grad_norm": 0.3959457576274872, "learning_rate": 0.0002, "epoch": 0.3574468085106383, "step": 210}, {"loss": 1.7798, "grad_norm": 0.39865636825561523, "learning_rate": 0.0002, "epoch": 0.37446808510638296, "step": 220}, {"loss": 1.9012, "grad_norm": 0.7225169539451599, "learning_rate": 0.0002, "epoch": 0.39148936170212767, "step": 230}, {"loss": 1.7212, "grad_norm": 0.412801593542099, "learning_rate": 0.0002, "epoch": 0.4085106382978723, "step": 240}, {"loss": 1.8523, "grad_norm": 0.40951448678970337, "learning_rate": 0.0002, "epoch": 0.425531914893617, "step": 250}, {"loss": 1.7283, "grad_norm": 0.42788130044937134, "learning_rate": 0.0002, "epoch": 0.4425531914893617, "step": 260}, {"loss": 1.8051, "grad_norm": 0.41069576144218445, "learning_rate": 0.0002, "epoch": 0.4595744680851064, "step": 270}, {"loss": 1.7861, "grad_norm": 0.3745323717594147, "learning_rate": 0.0002, "epoch": 0.4765957446808511, "step": 280}, {"loss": 1.8484, "grad_norm": 0.3771323263645172, "learning_rate": 0.0002, "epoch": 0.49361702127659574, "step": 290}, {"loss": 1.7417, "grad_norm": 0.34368929266929626, "learning_rate": 0.0002, "epoch": 0.5106382978723404, "step": 300}, {"loss": 1.7394, "grad_norm": 0.4299296736717224, "learning_rate": 0.0002, "epoch": 0.5276595744680851, "step": 310}, {"loss": 1.8255, "grad_norm": 0.4133922755718231, "learning_rate": 0.0002, "epoch": 0.5446808510638298, "step": 320}, {"loss": 1.7043, "grad_norm": 0.3984859585762024, "learning_rate": 0.0002, "epoch": 0.5617021276595745, "step": 330}, {"loss": 1.7873, "grad_norm": 0.3822788894176483, "learning_rate": 0.0002, "epoch": 0.5787234042553191, "step": 340}, {"loss": 1.8082, "grad_norm": 0.4550061821937561, "learning_rate": 0.0002, "epoch": 0.5957446808510638, "step": 350}, {"loss": 1.7179, "grad_norm": 0.36571192741394043, "learning_rate": 0.0002, "epoch": 0.6127659574468085, "step": 360}, {"loss": 1.8196, "grad_norm": 0.32942914962768555, "learning_rate": 0.0002, "epoch": 0.6297872340425532, "step": 370}, {"loss": 1.7118, "grad_norm": 0.39299526810646057, "learning_rate": 0.0002, "epoch": 0.6468085106382979, "step": 380}, {"loss": 1.8179, "grad_norm": 0.3817657232284546, "learning_rate": 0.0002, "epoch": 0.6638297872340425, "step": 390}, {"loss": 1.8174, "grad_norm": 0.3650810122489929, "learning_rate": 0.0002, "epoch": 0.6808510638297872, "step": 400}, {"loss": 1.8026, "grad_norm": 0.3736686408519745, "learning_rate": 0.0002, "epoch": 0.6978723404255319, "step": 410}, {"loss": 1.802, "grad_norm": 0.45680564641952515, "learning_rate": 0.0002, "epoch": 0.7148936170212766, "step": 420}, {"loss": 1.7844, "grad_norm": 0.4154510200023651, "learning_rate": 0.0002, "epoch": 0.7319148936170212, "step": 430}, {"loss": 1.7801, "grad_norm": 0.3701167106628418, "learning_rate": 0.0002, "epoch": 0.7489361702127659, "step": 440}, {"loss": 1.7689, "grad_norm": 0.3869531750679016, "learning_rate": 0.0002, "epoch": 0.7659574468085106, "step": 450}, {"loss": 1.7671, "grad_norm": 0.4391495883464813, "learning_rate": 0.0002, "epoch": 0.7829787234042553, "step": 460}, {"loss": 1.7534, "grad_norm": 0.39652755856513977, "learning_rate": 0.0002, "epoch": 0.8, "step": 470}, {"loss": 1.8076, "grad_norm": 0.4096752107143402, "learning_rate": 0.0002, "epoch": 0.8170212765957446, "step": 480}, {"loss": 1.796, "grad_norm": 0.3857504427433014, "learning_rate": 0.0002, "epoch": 0.8340425531914893, "step": 490}, {"loss": 1.7379, "grad_norm": 0.4105374217033386, "learning_rate": 0.0002, "epoch": 0.851063829787234, "step": 500}, {"loss": 1.6391, "grad_norm": 0.3723328113555908, "learning_rate": 0.0002, "epoch": 0.8680851063829788, "step": 510}, {"loss": 1.7122, "grad_norm": 0.36099690198898315, "learning_rate": 0.0002, "epoch": 0.8851063829787233, "step": 520}, {"loss": 1.7632, "grad_norm": 0.3715187907218933, "learning_rate": 0.0002, "epoch": 0.902127659574468, "step": 530}, {"loss": 1.7004, "grad_norm": 0.4932813048362732, "learning_rate": 0.0002, "epoch": 0.9191489361702128, "step": 540}, {"loss": 1.679, "grad_norm": 0.3493495285511017, "learning_rate": 0.0002, "epoch": 0.9361702127659575, "step": 550}, {"loss": 1.7758, "grad_norm": 0.3598061800003052, "learning_rate": 0.0002, "epoch": 0.9531914893617022, "step": 560}, {"loss": 1.7686, "grad_norm": 0.3521560728549957, "learning_rate": 0.0002, "epoch": 0.9702127659574468, "step": 570}, {"loss": 1.7391, "grad_norm": 0.34150034189224243, "learning_rate": 0.0002, "epoch": 0.9872340425531915, "step": 580}, {"eval_loss": 1.8388911485671997, "eval_runtime": 106.6788, "eval_samples_per_second": 4.828, "eval_steps_per_second": 0.609, "epoch": 0.9991489361702127, "step": 587}, {"loss": 1.6115, "grad_norm": 0.33556103706359863, "learning_rate": 0.0002, "epoch": 1.004255319148936, "step": 590}, {"loss": 1.8346, "grad_norm": 0.4333398640155792, "learning_rate": 0.0002, "epoch": 1.0212765957446808, "step": 600}, {"loss": 1.6656, "grad_norm": 0.38488736748695374, "learning_rate": 0.0002, "epoch": 1.0382978723404255, "step": 610}, {"loss": 1.7778, "grad_norm": 0.44454529881477356, "learning_rate": 0.0002, "epoch": 1.0553191489361702, "step": 620}, {"loss": 1.7143, "grad_norm": 0.3735603392124176, "learning_rate": 0.0002, "epoch": 1.0723404255319149, "step": 630}, {"loss": 1.74, "grad_norm": 0.38912704586982727, "learning_rate": 0.0002, "epoch": 1.0893617021276596, "step": 640}, {"loss": 1.607, "grad_norm": 0.4411826431751251, "learning_rate": 0.0002, "epoch": 1.1063829787234043, "step": 650}, {"loss": 1.6901, "grad_norm": 0.4163050353527069, "learning_rate": 0.0002, "epoch": 1.123404255319149, "step": 660}, {"loss": 1.6722, "grad_norm": 0.4187192916870117, "learning_rate": 0.0002, "epoch": 1.1404255319148937, "step": 670}, {"loss": 1.7061, "grad_norm": 0.3797093629837036, "learning_rate": 0.0002, "epoch": 1.1574468085106382, "step": 680}, {"loss": 1.6409, "grad_norm": 0.4210026264190674, "learning_rate": 0.0002, "epoch": 1.174468085106383, "step": 690}, {"loss": 1.6343, "grad_norm": 0.4701998829841614, "learning_rate": 0.0002, "epoch": 1.1914893617021276, "step": 700}, {"loss": 1.6773, "grad_norm": 0.6331578493118286, "learning_rate": 0.0002, "epoch": 1.2085106382978723, "step": 710}, {"loss": 1.6776, "grad_norm": 0.41908255219459534, "learning_rate": 0.0002, "epoch": 1.225531914893617, "step": 720}, {"loss": 1.6468, "grad_norm": 0.36158403754234314, "learning_rate": 0.0002, "epoch": 1.2425531914893617, "step": 730}, {"loss": 1.746, "grad_norm": 0.387300580739975, "learning_rate": 0.0002, "epoch": 1.2595744680851064, "step": 740}, {"loss": 1.7089, "grad_norm": 0.38899728655815125, "learning_rate": 0.0002, "epoch": 1.2765957446808511, "step": 750}, {"loss": 1.6376, "grad_norm": 0.4549255073070526, "learning_rate": 0.0002, "epoch": 1.2936170212765958, "step": 760}, {"loss": 1.6754, "grad_norm": 0.4052349328994751, "learning_rate": 0.0002, "epoch": 1.3106382978723405, "step": 770}, {"loss": 1.6483, "grad_norm": 0.38934215903282166, "learning_rate": 0.0002, "epoch": 1.327659574468085, "step": 780}, {"loss": 1.7813, "grad_norm": 0.38688382506370544, "learning_rate": 0.0002, "epoch": 1.3446808510638297, "step": 790}, {"loss": 1.6374, "grad_norm": 0.3825705051422119, "learning_rate": 0.0002, "epoch": 1.3617021276595744, "step": 800}, {"loss": 1.655, "grad_norm": 0.37331756949424744, "learning_rate": 0.0002, "epoch": 1.3787234042553191, "step": 810}, {"loss": 1.6321, "grad_norm": 0.38826408982276917, "learning_rate": 0.0002, "epoch": 1.3957446808510638, "step": 820}, {"loss": 1.6107, "grad_norm": 0.4213569164276123, "learning_rate": 0.0002, "epoch": 1.4127659574468086, "step": 830}, {"loss": 1.6792, "grad_norm": 0.3976684808731079, "learning_rate": 0.0002, "epoch": 1.4297872340425533, "step": 840}, {"loss": 1.7029, "grad_norm": 0.390009343624115, "learning_rate": 0.0002, "epoch": 1.4468085106382977, "step": 850}, {"loss": 1.7489, "grad_norm": 0.4462052583694458, "learning_rate": 0.0002, "epoch": 1.4638297872340424, "step": 860}, {"loss": 1.7513, "grad_norm": 0.42129236459732056, "learning_rate": 0.0002, "epoch": 1.4808510638297872, "step": 870}, {"loss": 1.6009, "grad_norm": 0.41489893198013306, "learning_rate": 0.0002, "epoch": 1.4978723404255319, "step": 880}, {"loss": 1.7129, "grad_norm": 0.41451677680015564, "learning_rate": 0.0002, "epoch": 1.5148936170212766, "step": 890}, {"loss": 1.6427, "grad_norm": 0.4477299749851227, "learning_rate": 0.0002, "epoch": 1.5319148936170213, "step": 900}, {"loss": 1.6543, "grad_norm": 0.38476648926734924, "learning_rate": 0.0002, "epoch": 1.548936170212766, "step": 910}, {"loss": 1.7103, "grad_norm": 0.42755743861198425, "learning_rate": 0.0002, "epoch": 1.5659574468085107, "step": 920}, {"loss": 1.7014, "grad_norm": 0.39372023940086365, "learning_rate": 0.0002, "epoch": 1.5829787234042554, "step": 930}, {"loss": 1.7042, "grad_norm": 0.42778754234313965, "learning_rate": 0.0002, "epoch": 1.6, "step": 940}, {"loss": 1.7488, "grad_norm": 0.4217268228530884, "learning_rate": 0.0002, "epoch": 1.6170212765957448, "step": 950}, {"loss": 1.6556, "grad_norm": 0.40452107787132263, "learning_rate": 0.0002, "epoch": 1.6340425531914895, "step": 960}, {"loss": 1.6734, "grad_norm": 0.4259980022907257, "learning_rate": 0.0002, "epoch": 1.6510638297872342, "step": 970}, {"loss": 1.6464, "grad_norm": 0.4089849591255188, "learning_rate": 0.0002, "epoch": 1.6680851063829787, "step": 980}, {"loss": 1.6588, "grad_norm": 0.38276049494743347, "learning_rate": 0.0002, "epoch": 1.6851063829787234, "step": 990}, {"loss": 1.6846, "grad_norm": 0.40361565351486206, "learning_rate": 0.0002, "epoch": 1.702127659574468, "step": 1000}, {"loss": 1.698, "grad_norm": 0.3537807762622833, "learning_rate": 0.0002, "epoch": 1.7191489361702128, "step": 1010}, {"loss": 1.6001, "grad_norm": 0.40288347005844116, "learning_rate": 0.0002, "epoch": 1.7361702127659573, "step": 1020}, {"loss": 1.6375, "grad_norm": 0.4003616273403168, "learning_rate": 0.0002, "epoch": 1.753191489361702, "step": 1030}, {"loss": 1.6067, "grad_norm": 0.3931669592857361, "learning_rate": 0.0002, "epoch": 1.7702127659574467, "step": 1040}, {"loss": 1.6814, "grad_norm": 0.4001635015010834, "learning_rate": 0.0002, "epoch": 1.7872340425531914, "step": 1050}, {"loss": 1.6158, "grad_norm": 0.4139048457145691, "learning_rate": 0.0002, "epoch": 1.804255319148936, "step": 1060}, {"loss": 1.7513, "grad_norm": 0.5044458508491516, "learning_rate": 0.0002, "epoch": 1.8212765957446808, "step": 1070}, {"loss": 1.7317, "grad_norm": 0.4827095568180084, "learning_rate": 0.0002, "epoch": 1.8382978723404255, "step": 1080}, {"loss": 1.6586, "grad_norm": 0.3750515282154083, "learning_rate": 0.0002, "epoch": 1.8553191489361702, "step": 1090}, {"loss": 1.653, "grad_norm": 0.4024597704410553, "learning_rate": 0.0002, "epoch": 1.872340425531915, "step": 1100}, {"loss": 1.7138, "grad_norm": 0.36747241020202637, "learning_rate": 0.0002, "epoch": 1.8893617021276596, "step": 1110}, {"loss": 1.652, "grad_norm": 0.41397711634635925, "learning_rate": 0.0002, "epoch": 1.9063829787234043, "step": 1120}, {"loss": 1.7071, "grad_norm": 0.3960763216018677, "learning_rate": 0.0002, "epoch": 1.923404255319149, "step": 1130}, {"loss": 1.6857, "grad_norm": 0.4533233344554901, "learning_rate": 0.0002, "epoch": 1.9404255319148938, "step": 1140}, {"loss": 1.7168, "grad_norm": 0.38433438539505005, "learning_rate": 0.0002, "epoch": 1.9574468085106385, "step": 1150}, {"loss": 1.7444, "grad_norm": 0.3648812174797058, "learning_rate": 0.0002, "epoch": 1.974468085106383, "step": 1160}, {"loss": 1.6521, "grad_norm": 0.3887176215648651, "learning_rate": 0.0002, "epoch": 1.9914893617021276, "step": 1170}, {"eval_loss": 1.8328146934509277, "eval_runtime": 107.2842, "eval_samples_per_second": 4.8, "eval_steps_per_second": 0.606, "epoch": 2.0, "step": 1175}, {"loss": 1.6184, "grad_norm": 0.40444880723953247, "learning_rate": 0.0002, "epoch": 2.008510638297872, "step": 1180}, {"loss": 1.5221, "grad_norm": 0.3997816741466522, "learning_rate": 0.0002, "epoch": 2.025531914893617, "step": 1190}, {"loss": 1.6022, "grad_norm": 0.4516718089580536, "learning_rate": 0.0002, "epoch": 2.0425531914893615, "step": 1200}, {"loss": 1.5458, "grad_norm": 0.6645553708076477, "learning_rate": 0.0002, "epoch": 2.0595744680851062, "step": 1210}, {"loss": 1.5985, "grad_norm": 0.4181990921497345, "learning_rate": 0.0002, "epoch": 2.076595744680851, "step": 1220}, {"loss": 1.5768, "grad_norm": 0.45681431889533997, "learning_rate": 0.0002, "epoch": 2.0936170212765957, "step": 1230}, {"loss": 1.5598, "grad_norm": 0.48914700746536255, "learning_rate": 0.0002, "epoch": 2.1106382978723404, "step": 1240}, {"loss": 1.6031, "grad_norm": 0.43265485763549805, "learning_rate": 0.0002, "epoch": 2.127659574468085, "step": 1250}, {"loss": 1.5812, "grad_norm": 0.4641207754611969, "learning_rate": 0.0002, "epoch": 2.1446808510638298, "step": 1260}, {"loss": 1.5686, "grad_norm": 0.4840783476829529, "learning_rate": 0.0002, "epoch": 2.1617021276595745, "step": 1270}, {"loss": 1.5969, "grad_norm": 0.4974595308303833, "learning_rate": 0.0002, "epoch": 2.178723404255319, "step": 1280}, {"loss": 1.5512, "grad_norm": 0.5133475661277771, "learning_rate": 0.0002, "epoch": 2.195744680851064, "step": 1290}, {"loss": 1.6467, "grad_norm": 0.5030052065849304, "learning_rate": 0.0002, "epoch": 2.2127659574468086, "step": 1300}, {"loss": 1.6845, "grad_norm": 0.46602481603622437, "learning_rate": 0.0002, "epoch": 2.2297872340425533, "step": 1310}, {"loss": 1.6132, "grad_norm": 0.43662378191947937, "learning_rate": 0.0002, "epoch": 2.246808510638298, "step": 1320}, {"loss": 1.6223, "grad_norm": 0.5137454867362976, "learning_rate": 0.0002, "epoch": 2.2638297872340427, "step": 1330}, {"loss": 1.5702, "grad_norm": 0.4750335216522217, "learning_rate": 0.0002, "epoch": 2.2808510638297874, "step": 1340}, {"loss": 1.6479, "grad_norm": 0.43691426515579224, "learning_rate": 0.0002, "epoch": 2.297872340425532, "step": 1350}, {"loss": 1.5561, "grad_norm": 0.49752047657966614, "learning_rate": 0.0002, "epoch": 2.3148936170212764, "step": 1360}, {"loss": 1.5789, "grad_norm": 0.45101815462112427, "learning_rate": 0.0002, "epoch": 2.331914893617021, "step": 1370}, {"loss": 1.5858, "grad_norm": 0.4427817761898041, "learning_rate": 0.0002, "epoch": 2.348936170212766, "step": 1380}, {"loss": 1.5537, "grad_norm": 0.4802311062812805, "learning_rate": 0.0002, "epoch": 2.3659574468085105, "step": 1390}, {"loss": 1.5846, "grad_norm": 0.4512513279914856, "learning_rate": 0.0002, "epoch": 2.382978723404255, "step": 1400}, {"loss": 1.627, "grad_norm": 0.4878857135772705, "learning_rate": 0.0002, "epoch": 2.4, "step": 1410}, {"loss": 1.5781, "grad_norm": 0.4741315543651581, "learning_rate": 0.0002, "epoch": 2.4170212765957446, "step": 1420}, {"loss": 1.5595, "grad_norm": 0.4770931601524353, "learning_rate": 0.0002, "epoch": 2.4340425531914893, "step": 1430}, {"loss": 1.5336, "grad_norm": 0.5124667286872864, "learning_rate": 0.0002, "epoch": 2.451063829787234, "step": 1440}, {"loss": 1.5811, "grad_norm": 0.45264801383018494, "learning_rate": 0.0002, "epoch": 2.4680851063829787, "step": 1450}, {"loss": 1.604, "grad_norm": 0.5456924438476562, "learning_rate": 0.0002, "epoch": 2.4851063829787234, "step": 1460}, {"loss": 1.6063, "grad_norm": 0.44656285643577576, "learning_rate": 0.0002, "epoch": 2.502127659574468, "step": 1470}, {"loss": 1.6336, "grad_norm": 0.5939419865608215, "learning_rate": 0.0002, "epoch": 2.519148936170213, "step": 1480}, {"loss": 1.5481, "grad_norm": 0.47853362560272217, "learning_rate": 0.0002, "epoch": 2.5361702127659576, "step": 1490}, {"loss": 1.6543, "grad_norm": 0.47643396258354187, "learning_rate": 0.0002, "epoch": 2.5531914893617023, "step": 1500}, {"loss": 1.5085, "grad_norm": 0.4939501881599426, "learning_rate": 0.0002, "epoch": 2.570212765957447, "step": 1510}, {"loss": 1.5659, "grad_norm": 0.502055287361145, "learning_rate": 0.0002, "epoch": 2.5872340425531917, "step": 1520}, {"loss": 1.6519, "grad_norm": 0.463250994682312, "learning_rate": 0.0002, "epoch": 2.604255319148936, "step": 1530}, {"loss": 1.5698, "grad_norm": 0.4761098623275757, "learning_rate": 0.0002, "epoch": 2.621276595744681, "step": 1540}, {"loss": 1.6044, "grad_norm": 0.4687299132347107, "learning_rate": 0.0002, "epoch": 2.6382978723404253, "step": 1550}, {"loss": 1.632, "grad_norm": 0.5536078810691833, "learning_rate": 0.0002, "epoch": 2.65531914893617, "step": 1560}, {"loss": 1.586, "grad_norm": 0.581320583820343, "learning_rate": 0.0002, "epoch": 2.6723404255319148, "step": 1570}, {"loss": 1.508, "grad_norm": 0.45952868461608887, "learning_rate": 0.0002, "epoch": 2.6893617021276595, "step": 1580}, {"loss": 1.5905, "grad_norm": 0.4602586328983307, "learning_rate": 0.0002, "epoch": 2.706382978723404, "step": 1590}, {"loss": 1.6008, "grad_norm": 0.5276554226875305, "learning_rate": 0.0002, "epoch": 2.723404255319149, "step": 1600}, {"loss": 1.7216, "grad_norm": 0.5750249624252319, "learning_rate": 0.0002, "epoch": 2.7404255319148936, "step": 1610}, {"loss": 1.5506, "grad_norm": 0.468723863363266, "learning_rate": 0.0002, "epoch": 2.7574468085106383, "step": 1620}, {"loss": 1.4499, "grad_norm": 0.44649943709373474, "learning_rate": 0.0002, "epoch": 2.774468085106383, "step": 1630}, {"loss": 1.5106, "grad_norm": 0.5097237825393677, "learning_rate": 0.0002, "epoch": 2.7914893617021277, "step": 1640}, {"loss": 1.5948, "grad_norm": 0.46384191513061523, "learning_rate": 0.0002, "epoch": 2.8085106382978724, "step": 1650}, {"loss": 1.5828, "grad_norm": 0.4885474443435669, "learning_rate": 0.0002, "epoch": 2.825531914893617, "step": 1660}, {"loss": 1.5675, "grad_norm": 0.45621681213378906, "learning_rate": 0.0002, "epoch": 2.842553191489362, "step": 1670}, {"loss": 1.6042, "grad_norm": 0.4797150194644928, "learning_rate": 0.0002, "epoch": 2.8595744680851065, "step": 1680}, {"loss": 1.5601, "grad_norm": 0.5142032504081726, "learning_rate": 0.0002, "epoch": 2.876595744680851, "step": 1690}, {"loss": 1.5984, "grad_norm": 0.48939862847328186, "learning_rate": 0.0002, "epoch": 2.8936170212765955, "step": 1700}, {"loss": 1.6333, "grad_norm": 0.4575578272342682, "learning_rate": 0.0002, "epoch": 2.9106382978723406, "step": 1710}, {"loss": 1.5396, "grad_norm": 0.5589063763618469, "learning_rate": 0.0002, "epoch": 2.927659574468085, "step": 1720}, {"loss": 1.6096, "grad_norm": 0.48508813977241516, "learning_rate": 0.0002, "epoch": 2.94468085106383, "step": 1730}, {"loss": 1.5686, "grad_norm": 0.42786726355552673, "learning_rate": 0.0002, "epoch": 2.9617021276595743, "step": 1740}, {"loss": 1.5555, "grad_norm": 0.5598229765892029, "learning_rate": 0.0002, "epoch": 2.978723404255319, "step": 1750}, {"loss": 1.5035, "grad_norm": 0.4779253602027893, "learning_rate": 0.0002, "epoch": 2.9957446808510637, "step": 1760}, {"eval_loss": 1.8543579578399658, "eval_runtime": 107.2363, "eval_samples_per_second": 4.802, "eval_steps_per_second": 0.606, "epoch": 2.999148936170213, "step": 1762}, {"loss": 1.4767, "grad_norm": 0.48810940980911255, "learning_rate": 0.0002, "epoch": 3.0127659574468084, "step": 1770}, {"loss": 1.5385, "grad_norm": 0.6194920539855957, "learning_rate": 0.0002, "epoch": 3.029787234042553, "step": 1780}, {"loss": 1.4012, "grad_norm": 0.5875462293624878, "learning_rate": 0.0002, "epoch": 3.046808510638298, "step": 1790}, {"loss": 1.4727, "grad_norm": 0.5775138139724731, "learning_rate": 0.0002, "epoch": 3.0638297872340425, "step": 1800}, {"loss": 1.493, "grad_norm": 0.5445981621742249, "learning_rate": 0.0002, "epoch": 3.0808510638297872, "step": 1810}, {"loss": 1.4247, "grad_norm": 0.6728862524032593, "learning_rate": 0.0002, "epoch": 3.097872340425532, "step": 1820}, {"loss": 1.4303, "grad_norm": 0.6105490326881409, "learning_rate": 0.0002, "epoch": 3.1148936170212767, "step": 1830}, {"loss": 1.5214, "grad_norm": 0.5771165490150452, "learning_rate": 0.0002, "epoch": 3.1319148936170214, "step": 1840}, {"loss": 1.4359, "grad_norm": 0.5778449773788452, "learning_rate": 0.0002, "epoch": 3.148936170212766, "step": 1850}, {"loss": 1.4121, "grad_norm": 0.7141990661621094, "learning_rate": 0.0002, "epoch": 3.1659574468085108, "step": 1860}, {"loss": 1.4904, "grad_norm": 0.5882705450057983, "learning_rate": 0.0002, "epoch": 3.1829787234042555, "step": 1870}, {"loss": 1.4941, "grad_norm": 0.5996195077896118, "learning_rate": 0.0002, "epoch": 3.2, "step": 1880}, {"loss": 1.4519, "grad_norm": 0.6121219396591187, "learning_rate": 0.0002, "epoch": 3.217021276595745, "step": 1890}, {"loss": 1.4586, "grad_norm": 0.6402981281280518, "learning_rate": 0.0002, "epoch": 3.2340425531914896, "step": 1900}, {"loss": 1.3766, "grad_norm": 0.6111783981323242, "learning_rate": 0.0002, "epoch": 3.251063829787234, "step": 1910}, {"loss": 1.4863, "grad_norm": 0.6682435274124146, "learning_rate": 0.0002, "epoch": 3.2680851063829786, "step": 1920}, {"loss": 1.4608, "grad_norm": 0.6530760526657104, "learning_rate": 0.0002, "epoch": 3.2851063829787233, "step": 1930}, {"loss": 1.4422, "grad_norm": 0.6481217741966248, "learning_rate": 0.0002, "epoch": 3.302127659574468, "step": 1940}, {"loss": 1.5158, "grad_norm": 0.6270697116851807, "learning_rate": 0.0002, "epoch": 3.3191489361702127, "step": 1950}, {"loss": 1.4116, "grad_norm": 0.5924492478370667, "learning_rate": 0.0002, "epoch": 3.3361702127659574, "step": 1960}, {"loss": 1.4578, "grad_norm": 0.5803806781768799, "learning_rate": 0.0002, "epoch": 3.353191489361702, "step": 1970}, {"loss": 1.4689, "grad_norm": 0.5754119157791138, "learning_rate": 0.0002, "epoch": 3.370212765957447, "step": 1980}, {"loss": 1.4605, "grad_norm": 0.6717178821563721, "learning_rate": 0.0002, "epoch": 3.3872340425531915, "step": 1990}, {"loss": 1.486, "grad_norm": 0.5955582857131958, "learning_rate": 0.0002, "epoch": 3.404255319148936, "step": 2000}, {"loss": 1.4445, "grad_norm": 0.6965329647064209, "learning_rate": 0.0002, "epoch": 3.421276595744681, "step": 2010}, {"loss": 1.4543, "grad_norm": 0.6321573257446289, "learning_rate": 0.0002, "epoch": 3.4382978723404256, "step": 2020}, {"loss": 1.5383, "grad_norm": 0.5952608585357666, "learning_rate": 0.0002, "epoch": 3.4553191489361703, "step": 2030}, {"loss": 1.4531, "grad_norm": 0.7718905806541443, "learning_rate": 0.0002, "epoch": 3.472340425531915, "step": 2040}, {"loss": 1.4678, "grad_norm": 0.6850892305374146, "learning_rate": 0.0002, "epoch": 3.4893617021276597, "step": 2050}, {"loss": 1.4956, "grad_norm": 0.5638895630836487, "learning_rate": 0.0002, "epoch": 3.506382978723404, "step": 2060}, {"loss": 1.4586, "grad_norm": 0.6148294806480408, "learning_rate": 0.0002, "epoch": 3.523404255319149, "step": 2070}, {"loss": 1.4622, "grad_norm": 0.5895810723304749, "learning_rate": 0.0002, "epoch": 3.5404255319148934, "step": 2080}, {"loss": 1.4341, "grad_norm": 0.6377319693565369, "learning_rate": 0.0002, "epoch": 3.5574468085106385, "step": 2090}, {"loss": 1.5056, "grad_norm": 0.6047691702842712, "learning_rate": 0.0002, "epoch": 3.574468085106383, "step": 2100}, {"loss": 1.4748, "grad_norm": 0.6049593687057495, "learning_rate": 0.0002, "epoch": 3.5914893617021275, "step": 2110}, {"loss": 1.391, "grad_norm": 0.6358312368392944, "learning_rate": 0.0002, "epoch": 3.608510638297872, "step": 2120}, {"loss": 1.4419, "grad_norm": 0.612119197845459, "learning_rate": 0.0002, "epoch": 3.625531914893617, "step": 2130}, {"loss": 1.438, "grad_norm": 0.6788054704666138, "learning_rate": 0.0002, "epoch": 3.6425531914893616, "step": 2140}, {"loss": 1.4295, "grad_norm": 0.6191043853759766, "learning_rate": 0.0002, "epoch": 3.6595744680851063, "step": 2150}, {"loss": 1.4383, "grad_norm": 0.6660051941871643, "learning_rate": 0.0002, "epoch": 3.676595744680851, "step": 2160}, {"loss": 1.4954, "grad_norm": 0.652692973613739, "learning_rate": 0.0002, "epoch": 3.6936170212765957, "step": 2170}, {"loss": 1.5245, "grad_norm": 0.6123467087745667, "learning_rate": 0.0002, "epoch": 3.7106382978723405, "step": 2180}, {"loss": 1.4686, "grad_norm": 0.640021562576294, "learning_rate": 0.0002, "epoch": 3.727659574468085, "step": 2190}, {"loss": 1.4277, "grad_norm": 0.6809179782867432, "learning_rate": 0.0002, "epoch": 3.74468085106383, "step": 2200}, {"loss": 1.4705, "grad_norm": 0.5978420376777649, "learning_rate": 0.0002, "epoch": 3.7617021276595746, "step": 2210}, {"loss": 1.5559, "grad_norm": 0.7038803100585938, "learning_rate": 0.0002, "epoch": 3.7787234042553193, "step": 2220}, {"loss": 1.4691, "grad_norm": 0.5324276089668274, "learning_rate": 0.0002, "epoch": 3.795744680851064, "step": 2230}, {"loss": 1.4696, "grad_norm": 0.6264132857322693, "learning_rate": 0.0002, "epoch": 3.8127659574468087, "step": 2240}, {"loss": 1.4856, "grad_norm": 0.6143888831138611, "learning_rate": 0.0002, "epoch": 3.829787234042553, "step": 2250}, {"loss": 1.535, "grad_norm": 0.6338503360748291, "learning_rate": 0.0002, "epoch": 3.846808510638298, "step": 2260}, {"loss": 1.456, "grad_norm": 0.556882381439209, "learning_rate": 0.0002, "epoch": 3.8638297872340424, "step": 2270}, {"loss": 1.4701, "grad_norm": 0.6323680281639099, "learning_rate": 0.0002, "epoch": 3.8808510638297875, "step": 2280}, {"loss": 1.5333, "grad_norm": 0.7105869054794312, "learning_rate": 0.0002, "epoch": 3.8978723404255318, "step": 2290}, {"loss": 1.4462, "grad_norm": 0.825415849685669, "learning_rate": 0.0002, "epoch": 3.9148936170212765, "step": 2300}, {"loss": 1.5023, "grad_norm": 0.6412091851234436, "learning_rate": 0.0002, "epoch": 3.931914893617021, "step": 2310}, {"loss": 1.3709, "grad_norm": 0.6286490559577942, "learning_rate": 0.0002, "epoch": 3.948936170212766, "step": 2320}, {"loss": 1.4693, "grad_norm": 0.636021077632904, "learning_rate": 0.0002, "epoch": 3.9659574468085106, "step": 2330}, {"loss": 1.4265, "grad_norm": 0.6032362580299377, "learning_rate": 0.0002, "epoch": 3.9829787234042553, "step": 2340}, {"loss": 1.377, "grad_norm": 0.6497282385826111, "learning_rate": 0.0002, "epoch": 4.0, "step": 2350}]} +{"epoch": 4.999148936170212, "step": 2937, "epoch_duration": 2053.5143167972565, "total_accumulated_duration": 10258.265464782715, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5996, "grad_norm": 0.7596228122711182, "learning_rate": 0.0002, "epoch": 0.01702127659574468, "step": 10}, {"loss": 2.2941, "grad_norm": 0.4860903322696686, "learning_rate": 0.0002, "epoch": 0.03404255319148936, "step": 20}, {"loss": 2.0018, "grad_norm": 0.4953401982784271, "learning_rate": 0.0002, "epoch": 0.05106382978723404, "step": 30}, {"loss": 1.9318, "grad_norm": 0.5086901783943176, "learning_rate": 0.0002, "epoch": 0.06808510638297872, "step": 40}, {"loss": 1.8519, "grad_norm": 0.49050021171569824, "learning_rate": 0.0002, "epoch": 0.0851063829787234, "step": 50}, {"loss": 1.8786, "grad_norm": 0.4922358989715576, "learning_rate": 0.0002, "epoch": 0.10212765957446808, "step": 60}, {"loss": 1.8812, "grad_norm": 0.4621541202068329, "learning_rate": 0.0002, "epoch": 0.11914893617021277, "step": 70}, {"loss": 1.8004, "grad_norm": 0.4416729807853699, "learning_rate": 0.0002, "epoch": 0.13617021276595745, "step": 80}, {"loss": 1.9298, "grad_norm": 0.526258111000061, "learning_rate": 0.0002, "epoch": 0.15319148936170213, "step": 90}, {"loss": 1.8339, "grad_norm": 0.44022637605667114, "learning_rate": 0.0002, "epoch": 0.1702127659574468, "step": 100}, {"loss": 1.7681, "grad_norm": 0.4647711515426636, "learning_rate": 0.0002, "epoch": 0.18723404255319148, "step": 110}, {"loss": 1.6953, "grad_norm": 0.4136318564414978, "learning_rate": 0.0002, "epoch": 0.20425531914893616, "step": 120}, {"loss": 1.8491, "grad_norm": 0.39707672595977783, "learning_rate": 0.0002, "epoch": 0.22127659574468084, "step": 130}, {"loss": 1.8017, "grad_norm": 0.4478105306625366, "learning_rate": 0.0002, "epoch": 0.23829787234042554, "step": 140}, {"loss": 1.8067, "grad_norm": 0.4699741303920746, "learning_rate": 0.0002, "epoch": 0.2553191489361702, "step": 150}, {"loss": 1.8161, "grad_norm": 0.4568363130092621, "learning_rate": 0.0002, "epoch": 0.2723404255319149, "step": 160}, {"loss": 1.7491, "grad_norm": 0.45078757405281067, "learning_rate": 0.0002, "epoch": 0.28936170212765955, "step": 170}, {"loss": 1.8115, "grad_norm": 0.4127245843410492, "learning_rate": 0.0002, "epoch": 0.30638297872340425, "step": 180}, {"loss": 1.7707, "grad_norm": 0.4042493402957916, "learning_rate": 0.0002, "epoch": 0.32340425531914896, "step": 190}, {"loss": 1.7807, "grad_norm": 0.401487797498703, "learning_rate": 0.0002, "epoch": 0.3404255319148936, "step": 200}, {"loss": 1.8355, "grad_norm": 0.3959457576274872, "learning_rate": 0.0002, "epoch": 0.3574468085106383, "step": 210}, {"loss": 1.7798, "grad_norm": 0.39865636825561523, "learning_rate": 0.0002, "epoch": 0.37446808510638296, "step": 220}, {"loss": 1.9012, "grad_norm": 0.7225169539451599, "learning_rate": 0.0002, "epoch": 0.39148936170212767, "step": 230}, {"loss": 1.7212, "grad_norm": 0.412801593542099, "learning_rate": 0.0002, "epoch": 0.4085106382978723, "step": 240}, {"loss": 1.8523, "grad_norm": 0.40951448678970337, "learning_rate": 0.0002, "epoch": 0.425531914893617, "step": 250}, {"loss": 1.7283, "grad_norm": 0.42788130044937134, "learning_rate": 0.0002, "epoch": 0.4425531914893617, "step": 260}, {"loss": 1.8051, "grad_norm": 0.41069576144218445, "learning_rate": 0.0002, "epoch": 0.4595744680851064, "step": 270}, {"loss": 1.7861, "grad_norm": 0.3745323717594147, "learning_rate": 0.0002, "epoch": 0.4765957446808511, "step": 280}, {"loss": 1.8484, "grad_norm": 0.3771323263645172, "learning_rate": 0.0002, "epoch": 0.49361702127659574, "step": 290}, {"loss": 1.7417, "grad_norm": 0.34368929266929626, "learning_rate": 0.0002, "epoch": 0.5106382978723404, "step": 300}, {"loss": 1.7394, "grad_norm": 0.4299296736717224, "learning_rate": 0.0002, "epoch": 0.5276595744680851, "step": 310}, {"loss": 1.8255, "grad_norm": 0.4133922755718231, "learning_rate": 0.0002, "epoch": 0.5446808510638298, "step": 320}, {"loss": 1.7043, "grad_norm": 0.3984859585762024, "learning_rate": 0.0002, "epoch": 0.5617021276595745, "step": 330}, {"loss": 1.7873, "grad_norm": 0.3822788894176483, "learning_rate": 0.0002, "epoch": 0.5787234042553191, "step": 340}, {"loss": 1.8082, "grad_norm": 0.4550061821937561, "learning_rate": 0.0002, "epoch": 0.5957446808510638, "step": 350}, {"loss": 1.7179, "grad_norm": 0.36571192741394043, "learning_rate": 0.0002, "epoch": 0.6127659574468085, "step": 360}, {"loss": 1.8196, "grad_norm": 0.32942914962768555, "learning_rate": 0.0002, "epoch": 0.6297872340425532, "step": 370}, {"loss": 1.7118, "grad_norm": 0.39299526810646057, "learning_rate": 0.0002, "epoch": 0.6468085106382979, "step": 380}, {"loss": 1.8179, "grad_norm": 0.3817657232284546, "learning_rate": 0.0002, "epoch": 0.6638297872340425, "step": 390}, {"loss": 1.8174, "grad_norm": 0.3650810122489929, "learning_rate": 0.0002, "epoch": 0.6808510638297872, "step": 400}, {"loss": 1.8026, "grad_norm": 0.3736686408519745, "learning_rate": 0.0002, "epoch": 0.6978723404255319, "step": 410}, {"loss": 1.802, "grad_norm": 0.45680564641952515, "learning_rate": 0.0002, "epoch": 0.7148936170212766, "step": 420}, {"loss": 1.7844, "grad_norm": 0.4154510200023651, "learning_rate": 0.0002, "epoch": 0.7319148936170212, "step": 430}, {"loss": 1.7801, "grad_norm": 0.3701167106628418, "learning_rate": 0.0002, "epoch": 0.7489361702127659, "step": 440}, {"loss": 1.7689, "grad_norm": 0.3869531750679016, "learning_rate": 0.0002, "epoch": 0.7659574468085106, "step": 450}, {"loss": 1.7671, "grad_norm": 0.4391495883464813, "learning_rate": 0.0002, "epoch": 0.7829787234042553, "step": 460}, {"loss": 1.7534, "grad_norm": 0.39652755856513977, "learning_rate": 0.0002, "epoch": 0.8, "step": 470}, {"loss": 1.8076, "grad_norm": 0.4096752107143402, "learning_rate": 0.0002, "epoch": 0.8170212765957446, "step": 480}, {"loss": 1.796, "grad_norm": 0.3857504427433014, "learning_rate": 0.0002, "epoch": 0.8340425531914893, "step": 490}, {"loss": 1.7379, "grad_norm": 0.4105374217033386, "learning_rate": 0.0002, "epoch": 0.851063829787234, "step": 500}, {"loss": 1.6391, "grad_norm": 0.3723328113555908, "learning_rate": 0.0002, "epoch": 0.8680851063829788, "step": 510}, {"loss": 1.7122, "grad_norm": 0.36099690198898315, "learning_rate": 0.0002, "epoch": 0.8851063829787233, "step": 520}, {"loss": 1.7632, "grad_norm": 0.3715187907218933, "learning_rate": 0.0002, "epoch": 0.902127659574468, "step": 530}, {"loss": 1.7004, "grad_norm": 0.4932813048362732, "learning_rate": 0.0002, "epoch": 0.9191489361702128, "step": 540}, {"loss": 1.679, "grad_norm": 0.3493495285511017, "learning_rate": 0.0002, "epoch": 0.9361702127659575, "step": 550}, {"loss": 1.7758, "grad_norm": 0.3598061800003052, "learning_rate": 0.0002, "epoch": 0.9531914893617022, "step": 560}, {"loss": 1.7686, "grad_norm": 0.3521560728549957, "learning_rate": 0.0002, "epoch": 0.9702127659574468, "step": 570}, {"loss": 1.7391, "grad_norm": 0.34150034189224243, "learning_rate": 0.0002, "epoch": 0.9872340425531915, "step": 580}, {"eval_loss": 1.8388911485671997, "eval_runtime": 106.6788, "eval_samples_per_second": 4.828, "eval_steps_per_second": 0.609, "epoch": 0.9991489361702127, "step": 587}, {"loss": 1.6115, "grad_norm": 0.33556103706359863, "learning_rate": 0.0002, "epoch": 1.004255319148936, "step": 590}, {"loss": 1.8346, "grad_norm": 0.4333398640155792, "learning_rate": 0.0002, "epoch": 1.0212765957446808, "step": 600}, {"loss": 1.6656, "grad_norm": 0.38488736748695374, "learning_rate": 0.0002, "epoch": 1.0382978723404255, "step": 610}, {"loss": 1.7778, "grad_norm": 0.44454529881477356, "learning_rate": 0.0002, "epoch": 1.0553191489361702, "step": 620}, {"loss": 1.7143, "grad_norm": 0.3735603392124176, "learning_rate": 0.0002, "epoch": 1.0723404255319149, "step": 630}, {"loss": 1.74, "grad_norm": 0.38912704586982727, "learning_rate": 0.0002, "epoch": 1.0893617021276596, "step": 640}, {"loss": 1.607, "grad_norm": 0.4411826431751251, "learning_rate": 0.0002, "epoch": 1.1063829787234043, "step": 650}, {"loss": 1.6901, "grad_norm": 0.4163050353527069, "learning_rate": 0.0002, "epoch": 1.123404255319149, "step": 660}, {"loss": 1.6722, "grad_norm": 0.4187192916870117, "learning_rate": 0.0002, "epoch": 1.1404255319148937, "step": 670}, {"loss": 1.7061, "grad_norm": 0.3797093629837036, "learning_rate": 0.0002, "epoch": 1.1574468085106382, "step": 680}, {"loss": 1.6409, "grad_norm": 0.4210026264190674, "learning_rate": 0.0002, "epoch": 1.174468085106383, "step": 690}, {"loss": 1.6343, "grad_norm": 0.4701998829841614, "learning_rate": 0.0002, "epoch": 1.1914893617021276, "step": 700}, {"loss": 1.6773, "grad_norm": 0.6331578493118286, "learning_rate": 0.0002, "epoch": 1.2085106382978723, "step": 710}, {"loss": 1.6776, "grad_norm": 0.41908255219459534, "learning_rate": 0.0002, "epoch": 1.225531914893617, "step": 720}, {"loss": 1.6468, "grad_norm": 0.36158403754234314, "learning_rate": 0.0002, "epoch": 1.2425531914893617, "step": 730}, {"loss": 1.746, "grad_norm": 0.387300580739975, "learning_rate": 0.0002, "epoch": 1.2595744680851064, "step": 740}, {"loss": 1.7089, "grad_norm": 0.38899728655815125, "learning_rate": 0.0002, "epoch": 1.2765957446808511, "step": 750}, {"loss": 1.6376, "grad_norm": 0.4549255073070526, "learning_rate": 0.0002, "epoch": 1.2936170212765958, "step": 760}, {"loss": 1.6754, "grad_norm": 0.4052349328994751, "learning_rate": 0.0002, "epoch": 1.3106382978723405, "step": 770}, {"loss": 1.6483, "grad_norm": 0.38934215903282166, "learning_rate": 0.0002, "epoch": 1.327659574468085, "step": 780}, {"loss": 1.7813, "grad_norm": 0.38688382506370544, "learning_rate": 0.0002, "epoch": 1.3446808510638297, "step": 790}, {"loss": 1.6374, "grad_norm": 0.3825705051422119, "learning_rate": 0.0002, "epoch": 1.3617021276595744, "step": 800}, {"loss": 1.655, "grad_norm": 0.37331756949424744, "learning_rate": 0.0002, "epoch": 1.3787234042553191, "step": 810}, {"loss": 1.6321, "grad_norm": 0.38826408982276917, "learning_rate": 0.0002, "epoch": 1.3957446808510638, "step": 820}, {"loss": 1.6107, "grad_norm": 0.4213569164276123, "learning_rate": 0.0002, "epoch": 1.4127659574468086, "step": 830}, {"loss": 1.6792, "grad_norm": 0.3976684808731079, "learning_rate": 0.0002, "epoch": 1.4297872340425533, "step": 840}, {"loss": 1.7029, "grad_norm": 0.390009343624115, "learning_rate": 0.0002, "epoch": 1.4468085106382977, "step": 850}, {"loss": 1.7489, "grad_norm": 0.4462052583694458, "learning_rate": 0.0002, "epoch": 1.4638297872340424, "step": 860}, {"loss": 1.7513, "grad_norm": 0.42129236459732056, "learning_rate": 0.0002, "epoch": 1.4808510638297872, "step": 870}, {"loss": 1.6009, "grad_norm": 0.41489893198013306, "learning_rate": 0.0002, "epoch": 1.4978723404255319, "step": 880}, {"loss": 1.7129, "grad_norm": 0.41451677680015564, "learning_rate": 0.0002, "epoch": 1.5148936170212766, "step": 890}, {"loss": 1.6427, "grad_norm": 0.4477299749851227, "learning_rate": 0.0002, "epoch": 1.5319148936170213, "step": 900}, {"loss": 1.6543, "grad_norm": 0.38476648926734924, "learning_rate": 0.0002, "epoch": 1.548936170212766, "step": 910}, {"loss": 1.7103, "grad_norm": 0.42755743861198425, "learning_rate": 0.0002, "epoch": 1.5659574468085107, "step": 920}, {"loss": 1.7014, "grad_norm": 0.39372023940086365, "learning_rate": 0.0002, "epoch": 1.5829787234042554, "step": 930}, {"loss": 1.7042, "grad_norm": 0.42778754234313965, "learning_rate": 0.0002, "epoch": 1.6, "step": 940}, {"loss": 1.7488, "grad_norm": 0.4217268228530884, "learning_rate": 0.0002, "epoch": 1.6170212765957448, "step": 950}, {"loss": 1.6556, "grad_norm": 0.40452107787132263, "learning_rate": 0.0002, "epoch": 1.6340425531914895, "step": 960}, {"loss": 1.6734, "grad_norm": 0.4259980022907257, "learning_rate": 0.0002, "epoch": 1.6510638297872342, "step": 970}, {"loss": 1.6464, "grad_norm": 0.4089849591255188, "learning_rate": 0.0002, "epoch": 1.6680851063829787, "step": 980}, {"loss": 1.6588, "grad_norm": 0.38276049494743347, "learning_rate": 0.0002, "epoch": 1.6851063829787234, "step": 990}, {"loss": 1.6846, "grad_norm": 0.40361565351486206, "learning_rate": 0.0002, "epoch": 1.702127659574468, "step": 1000}, {"loss": 1.698, "grad_norm": 0.3537807762622833, "learning_rate": 0.0002, "epoch": 1.7191489361702128, "step": 1010}, {"loss": 1.6001, "grad_norm": 0.40288347005844116, "learning_rate": 0.0002, "epoch": 1.7361702127659573, "step": 1020}, {"loss": 1.6375, "grad_norm": 0.4003616273403168, "learning_rate": 0.0002, "epoch": 1.753191489361702, "step": 1030}, {"loss": 1.6067, "grad_norm": 0.3931669592857361, "learning_rate": 0.0002, "epoch": 1.7702127659574467, "step": 1040}, {"loss": 1.6814, "grad_norm": 0.4001635015010834, "learning_rate": 0.0002, "epoch": 1.7872340425531914, "step": 1050}, {"loss": 1.6158, "grad_norm": 0.4139048457145691, "learning_rate": 0.0002, "epoch": 1.804255319148936, "step": 1060}, {"loss": 1.7513, "grad_norm": 0.5044458508491516, "learning_rate": 0.0002, "epoch": 1.8212765957446808, "step": 1070}, {"loss": 1.7317, "grad_norm": 0.4827095568180084, "learning_rate": 0.0002, "epoch": 1.8382978723404255, "step": 1080}, {"loss": 1.6586, "grad_norm": 0.3750515282154083, "learning_rate": 0.0002, "epoch": 1.8553191489361702, "step": 1090}, {"loss": 1.653, "grad_norm": 0.4024597704410553, "learning_rate": 0.0002, "epoch": 1.872340425531915, "step": 1100}, {"loss": 1.7138, "grad_norm": 0.36747241020202637, "learning_rate": 0.0002, "epoch": 1.8893617021276596, "step": 1110}, {"loss": 1.652, "grad_norm": 0.41397711634635925, "learning_rate": 0.0002, "epoch": 1.9063829787234043, "step": 1120}, {"loss": 1.7071, "grad_norm": 0.3960763216018677, "learning_rate": 0.0002, "epoch": 1.923404255319149, "step": 1130}, {"loss": 1.6857, "grad_norm": 0.4533233344554901, "learning_rate": 0.0002, "epoch": 1.9404255319148938, "step": 1140}, {"loss": 1.7168, "grad_norm": 0.38433438539505005, "learning_rate": 0.0002, "epoch": 1.9574468085106385, "step": 1150}, {"loss": 1.7444, "grad_norm": 0.3648812174797058, "learning_rate": 0.0002, "epoch": 1.974468085106383, "step": 1160}, {"loss": 1.6521, "grad_norm": 0.3887176215648651, "learning_rate": 0.0002, "epoch": 1.9914893617021276, "step": 1170}, {"eval_loss": 1.8328146934509277, "eval_runtime": 107.2842, "eval_samples_per_second": 4.8, "eval_steps_per_second": 0.606, "epoch": 2.0, "step": 1175}, {"loss": 1.6184, "grad_norm": 0.40444880723953247, "learning_rate": 0.0002, "epoch": 2.008510638297872, "step": 1180}, {"loss": 1.5221, "grad_norm": 0.3997816741466522, "learning_rate": 0.0002, "epoch": 2.025531914893617, "step": 1190}, {"loss": 1.6022, "grad_norm": 0.4516718089580536, "learning_rate": 0.0002, "epoch": 2.0425531914893615, "step": 1200}, {"loss": 1.5458, "grad_norm": 0.6645553708076477, "learning_rate": 0.0002, "epoch": 2.0595744680851062, "step": 1210}, {"loss": 1.5985, "grad_norm": 0.4181990921497345, "learning_rate": 0.0002, "epoch": 2.076595744680851, "step": 1220}, {"loss": 1.5768, "grad_norm": 0.45681431889533997, "learning_rate": 0.0002, "epoch": 2.0936170212765957, "step": 1230}, {"loss": 1.5598, "grad_norm": 0.48914700746536255, "learning_rate": 0.0002, "epoch": 2.1106382978723404, "step": 1240}, {"loss": 1.6031, "grad_norm": 0.43265485763549805, "learning_rate": 0.0002, "epoch": 2.127659574468085, "step": 1250}, {"loss": 1.5812, "grad_norm": 0.4641207754611969, "learning_rate": 0.0002, "epoch": 2.1446808510638298, "step": 1260}, {"loss": 1.5686, "grad_norm": 0.4840783476829529, "learning_rate": 0.0002, "epoch": 2.1617021276595745, "step": 1270}, {"loss": 1.5969, "grad_norm": 0.4974595308303833, "learning_rate": 0.0002, "epoch": 2.178723404255319, "step": 1280}, {"loss": 1.5512, "grad_norm": 0.5133475661277771, "learning_rate": 0.0002, "epoch": 2.195744680851064, "step": 1290}, {"loss": 1.6467, "grad_norm": 0.5030052065849304, "learning_rate": 0.0002, "epoch": 2.2127659574468086, "step": 1300}, {"loss": 1.6845, "grad_norm": 0.46602481603622437, "learning_rate": 0.0002, "epoch": 2.2297872340425533, "step": 1310}, {"loss": 1.6132, "grad_norm": 0.43662378191947937, "learning_rate": 0.0002, "epoch": 2.246808510638298, "step": 1320}, {"loss": 1.6223, "grad_norm": 0.5137454867362976, "learning_rate": 0.0002, "epoch": 2.2638297872340427, "step": 1330}, {"loss": 1.5702, "grad_norm": 0.4750335216522217, "learning_rate": 0.0002, "epoch": 2.2808510638297874, "step": 1340}, {"loss": 1.6479, "grad_norm": 0.43691426515579224, "learning_rate": 0.0002, "epoch": 2.297872340425532, "step": 1350}, {"loss": 1.5561, "grad_norm": 0.49752047657966614, "learning_rate": 0.0002, "epoch": 2.3148936170212764, "step": 1360}, {"loss": 1.5789, "grad_norm": 0.45101815462112427, "learning_rate": 0.0002, "epoch": 2.331914893617021, "step": 1370}, {"loss": 1.5858, "grad_norm": 0.4427817761898041, "learning_rate": 0.0002, "epoch": 2.348936170212766, "step": 1380}, {"loss": 1.5537, "grad_norm": 0.4802311062812805, "learning_rate": 0.0002, "epoch": 2.3659574468085105, "step": 1390}, {"loss": 1.5846, "grad_norm": 0.4512513279914856, "learning_rate": 0.0002, "epoch": 2.382978723404255, "step": 1400}, {"loss": 1.627, "grad_norm": 0.4878857135772705, "learning_rate": 0.0002, "epoch": 2.4, "step": 1410}, {"loss": 1.5781, "grad_norm": 0.4741315543651581, "learning_rate": 0.0002, "epoch": 2.4170212765957446, "step": 1420}, {"loss": 1.5595, "grad_norm": 0.4770931601524353, "learning_rate": 0.0002, "epoch": 2.4340425531914893, "step": 1430}, {"loss": 1.5336, "grad_norm": 0.5124667286872864, "learning_rate": 0.0002, "epoch": 2.451063829787234, "step": 1440}, {"loss": 1.5811, "grad_norm": 0.45264801383018494, "learning_rate": 0.0002, "epoch": 2.4680851063829787, "step": 1450}, {"loss": 1.604, "grad_norm": 0.5456924438476562, "learning_rate": 0.0002, "epoch": 2.4851063829787234, "step": 1460}, {"loss": 1.6063, "grad_norm": 0.44656285643577576, "learning_rate": 0.0002, "epoch": 2.502127659574468, "step": 1470}, {"loss": 1.6336, "grad_norm": 0.5939419865608215, "learning_rate": 0.0002, "epoch": 2.519148936170213, "step": 1480}, {"loss": 1.5481, "grad_norm": 0.47853362560272217, "learning_rate": 0.0002, "epoch": 2.5361702127659576, "step": 1490}, {"loss": 1.6543, "grad_norm": 0.47643396258354187, "learning_rate": 0.0002, "epoch": 2.5531914893617023, "step": 1500}, {"loss": 1.5085, "grad_norm": 0.4939501881599426, "learning_rate": 0.0002, "epoch": 2.570212765957447, "step": 1510}, {"loss": 1.5659, "grad_norm": 0.502055287361145, "learning_rate": 0.0002, "epoch": 2.5872340425531917, "step": 1520}, {"loss": 1.6519, "grad_norm": 0.463250994682312, "learning_rate": 0.0002, "epoch": 2.604255319148936, "step": 1530}, {"loss": 1.5698, "grad_norm": 0.4761098623275757, "learning_rate": 0.0002, "epoch": 2.621276595744681, "step": 1540}, {"loss": 1.6044, "grad_norm": 0.4687299132347107, "learning_rate": 0.0002, "epoch": 2.6382978723404253, "step": 1550}, {"loss": 1.632, "grad_norm": 0.5536078810691833, "learning_rate": 0.0002, "epoch": 2.65531914893617, "step": 1560}, {"loss": 1.586, "grad_norm": 0.581320583820343, "learning_rate": 0.0002, "epoch": 2.6723404255319148, "step": 1570}, {"loss": 1.508, "grad_norm": 0.45952868461608887, "learning_rate": 0.0002, "epoch": 2.6893617021276595, "step": 1580}, {"loss": 1.5905, "grad_norm": 0.4602586328983307, "learning_rate": 0.0002, "epoch": 2.706382978723404, "step": 1590}, {"loss": 1.6008, "grad_norm": 0.5276554226875305, "learning_rate": 0.0002, "epoch": 2.723404255319149, "step": 1600}, {"loss": 1.7216, "grad_norm": 0.5750249624252319, "learning_rate": 0.0002, "epoch": 2.7404255319148936, "step": 1610}, {"loss": 1.5506, "grad_norm": 0.468723863363266, "learning_rate": 0.0002, "epoch": 2.7574468085106383, "step": 1620}, {"loss": 1.4499, "grad_norm": 0.44649943709373474, "learning_rate": 0.0002, "epoch": 2.774468085106383, "step": 1630}, {"loss": 1.5106, "grad_norm": 0.5097237825393677, "learning_rate": 0.0002, "epoch": 2.7914893617021277, "step": 1640}, {"loss": 1.5948, "grad_norm": 0.46384191513061523, "learning_rate": 0.0002, "epoch": 2.8085106382978724, "step": 1650}, {"loss": 1.5828, "grad_norm": 0.4885474443435669, "learning_rate": 0.0002, "epoch": 2.825531914893617, "step": 1660}, {"loss": 1.5675, "grad_norm": 0.45621681213378906, "learning_rate": 0.0002, "epoch": 2.842553191489362, "step": 1670}, {"loss": 1.6042, "grad_norm": 0.4797150194644928, "learning_rate": 0.0002, "epoch": 2.8595744680851065, "step": 1680}, {"loss": 1.5601, "grad_norm": 0.5142032504081726, "learning_rate": 0.0002, "epoch": 2.876595744680851, "step": 1690}, {"loss": 1.5984, "grad_norm": 0.48939862847328186, "learning_rate": 0.0002, "epoch": 2.8936170212765955, "step": 1700}, {"loss": 1.6333, "grad_norm": 0.4575578272342682, "learning_rate": 0.0002, "epoch": 2.9106382978723406, "step": 1710}, {"loss": 1.5396, "grad_norm": 0.5589063763618469, "learning_rate": 0.0002, "epoch": 2.927659574468085, "step": 1720}, {"loss": 1.6096, "grad_norm": 0.48508813977241516, "learning_rate": 0.0002, "epoch": 2.94468085106383, "step": 1730}, {"loss": 1.5686, "grad_norm": 0.42786726355552673, "learning_rate": 0.0002, "epoch": 2.9617021276595743, "step": 1740}, {"loss": 1.5555, "grad_norm": 0.5598229765892029, "learning_rate": 0.0002, "epoch": 2.978723404255319, "step": 1750}, {"loss": 1.5035, "grad_norm": 0.4779253602027893, "learning_rate": 0.0002, "epoch": 2.9957446808510637, "step": 1760}, {"eval_loss": 1.8543579578399658, "eval_runtime": 107.2363, "eval_samples_per_second": 4.802, "eval_steps_per_second": 0.606, "epoch": 2.999148936170213, "step": 1762}, {"loss": 1.4767, "grad_norm": 0.48810940980911255, "learning_rate": 0.0002, "epoch": 3.0127659574468084, "step": 1770}, {"loss": 1.5385, "grad_norm": 0.6194920539855957, "learning_rate": 0.0002, "epoch": 3.029787234042553, "step": 1780}, {"loss": 1.4012, "grad_norm": 0.5875462293624878, "learning_rate": 0.0002, "epoch": 3.046808510638298, "step": 1790}, {"loss": 1.4727, "grad_norm": 0.5775138139724731, "learning_rate": 0.0002, "epoch": 3.0638297872340425, "step": 1800}, {"loss": 1.493, "grad_norm": 0.5445981621742249, "learning_rate": 0.0002, "epoch": 3.0808510638297872, "step": 1810}, {"loss": 1.4247, "grad_norm": 0.6728862524032593, "learning_rate": 0.0002, "epoch": 3.097872340425532, "step": 1820}, {"loss": 1.4303, "grad_norm": 0.6105490326881409, "learning_rate": 0.0002, "epoch": 3.1148936170212767, "step": 1830}, {"loss": 1.5214, "grad_norm": 0.5771165490150452, "learning_rate": 0.0002, "epoch": 3.1319148936170214, "step": 1840}, {"loss": 1.4359, "grad_norm": 0.5778449773788452, "learning_rate": 0.0002, "epoch": 3.148936170212766, "step": 1850}, {"loss": 1.4121, "grad_norm": 0.7141990661621094, "learning_rate": 0.0002, "epoch": 3.1659574468085108, "step": 1860}, {"loss": 1.4904, "grad_norm": 0.5882705450057983, "learning_rate": 0.0002, "epoch": 3.1829787234042555, "step": 1870}, {"loss": 1.4941, "grad_norm": 0.5996195077896118, "learning_rate": 0.0002, "epoch": 3.2, "step": 1880}, {"loss": 1.4519, "grad_norm": 0.6121219396591187, "learning_rate": 0.0002, "epoch": 3.217021276595745, "step": 1890}, {"loss": 1.4586, "grad_norm": 0.6402981281280518, "learning_rate": 0.0002, "epoch": 3.2340425531914896, "step": 1900}, {"loss": 1.3766, "grad_norm": 0.6111783981323242, "learning_rate": 0.0002, "epoch": 3.251063829787234, "step": 1910}, {"loss": 1.4863, "grad_norm": 0.6682435274124146, "learning_rate": 0.0002, "epoch": 3.2680851063829786, "step": 1920}, {"loss": 1.4608, "grad_norm": 0.6530760526657104, "learning_rate": 0.0002, "epoch": 3.2851063829787233, "step": 1930}, {"loss": 1.4422, "grad_norm": 0.6481217741966248, "learning_rate": 0.0002, "epoch": 3.302127659574468, "step": 1940}, {"loss": 1.5158, "grad_norm": 0.6270697116851807, "learning_rate": 0.0002, "epoch": 3.3191489361702127, "step": 1950}, {"loss": 1.4116, "grad_norm": 0.5924492478370667, "learning_rate": 0.0002, "epoch": 3.3361702127659574, "step": 1960}, {"loss": 1.4578, "grad_norm": 0.5803806781768799, "learning_rate": 0.0002, "epoch": 3.353191489361702, "step": 1970}, {"loss": 1.4689, "grad_norm": 0.5754119157791138, "learning_rate": 0.0002, "epoch": 3.370212765957447, "step": 1980}, {"loss": 1.4605, "grad_norm": 0.6717178821563721, "learning_rate": 0.0002, "epoch": 3.3872340425531915, "step": 1990}, {"loss": 1.486, "grad_norm": 0.5955582857131958, "learning_rate": 0.0002, "epoch": 3.404255319148936, "step": 2000}, {"loss": 1.4445, "grad_norm": 0.6965329647064209, "learning_rate": 0.0002, "epoch": 3.421276595744681, "step": 2010}, {"loss": 1.4543, "grad_norm": 0.6321573257446289, "learning_rate": 0.0002, "epoch": 3.4382978723404256, "step": 2020}, {"loss": 1.5383, "grad_norm": 0.5952608585357666, "learning_rate": 0.0002, "epoch": 3.4553191489361703, "step": 2030}, {"loss": 1.4531, "grad_norm": 0.7718905806541443, "learning_rate": 0.0002, "epoch": 3.472340425531915, "step": 2040}, {"loss": 1.4678, "grad_norm": 0.6850892305374146, "learning_rate": 0.0002, "epoch": 3.4893617021276597, "step": 2050}, {"loss": 1.4956, "grad_norm": 0.5638895630836487, "learning_rate": 0.0002, "epoch": 3.506382978723404, "step": 2060}, {"loss": 1.4586, "grad_norm": 0.6148294806480408, "learning_rate": 0.0002, "epoch": 3.523404255319149, "step": 2070}, {"loss": 1.4622, "grad_norm": 0.5895810723304749, "learning_rate": 0.0002, "epoch": 3.5404255319148934, "step": 2080}, {"loss": 1.4341, "grad_norm": 0.6377319693565369, "learning_rate": 0.0002, "epoch": 3.5574468085106385, "step": 2090}, {"loss": 1.5056, "grad_norm": 0.6047691702842712, "learning_rate": 0.0002, "epoch": 3.574468085106383, "step": 2100}, {"loss": 1.4748, "grad_norm": 0.6049593687057495, "learning_rate": 0.0002, "epoch": 3.5914893617021275, "step": 2110}, {"loss": 1.391, "grad_norm": 0.6358312368392944, "learning_rate": 0.0002, "epoch": 3.608510638297872, "step": 2120}, {"loss": 1.4419, "grad_norm": 0.612119197845459, "learning_rate": 0.0002, "epoch": 3.625531914893617, "step": 2130}, {"loss": 1.438, "grad_norm": 0.6788054704666138, "learning_rate": 0.0002, "epoch": 3.6425531914893616, "step": 2140}, {"loss": 1.4295, "grad_norm": 0.6191043853759766, "learning_rate": 0.0002, "epoch": 3.6595744680851063, "step": 2150}, {"loss": 1.4383, "grad_norm": 0.6660051941871643, "learning_rate": 0.0002, "epoch": 3.676595744680851, "step": 2160}, {"loss": 1.4954, "grad_norm": 0.652692973613739, "learning_rate": 0.0002, "epoch": 3.6936170212765957, "step": 2170}, {"loss": 1.5245, "grad_norm": 0.6123467087745667, "learning_rate": 0.0002, "epoch": 3.7106382978723405, "step": 2180}, {"loss": 1.4686, "grad_norm": 0.640021562576294, "learning_rate": 0.0002, "epoch": 3.727659574468085, "step": 2190}, {"loss": 1.4277, "grad_norm": 0.6809179782867432, "learning_rate": 0.0002, "epoch": 3.74468085106383, "step": 2200}, {"loss": 1.4705, "grad_norm": 0.5978420376777649, "learning_rate": 0.0002, "epoch": 3.7617021276595746, "step": 2210}, {"loss": 1.5559, "grad_norm": 0.7038803100585938, "learning_rate": 0.0002, "epoch": 3.7787234042553193, "step": 2220}, {"loss": 1.4691, "grad_norm": 0.5324276089668274, "learning_rate": 0.0002, "epoch": 3.795744680851064, "step": 2230}, {"loss": 1.4696, "grad_norm": 0.6264132857322693, "learning_rate": 0.0002, "epoch": 3.8127659574468087, "step": 2240}, {"loss": 1.4856, "grad_norm": 0.6143888831138611, "learning_rate": 0.0002, "epoch": 3.829787234042553, "step": 2250}, {"loss": 1.535, "grad_norm": 0.6338503360748291, "learning_rate": 0.0002, "epoch": 3.846808510638298, "step": 2260}, {"loss": 1.456, "grad_norm": 0.556882381439209, "learning_rate": 0.0002, "epoch": 3.8638297872340424, "step": 2270}, {"loss": 1.4701, "grad_norm": 0.6323680281639099, "learning_rate": 0.0002, "epoch": 3.8808510638297875, "step": 2280}, {"loss": 1.5333, "grad_norm": 0.7105869054794312, "learning_rate": 0.0002, "epoch": 3.8978723404255318, "step": 2290}, {"loss": 1.4462, "grad_norm": 0.825415849685669, "learning_rate": 0.0002, "epoch": 3.9148936170212765, "step": 2300}, {"loss": 1.5023, "grad_norm": 0.6412091851234436, "learning_rate": 0.0002, "epoch": 3.931914893617021, "step": 2310}, {"loss": 1.3709, "grad_norm": 0.6286490559577942, "learning_rate": 0.0002, "epoch": 3.948936170212766, "step": 2320}, {"loss": 1.4693, "grad_norm": 0.636021077632904, "learning_rate": 0.0002, "epoch": 3.9659574468085106, "step": 2330}, {"loss": 1.4265, "grad_norm": 0.6032362580299377, "learning_rate": 0.0002, "epoch": 3.9829787234042553, "step": 2340}, {"loss": 1.377, "grad_norm": 0.6497282385826111, "learning_rate": 0.0002, "epoch": 4.0, "step": 2350}, {"eval_loss": 1.9081238508224487, "eval_runtime": 106.6404, "eval_samples_per_second": 4.829, "eval_steps_per_second": 0.61, "epoch": 4.0, "step": 2350}, {"loss": 1.317, "grad_norm": 0.6278848648071289, "learning_rate": 0.0002, "epoch": 4.017021276595744, "step": 2360}, {"loss": 1.3229, "grad_norm": 0.8259812593460083, "learning_rate": 0.0002, "epoch": 4.034042553191489, "step": 2370}, {"loss": 1.2776, "grad_norm": 0.7269589304924011, "learning_rate": 0.0002, "epoch": 4.051063829787234, "step": 2380}, {"loss": 1.3668, "grad_norm": 0.7460662126541138, "learning_rate": 0.0002, "epoch": 4.068085106382979, "step": 2390}, {"loss": 1.3096, "grad_norm": 1.2362046241760254, "learning_rate": 0.0002, "epoch": 4.085106382978723, "step": 2400}, {"loss": 1.2906, "grad_norm": 0.7699568867683411, "learning_rate": 0.0002, "epoch": 4.102127659574468, "step": 2410}, {"loss": 1.3005, "grad_norm": 0.8732489347457886, "learning_rate": 0.0002, "epoch": 4.1191489361702125, "step": 2420}, {"loss": 1.2741, "grad_norm": 0.8331889510154724, "learning_rate": 0.0002, "epoch": 4.136170212765958, "step": 2430}, {"loss": 1.1861, "grad_norm": 0.6686427593231201, "learning_rate": 0.0002, "epoch": 4.153191489361702, "step": 2440}, {"loss": 1.316, "grad_norm": 0.906380832195282, "learning_rate": 0.0002, "epoch": 4.170212765957447, "step": 2450}, {"loss": 1.3134, "grad_norm": 0.7269753813743591, "learning_rate": 0.0002, "epoch": 4.187234042553191, "step": 2460}, {"loss": 1.299, "grad_norm": 0.8556067943572998, "learning_rate": 0.0002, "epoch": 4.2042553191489365, "step": 2470}, {"loss": 1.2935, "grad_norm": 0.7076917886734009, "learning_rate": 0.0002, "epoch": 4.221276595744681, "step": 2480}, {"loss": 1.2608, "grad_norm": 0.7596837282180786, "learning_rate": 0.0002, "epoch": 4.238297872340426, "step": 2490}, {"loss": 1.2747, "grad_norm": 0.7790552377700806, "learning_rate": 0.0002, "epoch": 4.25531914893617, "step": 2500}, {"loss": 1.3438, "grad_norm": 0.8205534219741821, "learning_rate": 0.0002, "epoch": 4.272340425531915, "step": 2510}, {"loss": 1.3058, "grad_norm": 0.7892114520072937, "learning_rate": 0.0002, "epoch": 4.2893617021276595, "step": 2520}, {"loss": 1.3662, "grad_norm": 0.8907270431518555, "learning_rate": 0.0002, "epoch": 4.306382978723404, "step": 2530}, {"loss": 1.3168, "grad_norm": 0.821794331073761, "learning_rate": 0.0002, "epoch": 4.323404255319149, "step": 2540}, {"loss": 1.2467, "grad_norm": 0.7305247783660889, "learning_rate": 0.0002, "epoch": 4.340425531914893, "step": 2550}, {"loss": 1.3446, "grad_norm": 0.8639982342720032, "learning_rate": 0.0002, "epoch": 4.357446808510638, "step": 2560}, {"loss": 1.3863, "grad_norm": 0.8883494138717651, "learning_rate": 0.0002, "epoch": 4.374468085106383, "step": 2570}, {"loss": 1.3693, "grad_norm": 0.7611730098724365, "learning_rate": 0.0002, "epoch": 4.391489361702128, "step": 2580}, {"loss": 1.2814, "grad_norm": 0.7793022394180298, "learning_rate": 0.0002, "epoch": 4.408510638297872, "step": 2590}, {"loss": 1.3014, "grad_norm": 0.979060173034668, "learning_rate": 0.0002, "epoch": 4.425531914893617, "step": 2600}, {"loss": 1.3625, "grad_norm": 0.8320847749710083, "learning_rate": 0.0002, "epoch": 4.4425531914893615, "step": 2610}, {"loss": 1.3362, "grad_norm": 0.7481992244720459, "learning_rate": 0.0002, "epoch": 4.459574468085107, "step": 2620}, {"loss": 1.4037, "grad_norm": 0.783770740032196, "learning_rate": 0.0002, "epoch": 4.476595744680851, "step": 2630}, {"loss": 1.3049, "grad_norm": 0.773295521736145, "learning_rate": 0.0002, "epoch": 4.493617021276596, "step": 2640}, {"loss": 1.2739, "grad_norm": 0.9206840991973877, "learning_rate": 0.0002, "epoch": 4.51063829787234, "step": 2650}, {"loss": 1.3248, "grad_norm": 0.8803266882896423, "learning_rate": 0.0002, "epoch": 4.527659574468085, "step": 2660}, {"loss": 1.2898, "grad_norm": 0.9315535426139832, "learning_rate": 0.0002, "epoch": 4.54468085106383, "step": 2670}, {"loss": 1.316, "grad_norm": 0.8610678315162659, "learning_rate": 0.0002, "epoch": 4.561702127659575, "step": 2680}, {"loss": 1.2633, "grad_norm": 0.7405551671981812, "learning_rate": 0.0002, "epoch": 4.578723404255319, "step": 2690}, {"loss": 1.3136, "grad_norm": 1.0238394737243652, "learning_rate": 0.0002, "epoch": 4.595744680851064, "step": 2700}, {"loss": 1.4847, "grad_norm": 0.7814345955848694, "learning_rate": 0.0002, "epoch": 4.6127659574468085, "step": 2710}, {"loss": 1.295, "grad_norm": 0.8436329364776611, "learning_rate": 0.0002, "epoch": 4.629787234042553, "step": 2720}, {"loss": 1.3525, "grad_norm": 0.727214515209198, "learning_rate": 0.0002, "epoch": 4.646808510638298, "step": 2730}, {"loss": 1.3878, "grad_norm": 0.8465878367424011, "learning_rate": 0.0002, "epoch": 4.663829787234042, "step": 2740}, {"loss": 1.278, "grad_norm": 0.8218137621879578, "learning_rate": 0.0002, "epoch": 4.680851063829787, "step": 2750}, {"loss": 1.3628, "grad_norm": 0.7900442481040955, "learning_rate": 0.0002, "epoch": 4.697872340425532, "step": 2760}, {"loss": 1.3494, "grad_norm": 0.8214074969291687, "learning_rate": 0.0002, "epoch": 4.714893617021277, "step": 2770}, {"loss": 1.3954, "grad_norm": 0.7509574890136719, "learning_rate": 0.0002, "epoch": 4.731914893617021, "step": 2780}, {"loss": 1.3693, "grad_norm": 0.7416139245033264, "learning_rate": 0.0002, "epoch": 4.748936170212766, "step": 2790}, {"loss": 1.3045, "grad_norm": 0.8629977107048035, "learning_rate": 0.0002, "epoch": 4.76595744680851, "step": 2800}, {"loss": 1.3164, "grad_norm": 0.8056505918502808, "learning_rate": 0.0002, "epoch": 4.782978723404256, "step": 2810}, {"loss": 1.3056, "grad_norm": 0.7705401182174683, "learning_rate": 0.0002, "epoch": 4.8, "step": 2820}, {"loss": 1.3771, "grad_norm": 1.0173288583755493, "learning_rate": 0.0002, "epoch": 4.817021276595745, "step": 2830}, {"loss": 1.3494, "grad_norm": 0.8375823497772217, "learning_rate": 0.0002, "epoch": 4.834042553191489, "step": 2840}, {"loss": 1.3238, "grad_norm": 0.857073187828064, "learning_rate": 0.0002, "epoch": 4.851063829787234, "step": 2850}, {"loss": 1.2964, "grad_norm": 0.8672189712524414, "learning_rate": 0.0002, "epoch": 4.868085106382979, "step": 2860}, {"loss": 1.3646, "grad_norm": 0.8599910140037537, "learning_rate": 0.0002, "epoch": 4.885106382978723, "step": 2870}, {"loss": 1.3575, "grad_norm": 0.8844674229621887, "learning_rate": 0.0002, "epoch": 4.902127659574468, "step": 2880}, {"loss": 1.285, "grad_norm": 0.8246751427650452, "learning_rate": 0.0002, "epoch": 4.919148936170213, "step": 2890}, {"loss": 1.4116, "grad_norm": 0.8648163676261902, "learning_rate": 0.0002, "epoch": 4.9361702127659575, "step": 2900}, {"loss": 1.2614, "grad_norm": 0.9477900266647339, "learning_rate": 0.0002, "epoch": 4.953191489361702, "step": 2910}, {"loss": 1.3519, "grad_norm": 0.8047965168952942, "learning_rate": 0.0002, "epoch": 4.970212765957447, "step": 2920}, {"loss": 1.3889, "grad_norm": 0.9872494339942932, "learning_rate": 0.0002, "epoch": 4.987234042553191, "step": 2930}]} +{"epoch": 6.0, "step": 3525, "epoch_duration": 1993.1133170127869, "total_accumulated_duration": 12251.378781795502, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5996, "grad_norm": 0.7596228122711182, "learning_rate": 0.0002, "epoch": 0.01702127659574468, "step": 10}, {"loss": 2.2941, "grad_norm": 0.4860903322696686, "learning_rate": 0.0002, "epoch": 0.03404255319148936, "step": 20}, {"loss": 2.0018, "grad_norm": 0.4953401982784271, "learning_rate": 0.0002, "epoch": 0.05106382978723404, "step": 30}, {"loss": 1.9318, "grad_norm": 0.5086901783943176, "learning_rate": 0.0002, "epoch": 0.06808510638297872, "step": 40}, {"loss": 1.8519, "grad_norm": 0.49050021171569824, "learning_rate": 0.0002, "epoch": 0.0851063829787234, "step": 50}, {"loss": 1.8786, "grad_norm": 0.4922358989715576, "learning_rate": 0.0002, "epoch": 0.10212765957446808, "step": 60}, {"loss": 1.8812, "grad_norm": 0.4621541202068329, "learning_rate": 0.0002, "epoch": 0.11914893617021277, "step": 70}, {"loss": 1.8004, "grad_norm": 0.4416729807853699, "learning_rate": 0.0002, "epoch": 0.13617021276595745, "step": 80}, {"loss": 1.9298, "grad_norm": 0.526258111000061, "learning_rate": 0.0002, "epoch": 0.15319148936170213, "step": 90}, {"loss": 1.8339, "grad_norm": 0.44022637605667114, "learning_rate": 0.0002, "epoch": 0.1702127659574468, "step": 100}, {"loss": 1.7681, "grad_norm": 0.4647711515426636, "learning_rate": 0.0002, "epoch": 0.18723404255319148, "step": 110}, {"loss": 1.6953, "grad_norm": 0.4136318564414978, "learning_rate": 0.0002, "epoch": 0.20425531914893616, "step": 120}, {"loss": 1.8491, "grad_norm": 0.39707672595977783, "learning_rate": 0.0002, "epoch": 0.22127659574468084, "step": 130}, {"loss": 1.8017, "grad_norm": 0.4478105306625366, "learning_rate": 0.0002, "epoch": 0.23829787234042554, "step": 140}, {"loss": 1.8067, "grad_norm": 0.4699741303920746, "learning_rate": 0.0002, "epoch": 0.2553191489361702, "step": 150}, {"loss": 1.8161, "grad_norm": 0.4568363130092621, "learning_rate": 0.0002, "epoch": 0.2723404255319149, "step": 160}, {"loss": 1.7491, "grad_norm": 0.45078757405281067, "learning_rate": 0.0002, "epoch": 0.28936170212765955, "step": 170}, {"loss": 1.8115, "grad_norm": 0.4127245843410492, "learning_rate": 0.0002, "epoch": 0.30638297872340425, "step": 180}, {"loss": 1.7707, "grad_norm": 0.4042493402957916, "learning_rate": 0.0002, "epoch": 0.32340425531914896, "step": 190}, {"loss": 1.7807, "grad_norm": 0.401487797498703, "learning_rate": 0.0002, "epoch": 0.3404255319148936, "step": 200}, {"loss": 1.8355, "grad_norm": 0.3959457576274872, "learning_rate": 0.0002, "epoch": 0.3574468085106383, "step": 210}, {"loss": 1.7798, "grad_norm": 0.39865636825561523, "learning_rate": 0.0002, "epoch": 0.37446808510638296, "step": 220}, {"loss": 1.9012, "grad_norm": 0.7225169539451599, "learning_rate": 0.0002, "epoch": 0.39148936170212767, "step": 230}, {"loss": 1.7212, "grad_norm": 0.412801593542099, "learning_rate": 0.0002, "epoch": 0.4085106382978723, "step": 240}, {"loss": 1.8523, "grad_norm": 0.40951448678970337, "learning_rate": 0.0002, "epoch": 0.425531914893617, "step": 250}, {"loss": 1.7283, "grad_norm": 0.42788130044937134, "learning_rate": 0.0002, "epoch": 0.4425531914893617, "step": 260}, {"loss": 1.8051, "grad_norm": 0.41069576144218445, "learning_rate": 0.0002, "epoch": 0.4595744680851064, "step": 270}, {"loss": 1.7861, "grad_norm": 0.3745323717594147, "learning_rate": 0.0002, "epoch": 0.4765957446808511, "step": 280}, {"loss": 1.8484, "grad_norm": 0.3771323263645172, "learning_rate": 0.0002, "epoch": 0.49361702127659574, "step": 290}, {"loss": 1.7417, "grad_norm": 0.34368929266929626, "learning_rate": 0.0002, "epoch": 0.5106382978723404, "step": 300}, {"loss": 1.7394, "grad_norm": 0.4299296736717224, "learning_rate": 0.0002, "epoch": 0.5276595744680851, "step": 310}, {"loss": 1.8255, "grad_norm": 0.4133922755718231, "learning_rate": 0.0002, "epoch": 0.5446808510638298, "step": 320}, {"loss": 1.7043, "grad_norm": 0.3984859585762024, "learning_rate": 0.0002, "epoch": 0.5617021276595745, "step": 330}, {"loss": 1.7873, "grad_norm": 0.3822788894176483, "learning_rate": 0.0002, "epoch": 0.5787234042553191, "step": 340}, {"loss": 1.8082, "grad_norm": 0.4550061821937561, "learning_rate": 0.0002, "epoch": 0.5957446808510638, "step": 350}, {"loss": 1.7179, "grad_norm": 0.36571192741394043, "learning_rate": 0.0002, "epoch": 0.6127659574468085, "step": 360}, {"loss": 1.8196, "grad_norm": 0.32942914962768555, "learning_rate": 0.0002, "epoch": 0.6297872340425532, "step": 370}, {"loss": 1.7118, "grad_norm": 0.39299526810646057, "learning_rate": 0.0002, "epoch": 0.6468085106382979, "step": 380}, {"loss": 1.8179, "grad_norm": 0.3817657232284546, "learning_rate": 0.0002, "epoch": 0.6638297872340425, "step": 390}, {"loss": 1.8174, "grad_norm": 0.3650810122489929, "learning_rate": 0.0002, "epoch": 0.6808510638297872, "step": 400}, {"loss": 1.8026, "grad_norm": 0.3736686408519745, "learning_rate": 0.0002, "epoch": 0.6978723404255319, "step": 410}, {"loss": 1.802, "grad_norm": 0.45680564641952515, "learning_rate": 0.0002, "epoch": 0.7148936170212766, "step": 420}, {"loss": 1.7844, "grad_norm": 0.4154510200023651, "learning_rate": 0.0002, "epoch": 0.7319148936170212, "step": 430}, {"loss": 1.7801, "grad_norm": 0.3701167106628418, "learning_rate": 0.0002, "epoch": 0.7489361702127659, "step": 440}, {"loss": 1.7689, "grad_norm": 0.3869531750679016, "learning_rate": 0.0002, "epoch": 0.7659574468085106, "step": 450}, {"loss": 1.7671, "grad_norm": 0.4391495883464813, "learning_rate": 0.0002, "epoch": 0.7829787234042553, "step": 460}, {"loss": 1.7534, "grad_norm": 0.39652755856513977, "learning_rate": 0.0002, "epoch": 0.8, "step": 470}, {"loss": 1.8076, "grad_norm": 0.4096752107143402, "learning_rate": 0.0002, "epoch": 0.8170212765957446, "step": 480}, {"loss": 1.796, "grad_norm": 0.3857504427433014, "learning_rate": 0.0002, "epoch": 0.8340425531914893, "step": 490}, {"loss": 1.7379, "grad_norm": 0.4105374217033386, "learning_rate": 0.0002, "epoch": 0.851063829787234, "step": 500}, {"loss": 1.6391, "grad_norm": 0.3723328113555908, "learning_rate": 0.0002, "epoch": 0.8680851063829788, "step": 510}, {"loss": 1.7122, "grad_norm": 0.36099690198898315, "learning_rate": 0.0002, "epoch": 0.8851063829787233, "step": 520}, {"loss": 1.7632, "grad_norm": 0.3715187907218933, "learning_rate": 0.0002, "epoch": 0.902127659574468, "step": 530}, {"loss": 1.7004, "grad_norm": 0.4932813048362732, "learning_rate": 0.0002, "epoch": 0.9191489361702128, "step": 540}, {"loss": 1.679, "grad_norm": 0.3493495285511017, "learning_rate": 0.0002, "epoch": 0.9361702127659575, "step": 550}, {"loss": 1.7758, "grad_norm": 0.3598061800003052, "learning_rate": 0.0002, "epoch": 0.9531914893617022, "step": 560}, {"loss": 1.7686, "grad_norm": 0.3521560728549957, "learning_rate": 0.0002, "epoch": 0.9702127659574468, "step": 570}, {"loss": 1.7391, "grad_norm": 0.34150034189224243, "learning_rate": 0.0002, "epoch": 0.9872340425531915, "step": 580}, {"eval_loss": 1.8388911485671997, "eval_runtime": 106.6788, "eval_samples_per_second": 4.828, "eval_steps_per_second": 0.609, "epoch": 0.9991489361702127, "step": 587}, {"loss": 1.6115, "grad_norm": 0.33556103706359863, "learning_rate": 0.0002, "epoch": 1.004255319148936, "step": 590}, {"loss": 1.8346, "grad_norm": 0.4333398640155792, "learning_rate": 0.0002, "epoch": 1.0212765957446808, "step": 600}, {"loss": 1.6656, "grad_norm": 0.38488736748695374, "learning_rate": 0.0002, "epoch": 1.0382978723404255, "step": 610}, {"loss": 1.7778, "grad_norm": 0.44454529881477356, "learning_rate": 0.0002, "epoch": 1.0553191489361702, "step": 620}, {"loss": 1.7143, "grad_norm": 0.3735603392124176, "learning_rate": 0.0002, "epoch": 1.0723404255319149, "step": 630}, {"loss": 1.74, "grad_norm": 0.38912704586982727, "learning_rate": 0.0002, "epoch": 1.0893617021276596, "step": 640}, {"loss": 1.607, "grad_norm": 0.4411826431751251, "learning_rate": 0.0002, "epoch": 1.1063829787234043, "step": 650}, {"loss": 1.6901, "grad_norm": 0.4163050353527069, "learning_rate": 0.0002, "epoch": 1.123404255319149, "step": 660}, {"loss": 1.6722, "grad_norm": 0.4187192916870117, "learning_rate": 0.0002, "epoch": 1.1404255319148937, "step": 670}, {"loss": 1.7061, "grad_norm": 0.3797093629837036, "learning_rate": 0.0002, "epoch": 1.1574468085106382, "step": 680}, {"loss": 1.6409, "grad_norm": 0.4210026264190674, "learning_rate": 0.0002, "epoch": 1.174468085106383, "step": 690}, {"loss": 1.6343, "grad_norm": 0.4701998829841614, "learning_rate": 0.0002, "epoch": 1.1914893617021276, "step": 700}, {"loss": 1.6773, "grad_norm": 0.6331578493118286, "learning_rate": 0.0002, "epoch": 1.2085106382978723, "step": 710}, {"loss": 1.6776, "grad_norm": 0.41908255219459534, "learning_rate": 0.0002, "epoch": 1.225531914893617, "step": 720}, {"loss": 1.6468, "grad_norm": 0.36158403754234314, "learning_rate": 0.0002, "epoch": 1.2425531914893617, "step": 730}, {"loss": 1.746, "grad_norm": 0.387300580739975, "learning_rate": 0.0002, "epoch": 1.2595744680851064, "step": 740}, {"loss": 1.7089, "grad_norm": 0.38899728655815125, "learning_rate": 0.0002, "epoch": 1.2765957446808511, "step": 750}, {"loss": 1.6376, "grad_norm": 0.4549255073070526, "learning_rate": 0.0002, "epoch": 1.2936170212765958, "step": 760}, {"loss": 1.6754, "grad_norm": 0.4052349328994751, "learning_rate": 0.0002, "epoch": 1.3106382978723405, "step": 770}, {"loss": 1.6483, "grad_norm": 0.38934215903282166, "learning_rate": 0.0002, "epoch": 1.327659574468085, "step": 780}, {"loss": 1.7813, "grad_norm": 0.38688382506370544, "learning_rate": 0.0002, "epoch": 1.3446808510638297, "step": 790}, {"loss": 1.6374, "grad_norm": 0.3825705051422119, "learning_rate": 0.0002, "epoch": 1.3617021276595744, "step": 800}, {"loss": 1.655, "grad_norm": 0.37331756949424744, "learning_rate": 0.0002, "epoch": 1.3787234042553191, "step": 810}, {"loss": 1.6321, "grad_norm": 0.38826408982276917, "learning_rate": 0.0002, "epoch": 1.3957446808510638, "step": 820}, {"loss": 1.6107, "grad_norm": 0.4213569164276123, "learning_rate": 0.0002, "epoch": 1.4127659574468086, "step": 830}, {"loss": 1.6792, "grad_norm": 0.3976684808731079, "learning_rate": 0.0002, "epoch": 1.4297872340425533, "step": 840}, {"loss": 1.7029, "grad_norm": 0.390009343624115, "learning_rate": 0.0002, "epoch": 1.4468085106382977, "step": 850}, {"loss": 1.7489, "grad_norm": 0.4462052583694458, "learning_rate": 0.0002, "epoch": 1.4638297872340424, "step": 860}, {"loss": 1.7513, "grad_norm": 0.42129236459732056, "learning_rate": 0.0002, "epoch": 1.4808510638297872, "step": 870}, {"loss": 1.6009, "grad_norm": 0.41489893198013306, "learning_rate": 0.0002, "epoch": 1.4978723404255319, "step": 880}, {"loss": 1.7129, "grad_norm": 0.41451677680015564, "learning_rate": 0.0002, "epoch": 1.5148936170212766, "step": 890}, {"loss": 1.6427, "grad_norm": 0.4477299749851227, "learning_rate": 0.0002, "epoch": 1.5319148936170213, "step": 900}, {"loss": 1.6543, "grad_norm": 0.38476648926734924, "learning_rate": 0.0002, "epoch": 1.548936170212766, "step": 910}, {"loss": 1.7103, "grad_norm": 0.42755743861198425, "learning_rate": 0.0002, "epoch": 1.5659574468085107, "step": 920}, {"loss": 1.7014, "grad_norm": 0.39372023940086365, "learning_rate": 0.0002, "epoch": 1.5829787234042554, "step": 930}, {"loss": 1.7042, "grad_norm": 0.42778754234313965, "learning_rate": 0.0002, "epoch": 1.6, "step": 940}, {"loss": 1.7488, "grad_norm": 0.4217268228530884, "learning_rate": 0.0002, "epoch": 1.6170212765957448, "step": 950}, {"loss": 1.6556, "grad_norm": 0.40452107787132263, "learning_rate": 0.0002, "epoch": 1.6340425531914895, "step": 960}, {"loss": 1.6734, "grad_norm": 0.4259980022907257, "learning_rate": 0.0002, "epoch": 1.6510638297872342, "step": 970}, {"loss": 1.6464, "grad_norm": 0.4089849591255188, "learning_rate": 0.0002, "epoch": 1.6680851063829787, "step": 980}, {"loss": 1.6588, "grad_norm": 0.38276049494743347, "learning_rate": 0.0002, "epoch": 1.6851063829787234, "step": 990}, {"loss": 1.6846, "grad_norm": 0.40361565351486206, "learning_rate": 0.0002, "epoch": 1.702127659574468, "step": 1000}, {"loss": 1.698, "grad_norm": 0.3537807762622833, "learning_rate": 0.0002, "epoch": 1.7191489361702128, "step": 1010}, {"loss": 1.6001, "grad_norm": 0.40288347005844116, "learning_rate": 0.0002, "epoch": 1.7361702127659573, "step": 1020}, {"loss": 1.6375, "grad_norm": 0.4003616273403168, "learning_rate": 0.0002, "epoch": 1.753191489361702, "step": 1030}, {"loss": 1.6067, "grad_norm": 0.3931669592857361, "learning_rate": 0.0002, "epoch": 1.7702127659574467, "step": 1040}, {"loss": 1.6814, "grad_norm": 0.4001635015010834, "learning_rate": 0.0002, "epoch": 1.7872340425531914, "step": 1050}, {"loss": 1.6158, "grad_norm": 0.4139048457145691, "learning_rate": 0.0002, "epoch": 1.804255319148936, "step": 1060}, {"loss": 1.7513, "grad_norm": 0.5044458508491516, "learning_rate": 0.0002, "epoch": 1.8212765957446808, "step": 1070}, {"loss": 1.7317, "grad_norm": 0.4827095568180084, "learning_rate": 0.0002, "epoch": 1.8382978723404255, "step": 1080}, {"loss": 1.6586, "grad_norm": 0.3750515282154083, "learning_rate": 0.0002, "epoch": 1.8553191489361702, "step": 1090}, {"loss": 1.653, "grad_norm": 0.4024597704410553, "learning_rate": 0.0002, "epoch": 1.872340425531915, "step": 1100}, {"loss": 1.7138, "grad_norm": 0.36747241020202637, "learning_rate": 0.0002, "epoch": 1.8893617021276596, "step": 1110}, {"loss": 1.652, "grad_norm": 0.41397711634635925, "learning_rate": 0.0002, "epoch": 1.9063829787234043, "step": 1120}, {"loss": 1.7071, "grad_norm": 0.3960763216018677, "learning_rate": 0.0002, "epoch": 1.923404255319149, "step": 1130}, {"loss": 1.6857, "grad_norm": 0.4533233344554901, "learning_rate": 0.0002, "epoch": 1.9404255319148938, "step": 1140}, {"loss": 1.7168, "grad_norm": 0.38433438539505005, "learning_rate": 0.0002, "epoch": 1.9574468085106385, "step": 1150}, {"loss": 1.7444, "grad_norm": 0.3648812174797058, "learning_rate": 0.0002, "epoch": 1.974468085106383, "step": 1160}, {"loss": 1.6521, "grad_norm": 0.3887176215648651, "learning_rate": 0.0002, "epoch": 1.9914893617021276, "step": 1170}, {"eval_loss": 1.8328146934509277, "eval_runtime": 107.2842, "eval_samples_per_second": 4.8, "eval_steps_per_second": 0.606, "epoch": 2.0, "step": 1175}, {"loss": 1.6184, "grad_norm": 0.40444880723953247, "learning_rate": 0.0002, "epoch": 2.008510638297872, "step": 1180}, {"loss": 1.5221, "grad_norm": 0.3997816741466522, "learning_rate": 0.0002, "epoch": 2.025531914893617, "step": 1190}, {"loss": 1.6022, "grad_norm": 0.4516718089580536, "learning_rate": 0.0002, "epoch": 2.0425531914893615, "step": 1200}, {"loss": 1.5458, "grad_norm": 0.6645553708076477, "learning_rate": 0.0002, "epoch": 2.0595744680851062, "step": 1210}, {"loss": 1.5985, "grad_norm": 0.4181990921497345, "learning_rate": 0.0002, "epoch": 2.076595744680851, "step": 1220}, {"loss": 1.5768, "grad_norm": 0.45681431889533997, "learning_rate": 0.0002, "epoch": 2.0936170212765957, "step": 1230}, {"loss": 1.5598, "grad_norm": 0.48914700746536255, "learning_rate": 0.0002, "epoch": 2.1106382978723404, "step": 1240}, {"loss": 1.6031, "grad_norm": 0.43265485763549805, "learning_rate": 0.0002, "epoch": 2.127659574468085, "step": 1250}, {"loss": 1.5812, "grad_norm": 0.4641207754611969, "learning_rate": 0.0002, "epoch": 2.1446808510638298, "step": 1260}, {"loss": 1.5686, "grad_norm": 0.4840783476829529, "learning_rate": 0.0002, "epoch": 2.1617021276595745, "step": 1270}, {"loss": 1.5969, "grad_norm": 0.4974595308303833, "learning_rate": 0.0002, "epoch": 2.178723404255319, "step": 1280}, {"loss": 1.5512, "grad_norm": 0.5133475661277771, "learning_rate": 0.0002, "epoch": 2.195744680851064, "step": 1290}, {"loss": 1.6467, "grad_norm": 0.5030052065849304, "learning_rate": 0.0002, "epoch": 2.2127659574468086, "step": 1300}, {"loss": 1.6845, "grad_norm": 0.46602481603622437, "learning_rate": 0.0002, "epoch": 2.2297872340425533, "step": 1310}, {"loss": 1.6132, "grad_norm": 0.43662378191947937, "learning_rate": 0.0002, "epoch": 2.246808510638298, "step": 1320}, {"loss": 1.6223, "grad_norm": 0.5137454867362976, "learning_rate": 0.0002, "epoch": 2.2638297872340427, "step": 1330}, {"loss": 1.5702, "grad_norm": 0.4750335216522217, "learning_rate": 0.0002, "epoch": 2.2808510638297874, "step": 1340}, {"loss": 1.6479, "grad_norm": 0.43691426515579224, "learning_rate": 0.0002, "epoch": 2.297872340425532, "step": 1350}, {"loss": 1.5561, "grad_norm": 0.49752047657966614, "learning_rate": 0.0002, "epoch": 2.3148936170212764, "step": 1360}, {"loss": 1.5789, "grad_norm": 0.45101815462112427, "learning_rate": 0.0002, "epoch": 2.331914893617021, "step": 1370}, {"loss": 1.5858, "grad_norm": 0.4427817761898041, "learning_rate": 0.0002, "epoch": 2.348936170212766, "step": 1380}, {"loss": 1.5537, "grad_norm": 0.4802311062812805, "learning_rate": 0.0002, "epoch": 2.3659574468085105, "step": 1390}, {"loss": 1.5846, "grad_norm": 0.4512513279914856, "learning_rate": 0.0002, "epoch": 2.382978723404255, "step": 1400}, {"loss": 1.627, "grad_norm": 0.4878857135772705, "learning_rate": 0.0002, "epoch": 2.4, "step": 1410}, {"loss": 1.5781, "grad_norm": 0.4741315543651581, "learning_rate": 0.0002, "epoch": 2.4170212765957446, "step": 1420}, {"loss": 1.5595, "grad_norm": 0.4770931601524353, "learning_rate": 0.0002, "epoch": 2.4340425531914893, "step": 1430}, {"loss": 1.5336, "grad_norm": 0.5124667286872864, "learning_rate": 0.0002, "epoch": 2.451063829787234, "step": 1440}, {"loss": 1.5811, "grad_norm": 0.45264801383018494, "learning_rate": 0.0002, "epoch": 2.4680851063829787, "step": 1450}, {"loss": 1.604, "grad_norm": 0.5456924438476562, "learning_rate": 0.0002, "epoch": 2.4851063829787234, "step": 1460}, {"loss": 1.6063, "grad_norm": 0.44656285643577576, "learning_rate": 0.0002, "epoch": 2.502127659574468, "step": 1470}, {"loss": 1.6336, "grad_norm": 0.5939419865608215, "learning_rate": 0.0002, "epoch": 2.519148936170213, "step": 1480}, {"loss": 1.5481, "grad_norm": 0.47853362560272217, "learning_rate": 0.0002, "epoch": 2.5361702127659576, "step": 1490}, {"loss": 1.6543, "grad_norm": 0.47643396258354187, "learning_rate": 0.0002, "epoch": 2.5531914893617023, "step": 1500}, {"loss": 1.5085, "grad_norm": 0.4939501881599426, "learning_rate": 0.0002, "epoch": 2.570212765957447, "step": 1510}, {"loss": 1.5659, "grad_norm": 0.502055287361145, "learning_rate": 0.0002, "epoch": 2.5872340425531917, "step": 1520}, {"loss": 1.6519, "grad_norm": 0.463250994682312, "learning_rate": 0.0002, "epoch": 2.604255319148936, "step": 1530}, {"loss": 1.5698, "grad_norm": 0.4761098623275757, "learning_rate": 0.0002, "epoch": 2.621276595744681, "step": 1540}, {"loss": 1.6044, "grad_norm": 0.4687299132347107, "learning_rate": 0.0002, "epoch": 2.6382978723404253, "step": 1550}, {"loss": 1.632, "grad_norm": 0.5536078810691833, "learning_rate": 0.0002, "epoch": 2.65531914893617, "step": 1560}, {"loss": 1.586, "grad_norm": 0.581320583820343, "learning_rate": 0.0002, "epoch": 2.6723404255319148, "step": 1570}, {"loss": 1.508, "grad_norm": 0.45952868461608887, "learning_rate": 0.0002, "epoch": 2.6893617021276595, "step": 1580}, {"loss": 1.5905, "grad_norm": 0.4602586328983307, "learning_rate": 0.0002, "epoch": 2.706382978723404, "step": 1590}, {"loss": 1.6008, "grad_norm": 0.5276554226875305, "learning_rate": 0.0002, "epoch": 2.723404255319149, "step": 1600}, {"loss": 1.7216, "grad_norm": 0.5750249624252319, "learning_rate": 0.0002, "epoch": 2.7404255319148936, "step": 1610}, {"loss": 1.5506, "grad_norm": 0.468723863363266, "learning_rate": 0.0002, "epoch": 2.7574468085106383, "step": 1620}, {"loss": 1.4499, "grad_norm": 0.44649943709373474, "learning_rate": 0.0002, "epoch": 2.774468085106383, "step": 1630}, {"loss": 1.5106, "grad_norm": 0.5097237825393677, "learning_rate": 0.0002, "epoch": 2.7914893617021277, "step": 1640}, {"loss": 1.5948, "grad_norm": 0.46384191513061523, "learning_rate": 0.0002, "epoch": 2.8085106382978724, "step": 1650}, {"loss": 1.5828, "grad_norm": 0.4885474443435669, "learning_rate": 0.0002, "epoch": 2.825531914893617, "step": 1660}, {"loss": 1.5675, "grad_norm": 0.45621681213378906, "learning_rate": 0.0002, "epoch": 2.842553191489362, "step": 1670}, {"loss": 1.6042, "grad_norm": 0.4797150194644928, "learning_rate": 0.0002, "epoch": 2.8595744680851065, "step": 1680}, {"loss": 1.5601, "grad_norm": 0.5142032504081726, "learning_rate": 0.0002, "epoch": 2.876595744680851, "step": 1690}, {"loss": 1.5984, "grad_norm": 0.48939862847328186, "learning_rate": 0.0002, "epoch": 2.8936170212765955, "step": 1700}, {"loss": 1.6333, "grad_norm": 0.4575578272342682, "learning_rate": 0.0002, "epoch": 2.9106382978723406, "step": 1710}, {"loss": 1.5396, "grad_norm": 0.5589063763618469, "learning_rate": 0.0002, "epoch": 2.927659574468085, "step": 1720}, {"loss": 1.6096, "grad_norm": 0.48508813977241516, "learning_rate": 0.0002, "epoch": 2.94468085106383, "step": 1730}, {"loss": 1.5686, "grad_norm": 0.42786726355552673, "learning_rate": 0.0002, "epoch": 2.9617021276595743, "step": 1740}, {"loss": 1.5555, "grad_norm": 0.5598229765892029, "learning_rate": 0.0002, "epoch": 2.978723404255319, "step": 1750}, {"loss": 1.5035, "grad_norm": 0.4779253602027893, "learning_rate": 0.0002, "epoch": 2.9957446808510637, "step": 1760}, {"eval_loss": 1.8543579578399658, "eval_runtime": 107.2363, "eval_samples_per_second": 4.802, "eval_steps_per_second": 0.606, "epoch": 2.999148936170213, "step": 1762}, {"loss": 1.4767, "grad_norm": 0.48810940980911255, "learning_rate": 0.0002, "epoch": 3.0127659574468084, "step": 1770}, {"loss": 1.5385, "grad_norm": 0.6194920539855957, "learning_rate": 0.0002, "epoch": 3.029787234042553, "step": 1780}, {"loss": 1.4012, "grad_norm": 0.5875462293624878, "learning_rate": 0.0002, "epoch": 3.046808510638298, "step": 1790}, {"loss": 1.4727, "grad_norm": 0.5775138139724731, "learning_rate": 0.0002, "epoch": 3.0638297872340425, "step": 1800}, {"loss": 1.493, "grad_norm": 0.5445981621742249, "learning_rate": 0.0002, "epoch": 3.0808510638297872, "step": 1810}, {"loss": 1.4247, "grad_norm": 0.6728862524032593, "learning_rate": 0.0002, "epoch": 3.097872340425532, "step": 1820}, {"loss": 1.4303, "grad_norm": 0.6105490326881409, "learning_rate": 0.0002, "epoch": 3.1148936170212767, "step": 1830}, {"loss": 1.5214, "grad_norm": 0.5771165490150452, "learning_rate": 0.0002, "epoch": 3.1319148936170214, "step": 1840}, {"loss": 1.4359, "grad_norm": 0.5778449773788452, "learning_rate": 0.0002, "epoch": 3.148936170212766, "step": 1850}, {"loss": 1.4121, "grad_norm": 0.7141990661621094, "learning_rate": 0.0002, "epoch": 3.1659574468085108, "step": 1860}, {"loss": 1.4904, "grad_norm": 0.5882705450057983, "learning_rate": 0.0002, "epoch": 3.1829787234042555, "step": 1870}, {"loss": 1.4941, "grad_norm": 0.5996195077896118, "learning_rate": 0.0002, "epoch": 3.2, "step": 1880}, {"loss": 1.4519, "grad_norm": 0.6121219396591187, "learning_rate": 0.0002, "epoch": 3.217021276595745, "step": 1890}, {"loss": 1.4586, "grad_norm": 0.6402981281280518, "learning_rate": 0.0002, "epoch": 3.2340425531914896, "step": 1900}, {"loss": 1.3766, "grad_norm": 0.6111783981323242, "learning_rate": 0.0002, "epoch": 3.251063829787234, "step": 1910}, {"loss": 1.4863, "grad_norm": 0.6682435274124146, "learning_rate": 0.0002, "epoch": 3.2680851063829786, "step": 1920}, {"loss": 1.4608, "grad_norm": 0.6530760526657104, "learning_rate": 0.0002, "epoch": 3.2851063829787233, "step": 1930}, {"loss": 1.4422, "grad_norm": 0.6481217741966248, "learning_rate": 0.0002, "epoch": 3.302127659574468, "step": 1940}, {"loss": 1.5158, "grad_norm": 0.6270697116851807, "learning_rate": 0.0002, "epoch": 3.3191489361702127, "step": 1950}, {"loss": 1.4116, "grad_norm": 0.5924492478370667, "learning_rate": 0.0002, "epoch": 3.3361702127659574, "step": 1960}, {"loss": 1.4578, "grad_norm": 0.5803806781768799, "learning_rate": 0.0002, "epoch": 3.353191489361702, "step": 1970}, {"loss": 1.4689, "grad_norm": 0.5754119157791138, "learning_rate": 0.0002, "epoch": 3.370212765957447, "step": 1980}, {"loss": 1.4605, "grad_norm": 0.6717178821563721, "learning_rate": 0.0002, "epoch": 3.3872340425531915, "step": 1990}, {"loss": 1.486, "grad_norm": 0.5955582857131958, "learning_rate": 0.0002, "epoch": 3.404255319148936, "step": 2000}, {"loss": 1.4445, "grad_norm": 0.6965329647064209, "learning_rate": 0.0002, "epoch": 3.421276595744681, "step": 2010}, {"loss": 1.4543, "grad_norm": 0.6321573257446289, "learning_rate": 0.0002, "epoch": 3.4382978723404256, "step": 2020}, {"loss": 1.5383, "grad_norm": 0.5952608585357666, "learning_rate": 0.0002, "epoch": 3.4553191489361703, "step": 2030}, {"loss": 1.4531, "grad_norm": 0.7718905806541443, "learning_rate": 0.0002, "epoch": 3.472340425531915, "step": 2040}, {"loss": 1.4678, "grad_norm": 0.6850892305374146, "learning_rate": 0.0002, "epoch": 3.4893617021276597, "step": 2050}, {"loss": 1.4956, "grad_norm": 0.5638895630836487, "learning_rate": 0.0002, "epoch": 3.506382978723404, "step": 2060}, {"loss": 1.4586, "grad_norm": 0.6148294806480408, "learning_rate": 0.0002, "epoch": 3.523404255319149, "step": 2070}, {"loss": 1.4622, "grad_norm": 0.5895810723304749, "learning_rate": 0.0002, "epoch": 3.5404255319148934, "step": 2080}, {"loss": 1.4341, "grad_norm": 0.6377319693565369, "learning_rate": 0.0002, "epoch": 3.5574468085106385, "step": 2090}, {"loss": 1.5056, "grad_norm": 0.6047691702842712, "learning_rate": 0.0002, "epoch": 3.574468085106383, "step": 2100}, {"loss": 1.4748, "grad_norm": 0.6049593687057495, "learning_rate": 0.0002, "epoch": 3.5914893617021275, "step": 2110}, {"loss": 1.391, "grad_norm": 0.6358312368392944, "learning_rate": 0.0002, "epoch": 3.608510638297872, "step": 2120}, {"loss": 1.4419, "grad_norm": 0.612119197845459, "learning_rate": 0.0002, "epoch": 3.625531914893617, "step": 2130}, {"loss": 1.438, "grad_norm": 0.6788054704666138, "learning_rate": 0.0002, "epoch": 3.6425531914893616, "step": 2140}, {"loss": 1.4295, "grad_norm": 0.6191043853759766, "learning_rate": 0.0002, "epoch": 3.6595744680851063, "step": 2150}, {"loss": 1.4383, "grad_norm": 0.6660051941871643, "learning_rate": 0.0002, "epoch": 3.676595744680851, "step": 2160}, {"loss": 1.4954, "grad_norm": 0.652692973613739, "learning_rate": 0.0002, "epoch": 3.6936170212765957, "step": 2170}, {"loss": 1.5245, "grad_norm": 0.6123467087745667, "learning_rate": 0.0002, "epoch": 3.7106382978723405, "step": 2180}, {"loss": 1.4686, "grad_norm": 0.640021562576294, "learning_rate": 0.0002, "epoch": 3.727659574468085, "step": 2190}, {"loss": 1.4277, "grad_norm": 0.6809179782867432, "learning_rate": 0.0002, "epoch": 3.74468085106383, "step": 2200}, {"loss": 1.4705, "grad_norm": 0.5978420376777649, "learning_rate": 0.0002, "epoch": 3.7617021276595746, "step": 2210}, {"loss": 1.5559, "grad_norm": 0.7038803100585938, "learning_rate": 0.0002, "epoch": 3.7787234042553193, "step": 2220}, {"loss": 1.4691, "grad_norm": 0.5324276089668274, "learning_rate": 0.0002, "epoch": 3.795744680851064, "step": 2230}, {"loss": 1.4696, "grad_norm": 0.6264132857322693, "learning_rate": 0.0002, "epoch": 3.8127659574468087, "step": 2240}, {"loss": 1.4856, "grad_norm": 0.6143888831138611, "learning_rate": 0.0002, "epoch": 3.829787234042553, "step": 2250}, {"loss": 1.535, "grad_norm": 0.6338503360748291, "learning_rate": 0.0002, "epoch": 3.846808510638298, "step": 2260}, {"loss": 1.456, "grad_norm": 0.556882381439209, "learning_rate": 0.0002, "epoch": 3.8638297872340424, "step": 2270}, {"loss": 1.4701, "grad_norm": 0.6323680281639099, "learning_rate": 0.0002, "epoch": 3.8808510638297875, "step": 2280}, {"loss": 1.5333, "grad_norm": 0.7105869054794312, "learning_rate": 0.0002, "epoch": 3.8978723404255318, "step": 2290}, {"loss": 1.4462, "grad_norm": 0.825415849685669, "learning_rate": 0.0002, "epoch": 3.9148936170212765, "step": 2300}, {"loss": 1.5023, "grad_norm": 0.6412091851234436, "learning_rate": 0.0002, "epoch": 3.931914893617021, "step": 2310}, {"loss": 1.3709, "grad_norm": 0.6286490559577942, "learning_rate": 0.0002, "epoch": 3.948936170212766, "step": 2320}, {"loss": 1.4693, "grad_norm": 0.636021077632904, "learning_rate": 0.0002, "epoch": 3.9659574468085106, "step": 2330}, {"loss": 1.4265, "grad_norm": 0.6032362580299377, "learning_rate": 0.0002, "epoch": 3.9829787234042553, "step": 2340}, {"loss": 1.377, "grad_norm": 0.6497282385826111, "learning_rate": 0.0002, "epoch": 4.0, "step": 2350}, {"eval_loss": 1.9081238508224487, "eval_runtime": 106.6404, "eval_samples_per_second": 4.829, "eval_steps_per_second": 0.61, "epoch": 4.0, "step": 2350}, {"loss": 1.317, "grad_norm": 0.6278848648071289, "learning_rate": 0.0002, "epoch": 4.017021276595744, "step": 2360}, {"loss": 1.3229, "grad_norm": 0.8259812593460083, "learning_rate": 0.0002, "epoch": 4.034042553191489, "step": 2370}, {"loss": 1.2776, "grad_norm": 0.7269589304924011, "learning_rate": 0.0002, "epoch": 4.051063829787234, "step": 2380}, {"loss": 1.3668, "grad_norm": 0.7460662126541138, "learning_rate": 0.0002, "epoch": 4.068085106382979, "step": 2390}, {"loss": 1.3096, "grad_norm": 1.2362046241760254, "learning_rate": 0.0002, "epoch": 4.085106382978723, "step": 2400}, {"loss": 1.2906, "grad_norm": 0.7699568867683411, "learning_rate": 0.0002, "epoch": 4.102127659574468, "step": 2410}, {"loss": 1.3005, "grad_norm": 0.8732489347457886, "learning_rate": 0.0002, "epoch": 4.1191489361702125, "step": 2420}, {"loss": 1.2741, "grad_norm": 0.8331889510154724, "learning_rate": 0.0002, "epoch": 4.136170212765958, "step": 2430}, {"loss": 1.1861, "grad_norm": 0.6686427593231201, "learning_rate": 0.0002, "epoch": 4.153191489361702, "step": 2440}, {"loss": 1.316, "grad_norm": 0.906380832195282, "learning_rate": 0.0002, "epoch": 4.170212765957447, "step": 2450}, {"loss": 1.3134, "grad_norm": 0.7269753813743591, "learning_rate": 0.0002, "epoch": 4.187234042553191, "step": 2460}, {"loss": 1.299, "grad_norm": 0.8556067943572998, "learning_rate": 0.0002, "epoch": 4.2042553191489365, "step": 2470}, {"loss": 1.2935, "grad_norm": 0.7076917886734009, "learning_rate": 0.0002, "epoch": 4.221276595744681, "step": 2480}, {"loss": 1.2608, "grad_norm": 0.7596837282180786, "learning_rate": 0.0002, "epoch": 4.238297872340426, "step": 2490}, {"loss": 1.2747, "grad_norm": 0.7790552377700806, "learning_rate": 0.0002, "epoch": 4.25531914893617, "step": 2500}, {"loss": 1.3438, "grad_norm": 0.8205534219741821, "learning_rate": 0.0002, "epoch": 4.272340425531915, "step": 2510}, {"loss": 1.3058, "grad_norm": 0.7892114520072937, "learning_rate": 0.0002, "epoch": 4.2893617021276595, "step": 2520}, {"loss": 1.3662, "grad_norm": 0.8907270431518555, "learning_rate": 0.0002, "epoch": 4.306382978723404, "step": 2530}, {"loss": 1.3168, "grad_norm": 0.821794331073761, "learning_rate": 0.0002, "epoch": 4.323404255319149, "step": 2540}, {"loss": 1.2467, "grad_norm": 0.7305247783660889, "learning_rate": 0.0002, "epoch": 4.340425531914893, "step": 2550}, {"loss": 1.3446, "grad_norm": 0.8639982342720032, "learning_rate": 0.0002, "epoch": 4.357446808510638, "step": 2560}, {"loss": 1.3863, "grad_norm": 0.8883494138717651, "learning_rate": 0.0002, "epoch": 4.374468085106383, "step": 2570}, {"loss": 1.3693, "grad_norm": 0.7611730098724365, "learning_rate": 0.0002, "epoch": 4.391489361702128, "step": 2580}, {"loss": 1.2814, "grad_norm": 0.7793022394180298, "learning_rate": 0.0002, "epoch": 4.408510638297872, "step": 2590}, {"loss": 1.3014, "grad_norm": 0.979060173034668, "learning_rate": 0.0002, "epoch": 4.425531914893617, "step": 2600}, {"loss": 1.3625, "grad_norm": 0.8320847749710083, "learning_rate": 0.0002, "epoch": 4.4425531914893615, "step": 2610}, {"loss": 1.3362, "grad_norm": 0.7481992244720459, "learning_rate": 0.0002, "epoch": 4.459574468085107, "step": 2620}, {"loss": 1.4037, "grad_norm": 0.783770740032196, "learning_rate": 0.0002, "epoch": 4.476595744680851, "step": 2630}, {"loss": 1.3049, "grad_norm": 0.773295521736145, "learning_rate": 0.0002, "epoch": 4.493617021276596, "step": 2640}, {"loss": 1.2739, "grad_norm": 0.9206840991973877, "learning_rate": 0.0002, "epoch": 4.51063829787234, "step": 2650}, {"loss": 1.3248, "grad_norm": 0.8803266882896423, "learning_rate": 0.0002, "epoch": 4.527659574468085, "step": 2660}, {"loss": 1.2898, "grad_norm": 0.9315535426139832, "learning_rate": 0.0002, "epoch": 4.54468085106383, "step": 2670}, {"loss": 1.316, "grad_norm": 0.8610678315162659, "learning_rate": 0.0002, "epoch": 4.561702127659575, "step": 2680}, {"loss": 1.2633, "grad_norm": 0.7405551671981812, "learning_rate": 0.0002, "epoch": 4.578723404255319, "step": 2690}, {"loss": 1.3136, "grad_norm": 1.0238394737243652, "learning_rate": 0.0002, "epoch": 4.595744680851064, "step": 2700}, {"loss": 1.4847, "grad_norm": 0.7814345955848694, "learning_rate": 0.0002, "epoch": 4.6127659574468085, "step": 2710}, {"loss": 1.295, "grad_norm": 0.8436329364776611, "learning_rate": 0.0002, "epoch": 4.629787234042553, "step": 2720}, {"loss": 1.3525, "grad_norm": 0.727214515209198, "learning_rate": 0.0002, "epoch": 4.646808510638298, "step": 2730}, {"loss": 1.3878, "grad_norm": 0.8465878367424011, "learning_rate": 0.0002, "epoch": 4.663829787234042, "step": 2740}, {"loss": 1.278, "grad_norm": 0.8218137621879578, "learning_rate": 0.0002, "epoch": 4.680851063829787, "step": 2750}, {"loss": 1.3628, "grad_norm": 0.7900442481040955, "learning_rate": 0.0002, "epoch": 4.697872340425532, "step": 2760}, {"loss": 1.3494, "grad_norm": 0.8214074969291687, "learning_rate": 0.0002, "epoch": 4.714893617021277, "step": 2770}, {"loss": 1.3954, "grad_norm": 0.7509574890136719, "learning_rate": 0.0002, "epoch": 4.731914893617021, "step": 2780}, {"loss": 1.3693, "grad_norm": 0.7416139245033264, "learning_rate": 0.0002, "epoch": 4.748936170212766, "step": 2790}, {"loss": 1.3045, "grad_norm": 0.8629977107048035, "learning_rate": 0.0002, "epoch": 4.76595744680851, "step": 2800}, {"loss": 1.3164, "grad_norm": 0.8056505918502808, "learning_rate": 0.0002, "epoch": 4.782978723404256, "step": 2810}, {"loss": 1.3056, "grad_norm": 0.7705401182174683, "learning_rate": 0.0002, "epoch": 4.8, "step": 2820}, {"loss": 1.3771, "grad_norm": 1.0173288583755493, "learning_rate": 0.0002, "epoch": 4.817021276595745, "step": 2830}, {"loss": 1.3494, "grad_norm": 0.8375823497772217, "learning_rate": 0.0002, "epoch": 4.834042553191489, "step": 2840}, {"loss": 1.3238, "grad_norm": 0.857073187828064, "learning_rate": 0.0002, "epoch": 4.851063829787234, "step": 2850}, {"loss": 1.2964, "grad_norm": 0.8672189712524414, "learning_rate": 0.0002, "epoch": 4.868085106382979, "step": 2860}, {"loss": 1.3646, "grad_norm": 0.8599910140037537, "learning_rate": 0.0002, "epoch": 4.885106382978723, "step": 2870}, {"loss": 1.3575, "grad_norm": 0.8844674229621887, "learning_rate": 0.0002, "epoch": 4.902127659574468, "step": 2880}, {"loss": 1.285, "grad_norm": 0.8246751427650452, "learning_rate": 0.0002, "epoch": 4.919148936170213, "step": 2890}, {"loss": 1.4116, "grad_norm": 0.8648163676261902, "learning_rate": 0.0002, "epoch": 4.9361702127659575, "step": 2900}, {"loss": 1.2614, "grad_norm": 0.9477900266647339, "learning_rate": 0.0002, "epoch": 4.953191489361702, "step": 2910}, {"loss": 1.3519, "grad_norm": 0.8047965168952942, "learning_rate": 0.0002, "epoch": 4.970212765957447, "step": 2920}, {"loss": 1.3889, "grad_norm": 0.9872494339942932, "learning_rate": 0.0002, "epoch": 4.987234042553191, "step": 2930}, {"eval_loss": 1.9836769104003906, "eval_runtime": 106.4655, "eval_samples_per_second": 4.837, "eval_steps_per_second": 0.611, "epoch": 4.999148936170212, "step": 2937}, {"loss": 1.2574, "grad_norm": 0.7292938828468323, "learning_rate": 0.0002, "epoch": 5.004255319148936, "step": 2940}, {"loss": 1.1312, "grad_norm": 0.8610548973083496, "learning_rate": 0.0002, "epoch": 5.0212765957446805, "step": 2950}, {"loss": 1.1105, "grad_norm": 0.8384576439857483, "learning_rate": 0.0002, "epoch": 5.038297872340426, "step": 2960}, {"loss": 1.1412, "grad_norm": 0.9746620059013367, "learning_rate": 0.0002, "epoch": 5.05531914893617, "step": 2970}, {"loss": 1.1687, "grad_norm": 0.8879048228263855, "learning_rate": 0.0002, "epoch": 5.072340425531915, "step": 2980}, {"loss": 1.1333, "grad_norm": 0.9006168246269226, "learning_rate": 0.0002, "epoch": 5.089361702127659, "step": 2990}, {"loss": 1.1659, "grad_norm": 0.9770249128341675, "learning_rate": 0.0002, "epoch": 5.1063829787234045, "step": 3000}, {"loss": 1.1334, "grad_norm": 1.267967939376831, "learning_rate": 0.0002, "epoch": 5.123404255319149, "step": 3010}, {"loss": 1.2095, "grad_norm": 0.9857587218284607, "learning_rate": 0.0002, "epoch": 5.140425531914894, "step": 3020}, {"loss": 1.0889, "grad_norm": 1.2938690185546875, "learning_rate": 0.0002, "epoch": 5.157446808510638, "step": 3030}, {"loss": 1.1645, "grad_norm": 0.8928244113922119, "learning_rate": 0.0002, "epoch": 5.174468085106383, "step": 3040}, {"loss": 1.1553, "grad_norm": 1.1087630987167358, "learning_rate": 0.0002, "epoch": 5.191489361702128, "step": 3050}, {"loss": 1.1416, "grad_norm": 0.9431360960006714, "learning_rate": 0.0002, "epoch": 5.208510638297873, "step": 3060}, {"loss": 1.1635, "grad_norm": 1.2048338651657104, "learning_rate": 0.0002, "epoch": 5.225531914893617, "step": 3070}, {"loss": 1.171, "grad_norm": 1.0017054080963135, "learning_rate": 0.0002, "epoch": 5.242553191489361, "step": 3080}, {"loss": 1.2212, "grad_norm": 1.2771434783935547, "learning_rate": 0.0002, "epoch": 5.259574468085106, "step": 3090}, {"loss": 1.1478, "grad_norm": 1.4307383298873901, "learning_rate": 0.0002, "epoch": 5.276595744680851, "step": 3100}, {"loss": 1.2132, "grad_norm": 1.2460752725601196, "learning_rate": 0.0002, "epoch": 5.293617021276596, "step": 3110}, {"loss": 1.235, "grad_norm": 1.693974494934082, "learning_rate": 0.0002, "epoch": 5.31063829787234, "step": 3120}, {"loss": 1.1961, "grad_norm": 0.9855408668518066, "learning_rate": 0.0002, "epoch": 5.327659574468085, "step": 3130}, {"loss": 1.2068, "grad_norm": 1.307521104812622, "learning_rate": 0.0002, "epoch": 5.3446808510638295, "step": 3140}, {"loss": 1.2144, "grad_norm": 0.957661509513855, "learning_rate": 0.0002, "epoch": 5.361702127659575, "step": 3150}, {"loss": 1.1305, "grad_norm": 0.870373010635376, "learning_rate": 0.0002, "epoch": 5.378723404255319, "step": 3160}, {"loss": 1.2196, "grad_norm": 0.9324309229850769, "learning_rate": 0.0002, "epoch": 5.395744680851064, "step": 3170}, {"loss": 1.1691, "grad_norm": 1.0142403841018677, "learning_rate": 0.0002, "epoch": 5.412765957446808, "step": 3180}, {"loss": 1.1788, "grad_norm": 0.9759578704833984, "learning_rate": 0.0002, "epoch": 5.4297872340425535, "step": 3190}, {"loss": 1.1321, "grad_norm": 0.9021993279457092, "learning_rate": 0.0002, "epoch": 5.446808510638298, "step": 3200}, {"loss": 1.2222, "grad_norm": 1.007728934288025, "learning_rate": 0.0002, "epoch": 5.463829787234043, "step": 3210}, {"loss": 1.1517, "grad_norm": 0.8969265222549438, "learning_rate": 0.0002, "epoch": 5.480851063829787, "step": 3220}, {"loss": 1.2061, "grad_norm": 0.9672483801841736, "learning_rate": 0.0002, "epoch": 5.497872340425532, "step": 3230}, {"loss": 1.1454, "grad_norm": 1.1417138576507568, "learning_rate": 0.0002, "epoch": 5.514893617021277, "step": 3240}, {"loss": 1.1871, "grad_norm": 0.9669530391693115, "learning_rate": 0.0002, "epoch": 5.531914893617021, "step": 3250}, {"loss": 1.1382, "grad_norm": 1.0161820650100708, "learning_rate": 0.0002, "epoch": 5.548936170212766, "step": 3260}, {"loss": 1.1708, "grad_norm": 0.9935774803161621, "learning_rate": 0.0002, "epoch": 5.565957446808511, "step": 3270}, {"loss": 1.1384, "grad_norm": 1.2572048902511597, "learning_rate": 0.0002, "epoch": 5.582978723404255, "step": 3280}, {"loss": 1.1711, "grad_norm": 0.9614662528038025, "learning_rate": 0.0002, "epoch": 5.6, "step": 3290}, {"loss": 1.219, "grad_norm": 0.9835584163665771, "learning_rate": 0.0002, "epoch": 5.617021276595745, "step": 3300}, {"loss": 1.2074, "grad_norm": 0.9387389421463013, "learning_rate": 0.0002, "epoch": 5.634042553191489, "step": 3310}, {"loss": 1.1148, "grad_norm": 0.9348428249359131, "learning_rate": 0.0002, "epoch": 5.651063829787234, "step": 3320}, {"loss": 1.2378, "grad_norm": 0.9636440873146057, "learning_rate": 0.0002, "epoch": 5.6680851063829785, "step": 3330}, {"loss": 1.2068, "grad_norm": 0.995894193649292, "learning_rate": 0.0002, "epoch": 5.685106382978724, "step": 3340}, {"loss": 1.1443, "grad_norm": 1.0357023477554321, "learning_rate": 0.0002, "epoch": 5.702127659574468, "step": 3350}, {"loss": 1.2209, "grad_norm": 1.0254428386688232, "learning_rate": 0.0002, "epoch": 5.719148936170213, "step": 3360}, {"loss": 1.1987, "grad_norm": 0.8993342518806458, "learning_rate": 0.0002, "epoch": 5.736170212765957, "step": 3370}, {"loss": 1.1527, "grad_norm": 0.9104585647583008, "learning_rate": 0.0002, "epoch": 5.753191489361702, "step": 3380}, {"loss": 1.2268, "grad_norm": 0.9555654525756836, "learning_rate": 0.0002, "epoch": 5.770212765957447, "step": 3390}, {"loss": 1.193, "grad_norm": 0.920124351978302, "learning_rate": 0.0002, "epoch": 5.787234042553192, "step": 3400}, {"loss": 1.2263, "grad_norm": 0.999706506729126, "learning_rate": 0.0002, "epoch": 5.804255319148936, "step": 3410}, {"loss": 1.1411, "grad_norm": 0.9292707443237305, "learning_rate": 0.0002, "epoch": 5.821276595744681, "step": 3420}, {"loss": 1.1507, "grad_norm": 1.0074706077575684, "learning_rate": 0.0002, "epoch": 5.8382978723404255, "step": 3430}, {"loss": 1.2709, "grad_norm": 1.0279479026794434, "learning_rate": 0.0002, "epoch": 5.85531914893617, "step": 3440}, {"loss": 1.1992, "grad_norm": 1.0026037693023682, "learning_rate": 0.0002, "epoch": 5.872340425531915, "step": 3450}, {"loss": 1.1416, "grad_norm": 1.0356525182724, "learning_rate": 0.0002, "epoch": 5.889361702127659, "step": 3460}, {"loss": 1.224, "grad_norm": 1.1106643676757812, "learning_rate": 0.0002, "epoch": 5.906382978723404, "step": 3470}, {"loss": 1.1955, "grad_norm": 0.9578408002853394, "learning_rate": 0.0002, "epoch": 5.923404255319149, "step": 3480}, {"loss": 1.2133, "grad_norm": 1.0225932598114014, "learning_rate": 0.0002, "epoch": 5.940425531914894, "step": 3490}, {"loss": 1.157, "grad_norm": 0.9677667021751404, "learning_rate": 0.0002, "epoch": 5.957446808510638, "step": 3500}, {"loss": 1.2196, "grad_norm": 1.0967241525650024, "learning_rate": 0.0002, "epoch": 5.974468085106383, "step": 3510}, {"loss": 1.1807, "grad_norm": 1.2497339248657227, "learning_rate": 0.0002, "epoch": 5.991489361702127, "step": 3520}]} +{"epoch": 6.999148936170212, "step": 4112, "epoch_duration": 2041.9944262504578, "total_accumulated_duration": 14293.37320804596, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5996, "grad_norm": 0.7596228122711182, "learning_rate": 0.0002, "epoch": 0.01702127659574468, "step": 10}, {"loss": 2.2941, "grad_norm": 0.4860903322696686, "learning_rate": 0.0002, "epoch": 0.03404255319148936, "step": 20}, {"loss": 2.0018, "grad_norm": 0.4953401982784271, "learning_rate": 0.0002, "epoch": 0.05106382978723404, "step": 30}, {"loss": 1.9318, "grad_norm": 0.5086901783943176, "learning_rate": 0.0002, "epoch": 0.06808510638297872, "step": 40}, {"loss": 1.8519, "grad_norm": 0.49050021171569824, "learning_rate": 0.0002, "epoch": 0.0851063829787234, "step": 50}, {"loss": 1.8786, "grad_norm": 0.4922358989715576, "learning_rate": 0.0002, "epoch": 0.10212765957446808, "step": 60}, {"loss": 1.8812, "grad_norm": 0.4621541202068329, "learning_rate": 0.0002, "epoch": 0.11914893617021277, "step": 70}, {"loss": 1.8004, "grad_norm": 0.4416729807853699, "learning_rate": 0.0002, "epoch": 0.13617021276595745, "step": 80}, {"loss": 1.9298, "grad_norm": 0.526258111000061, "learning_rate": 0.0002, "epoch": 0.15319148936170213, "step": 90}, {"loss": 1.8339, "grad_norm": 0.44022637605667114, "learning_rate": 0.0002, "epoch": 0.1702127659574468, "step": 100}, {"loss": 1.7681, "grad_norm": 0.4647711515426636, "learning_rate": 0.0002, "epoch": 0.18723404255319148, "step": 110}, {"loss": 1.6953, "grad_norm": 0.4136318564414978, "learning_rate": 0.0002, "epoch": 0.20425531914893616, "step": 120}, {"loss": 1.8491, "grad_norm": 0.39707672595977783, "learning_rate": 0.0002, "epoch": 0.22127659574468084, "step": 130}, {"loss": 1.8017, "grad_norm": 0.4478105306625366, "learning_rate": 0.0002, "epoch": 0.23829787234042554, "step": 140}, {"loss": 1.8067, "grad_norm": 0.4699741303920746, "learning_rate": 0.0002, "epoch": 0.2553191489361702, "step": 150}, {"loss": 1.8161, "grad_norm": 0.4568363130092621, "learning_rate": 0.0002, "epoch": 0.2723404255319149, "step": 160}, {"loss": 1.7491, "grad_norm": 0.45078757405281067, "learning_rate": 0.0002, "epoch": 0.28936170212765955, "step": 170}, {"loss": 1.8115, "grad_norm": 0.4127245843410492, "learning_rate": 0.0002, "epoch": 0.30638297872340425, "step": 180}, {"loss": 1.7707, "grad_norm": 0.4042493402957916, "learning_rate": 0.0002, "epoch": 0.32340425531914896, "step": 190}, {"loss": 1.7807, "grad_norm": 0.401487797498703, "learning_rate": 0.0002, "epoch": 0.3404255319148936, "step": 200}, {"loss": 1.8355, "grad_norm": 0.3959457576274872, "learning_rate": 0.0002, "epoch": 0.3574468085106383, "step": 210}, {"loss": 1.7798, "grad_norm": 0.39865636825561523, "learning_rate": 0.0002, "epoch": 0.37446808510638296, "step": 220}, {"loss": 1.9012, "grad_norm": 0.7225169539451599, "learning_rate": 0.0002, "epoch": 0.39148936170212767, "step": 230}, {"loss": 1.7212, "grad_norm": 0.412801593542099, "learning_rate": 0.0002, "epoch": 0.4085106382978723, "step": 240}, {"loss": 1.8523, "grad_norm": 0.40951448678970337, "learning_rate": 0.0002, "epoch": 0.425531914893617, "step": 250}, {"loss": 1.7283, "grad_norm": 0.42788130044937134, "learning_rate": 0.0002, "epoch": 0.4425531914893617, "step": 260}, {"loss": 1.8051, "grad_norm": 0.41069576144218445, "learning_rate": 0.0002, "epoch": 0.4595744680851064, "step": 270}, {"loss": 1.7861, "grad_norm": 0.3745323717594147, "learning_rate": 0.0002, "epoch": 0.4765957446808511, "step": 280}, {"loss": 1.8484, "grad_norm": 0.3771323263645172, "learning_rate": 0.0002, "epoch": 0.49361702127659574, "step": 290}, {"loss": 1.7417, "grad_norm": 0.34368929266929626, "learning_rate": 0.0002, "epoch": 0.5106382978723404, "step": 300}, {"loss": 1.7394, "grad_norm": 0.4299296736717224, "learning_rate": 0.0002, "epoch": 0.5276595744680851, "step": 310}, {"loss": 1.8255, "grad_norm": 0.4133922755718231, "learning_rate": 0.0002, "epoch": 0.5446808510638298, "step": 320}, {"loss": 1.7043, "grad_norm": 0.3984859585762024, "learning_rate": 0.0002, "epoch": 0.5617021276595745, "step": 330}, {"loss": 1.7873, "grad_norm": 0.3822788894176483, "learning_rate": 0.0002, "epoch": 0.5787234042553191, "step": 340}, {"loss": 1.8082, "grad_norm": 0.4550061821937561, "learning_rate": 0.0002, "epoch": 0.5957446808510638, "step": 350}, {"loss": 1.7179, "grad_norm": 0.36571192741394043, "learning_rate": 0.0002, "epoch": 0.6127659574468085, "step": 360}, {"loss": 1.8196, "grad_norm": 0.32942914962768555, "learning_rate": 0.0002, "epoch": 0.6297872340425532, "step": 370}, {"loss": 1.7118, "grad_norm": 0.39299526810646057, "learning_rate": 0.0002, "epoch": 0.6468085106382979, "step": 380}, {"loss": 1.8179, "grad_norm": 0.3817657232284546, "learning_rate": 0.0002, "epoch": 0.6638297872340425, "step": 390}, {"loss": 1.8174, "grad_norm": 0.3650810122489929, "learning_rate": 0.0002, "epoch": 0.6808510638297872, "step": 400}, {"loss": 1.8026, "grad_norm": 0.3736686408519745, "learning_rate": 0.0002, "epoch": 0.6978723404255319, "step": 410}, {"loss": 1.802, "grad_norm": 0.45680564641952515, "learning_rate": 0.0002, "epoch": 0.7148936170212766, "step": 420}, {"loss": 1.7844, "grad_norm": 0.4154510200023651, "learning_rate": 0.0002, "epoch": 0.7319148936170212, "step": 430}, {"loss": 1.7801, "grad_norm": 0.3701167106628418, "learning_rate": 0.0002, "epoch": 0.7489361702127659, "step": 440}, {"loss": 1.7689, "grad_norm": 0.3869531750679016, "learning_rate": 0.0002, "epoch": 0.7659574468085106, "step": 450}, {"loss": 1.7671, "grad_norm": 0.4391495883464813, "learning_rate": 0.0002, "epoch": 0.7829787234042553, "step": 460}, {"loss": 1.7534, "grad_norm": 0.39652755856513977, "learning_rate": 0.0002, "epoch": 0.8, "step": 470}, {"loss": 1.8076, "grad_norm": 0.4096752107143402, "learning_rate": 0.0002, "epoch": 0.8170212765957446, "step": 480}, {"loss": 1.796, "grad_norm": 0.3857504427433014, "learning_rate": 0.0002, "epoch": 0.8340425531914893, "step": 490}, {"loss": 1.7379, "grad_norm": 0.4105374217033386, "learning_rate": 0.0002, "epoch": 0.851063829787234, "step": 500}, {"loss": 1.6391, "grad_norm": 0.3723328113555908, "learning_rate": 0.0002, "epoch": 0.8680851063829788, "step": 510}, {"loss": 1.7122, "grad_norm": 0.36099690198898315, "learning_rate": 0.0002, "epoch": 0.8851063829787233, "step": 520}, {"loss": 1.7632, "grad_norm": 0.3715187907218933, "learning_rate": 0.0002, "epoch": 0.902127659574468, "step": 530}, {"loss": 1.7004, "grad_norm": 0.4932813048362732, "learning_rate": 0.0002, "epoch": 0.9191489361702128, "step": 540}, {"loss": 1.679, "grad_norm": 0.3493495285511017, "learning_rate": 0.0002, "epoch": 0.9361702127659575, "step": 550}, {"loss": 1.7758, "grad_norm": 0.3598061800003052, "learning_rate": 0.0002, "epoch": 0.9531914893617022, "step": 560}, {"loss": 1.7686, "grad_norm": 0.3521560728549957, "learning_rate": 0.0002, "epoch": 0.9702127659574468, "step": 570}, {"loss": 1.7391, "grad_norm": 0.34150034189224243, "learning_rate": 0.0002, "epoch": 0.9872340425531915, "step": 580}, {"eval_loss": 1.8388911485671997, "eval_runtime": 106.6788, "eval_samples_per_second": 4.828, "eval_steps_per_second": 0.609, "epoch": 0.9991489361702127, "step": 587}, {"loss": 1.6115, "grad_norm": 0.33556103706359863, "learning_rate": 0.0002, "epoch": 1.004255319148936, "step": 590}, {"loss": 1.8346, "grad_norm": 0.4333398640155792, "learning_rate": 0.0002, "epoch": 1.0212765957446808, "step": 600}, {"loss": 1.6656, "grad_norm": 0.38488736748695374, "learning_rate": 0.0002, "epoch": 1.0382978723404255, "step": 610}, {"loss": 1.7778, "grad_norm": 0.44454529881477356, "learning_rate": 0.0002, "epoch": 1.0553191489361702, "step": 620}, {"loss": 1.7143, "grad_norm": 0.3735603392124176, "learning_rate": 0.0002, "epoch": 1.0723404255319149, "step": 630}, {"loss": 1.74, "grad_norm": 0.38912704586982727, "learning_rate": 0.0002, "epoch": 1.0893617021276596, "step": 640}, {"loss": 1.607, "grad_norm": 0.4411826431751251, "learning_rate": 0.0002, "epoch": 1.1063829787234043, "step": 650}, {"loss": 1.6901, "grad_norm": 0.4163050353527069, "learning_rate": 0.0002, "epoch": 1.123404255319149, "step": 660}, {"loss": 1.6722, "grad_norm": 0.4187192916870117, "learning_rate": 0.0002, "epoch": 1.1404255319148937, "step": 670}, {"loss": 1.7061, "grad_norm": 0.3797093629837036, "learning_rate": 0.0002, "epoch": 1.1574468085106382, "step": 680}, {"loss": 1.6409, "grad_norm": 0.4210026264190674, "learning_rate": 0.0002, "epoch": 1.174468085106383, "step": 690}, {"loss": 1.6343, "grad_norm": 0.4701998829841614, "learning_rate": 0.0002, "epoch": 1.1914893617021276, "step": 700}, {"loss": 1.6773, "grad_norm": 0.6331578493118286, "learning_rate": 0.0002, "epoch": 1.2085106382978723, "step": 710}, {"loss": 1.6776, "grad_norm": 0.41908255219459534, "learning_rate": 0.0002, "epoch": 1.225531914893617, "step": 720}, {"loss": 1.6468, "grad_norm": 0.36158403754234314, "learning_rate": 0.0002, "epoch": 1.2425531914893617, "step": 730}, {"loss": 1.746, "grad_norm": 0.387300580739975, "learning_rate": 0.0002, "epoch": 1.2595744680851064, "step": 740}, {"loss": 1.7089, "grad_norm": 0.38899728655815125, "learning_rate": 0.0002, "epoch": 1.2765957446808511, "step": 750}, {"loss": 1.6376, "grad_norm": 0.4549255073070526, "learning_rate": 0.0002, "epoch": 1.2936170212765958, "step": 760}, {"loss": 1.6754, "grad_norm": 0.4052349328994751, "learning_rate": 0.0002, "epoch": 1.3106382978723405, "step": 770}, {"loss": 1.6483, "grad_norm": 0.38934215903282166, "learning_rate": 0.0002, "epoch": 1.327659574468085, "step": 780}, {"loss": 1.7813, "grad_norm": 0.38688382506370544, "learning_rate": 0.0002, "epoch": 1.3446808510638297, "step": 790}, {"loss": 1.6374, "grad_norm": 0.3825705051422119, "learning_rate": 0.0002, "epoch": 1.3617021276595744, "step": 800}, {"loss": 1.655, "grad_norm": 0.37331756949424744, "learning_rate": 0.0002, "epoch": 1.3787234042553191, "step": 810}, {"loss": 1.6321, "grad_norm": 0.38826408982276917, "learning_rate": 0.0002, "epoch": 1.3957446808510638, "step": 820}, {"loss": 1.6107, "grad_norm": 0.4213569164276123, "learning_rate": 0.0002, "epoch": 1.4127659574468086, "step": 830}, {"loss": 1.6792, "grad_norm": 0.3976684808731079, "learning_rate": 0.0002, "epoch": 1.4297872340425533, "step": 840}, {"loss": 1.7029, "grad_norm": 0.390009343624115, "learning_rate": 0.0002, "epoch": 1.4468085106382977, "step": 850}, {"loss": 1.7489, "grad_norm": 0.4462052583694458, "learning_rate": 0.0002, "epoch": 1.4638297872340424, "step": 860}, {"loss": 1.7513, "grad_norm": 0.42129236459732056, "learning_rate": 0.0002, "epoch": 1.4808510638297872, "step": 870}, {"loss": 1.6009, "grad_norm": 0.41489893198013306, "learning_rate": 0.0002, "epoch": 1.4978723404255319, "step": 880}, {"loss": 1.7129, "grad_norm": 0.41451677680015564, "learning_rate": 0.0002, "epoch": 1.5148936170212766, "step": 890}, {"loss": 1.6427, "grad_norm": 0.4477299749851227, "learning_rate": 0.0002, "epoch": 1.5319148936170213, "step": 900}, {"loss": 1.6543, "grad_norm": 0.38476648926734924, "learning_rate": 0.0002, "epoch": 1.548936170212766, "step": 910}, {"loss": 1.7103, "grad_norm": 0.42755743861198425, "learning_rate": 0.0002, "epoch": 1.5659574468085107, "step": 920}, {"loss": 1.7014, "grad_norm": 0.39372023940086365, "learning_rate": 0.0002, "epoch": 1.5829787234042554, "step": 930}, {"loss": 1.7042, "grad_norm": 0.42778754234313965, "learning_rate": 0.0002, "epoch": 1.6, "step": 940}, {"loss": 1.7488, "grad_norm": 0.4217268228530884, "learning_rate": 0.0002, "epoch": 1.6170212765957448, "step": 950}, {"loss": 1.6556, "grad_norm": 0.40452107787132263, "learning_rate": 0.0002, "epoch": 1.6340425531914895, "step": 960}, {"loss": 1.6734, "grad_norm": 0.4259980022907257, "learning_rate": 0.0002, "epoch": 1.6510638297872342, "step": 970}, {"loss": 1.6464, "grad_norm": 0.4089849591255188, "learning_rate": 0.0002, "epoch": 1.6680851063829787, "step": 980}, {"loss": 1.6588, "grad_norm": 0.38276049494743347, "learning_rate": 0.0002, "epoch": 1.6851063829787234, "step": 990}, {"loss": 1.6846, "grad_norm": 0.40361565351486206, "learning_rate": 0.0002, "epoch": 1.702127659574468, "step": 1000}, {"loss": 1.698, "grad_norm": 0.3537807762622833, "learning_rate": 0.0002, "epoch": 1.7191489361702128, "step": 1010}, {"loss": 1.6001, "grad_norm": 0.40288347005844116, "learning_rate": 0.0002, "epoch": 1.7361702127659573, "step": 1020}, {"loss": 1.6375, "grad_norm": 0.4003616273403168, "learning_rate": 0.0002, "epoch": 1.753191489361702, "step": 1030}, {"loss": 1.6067, "grad_norm": 0.3931669592857361, "learning_rate": 0.0002, "epoch": 1.7702127659574467, "step": 1040}, {"loss": 1.6814, "grad_norm": 0.4001635015010834, "learning_rate": 0.0002, "epoch": 1.7872340425531914, "step": 1050}, {"loss": 1.6158, "grad_norm": 0.4139048457145691, "learning_rate": 0.0002, "epoch": 1.804255319148936, "step": 1060}, {"loss": 1.7513, "grad_norm": 0.5044458508491516, "learning_rate": 0.0002, "epoch": 1.8212765957446808, "step": 1070}, {"loss": 1.7317, "grad_norm": 0.4827095568180084, "learning_rate": 0.0002, "epoch": 1.8382978723404255, "step": 1080}, {"loss": 1.6586, "grad_norm": 0.3750515282154083, "learning_rate": 0.0002, "epoch": 1.8553191489361702, "step": 1090}, {"loss": 1.653, "grad_norm": 0.4024597704410553, "learning_rate": 0.0002, "epoch": 1.872340425531915, "step": 1100}, {"loss": 1.7138, "grad_norm": 0.36747241020202637, "learning_rate": 0.0002, "epoch": 1.8893617021276596, "step": 1110}, {"loss": 1.652, "grad_norm": 0.41397711634635925, "learning_rate": 0.0002, "epoch": 1.9063829787234043, "step": 1120}, {"loss": 1.7071, "grad_norm": 0.3960763216018677, "learning_rate": 0.0002, "epoch": 1.923404255319149, "step": 1130}, {"loss": 1.6857, "grad_norm": 0.4533233344554901, "learning_rate": 0.0002, "epoch": 1.9404255319148938, "step": 1140}, {"loss": 1.7168, "grad_norm": 0.38433438539505005, "learning_rate": 0.0002, "epoch": 1.9574468085106385, "step": 1150}, {"loss": 1.7444, "grad_norm": 0.3648812174797058, "learning_rate": 0.0002, "epoch": 1.974468085106383, "step": 1160}, {"loss": 1.6521, "grad_norm": 0.3887176215648651, "learning_rate": 0.0002, "epoch": 1.9914893617021276, "step": 1170}, {"eval_loss": 1.8328146934509277, "eval_runtime": 107.2842, "eval_samples_per_second": 4.8, "eval_steps_per_second": 0.606, "epoch": 2.0, "step": 1175}, {"loss": 1.6184, "grad_norm": 0.40444880723953247, "learning_rate": 0.0002, "epoch": 2.008510638297872, "step": 1180}, {"loss": 1.5221, "grad_norm": 0.3997816741466522, "learning_rate": 0.0002, "epoch": 2.025531914893617, "step": 1190}, {"loss": 1.6022, "grad_norm": 0.4516718089580536, "learning_rate": 0.0002, "epoch": 2.0425531914893615, "step": 1200}, {"loss": 1.5458, "grad_norm": 0.6645553708076477, "learning_rate": 0.0002, "epoch": 2.0595744680851062, "step": 1210}, {"loss": 1.5985, "grad_norm": 0.4181990921497345, "learning_rate": 0.0002, "epoch": 2.076595744680851, "step": 1220}, {"loss": 1.5768, "grad_norm": 0.45681431889533997, "learning_rate": 0.0002, "epoch": 2.0936170212765957, "step": 1230}, {"loss": 1.5598, "grad_norm": 0.48914700746536255, "learning_rate": 0.0002, "epoch": 2.1106382978723404, "step": 1240}, {"loss": 1.6031, "grad_norm": 0.43265485763549805, "learning_rate": 0.0002, "epoch": 2.127659574468085, "step": 1250}, {"loss": 1.5812, "grad_norm": 0.4641207754611969, "learning_rate": 0.0002, "epoch": 2.1446808510638298, "step": 1260}, {"loss": 1.5686, "grad_norm": 0.4840783476829529, "learning_rate": 0.0002, "epoch": 2.1617021276595745, "step": 1270}, {"loss": 1.5969, "grad_norm": 0.4974595308303833, "learning_rate": 0.0002, "epoch": 2.178723404255319, "step": 1280}, {"loss": 1.5512, "grad_norm": 0.5133475661277771, "learning_rate": 0.0002, "epoch": 2.195744680851064, "step": 1290}, {"loss": 1.6467, "grad_norm": 0.5030052065849304, "learning_rate": 0.0002, "epoch": 2.2127659574468086, "step": 1300}, {"loss": 1.6845, "grad_norm": 0.46602481603622437, "learning_rate": 0.0002, "epoch": 2.2297872340425533, "step": 1310}, {"loss": 1.6132, "grad_norm": 0.43662378191947937, "learning_rate": 0.0002, "epoch": 2.246808510638298, "step": 1320}, {"loss": 1.6223, "grad_norm": 0.5137454867362976, "learning_rate": 0.0002, "epoch": 2.2638297872340427, "step": 1330}, {"loss": 1.5702, "grad_norm": 0.4750335216522217, "learning_rate": 0.0002, "epoch": 2.2808510638297874, "step": 1340}, {"loss": 1.6479, "grad_norm": 0.43691426515579224, "learning_rate": 0.0002, "epoch": 2.297872340425532, "step": 1350}, {"loss": 1.5561, "grad_norm": 0.49752047657966614, "learning_rate": 0.0002, "epoch": 2.3148936170212764, "step": 1360}, {"loss": 1.5789, "grad_norm": 0.45101815462112427, "learning_rate": 0.0002, "epoch": 2.331914893617021, "step": 1370}, {"loss": 1.5858, "grad_norm": 0.4427817761898041, "learning_rate": 0.0002, "epoch": 2.348936170212766, "step": 1380}, {"loss": 1.5537, "grad_norm": 0.4802311062812805, "learning_rate": 0.0002, "epoch": 2.3659574468085105, "step": 1390}, {"loss": 1.5846, "grad_norm": 0.4512513279914856, "learning_rate": 0.0002, "epoch": 2.382978723404255, "step": 1400}, {"loss": 1.627, "grad_norm": 0.4878857135772705, "learning_rate": 0.0002, "epoch": 2.4, "step": 1410}, {"loss": 1.5781, "grad_norm": 0.4741315543651581, "learning_rate": 0.0002, "epoch": 2.4170212765957446, "step": 1420}, {"loss": 1.5595, "grad_norm": 0.4770931601524353, "learning_rate": 0.0002, "epoch": 2.4340425531914893, "step": 1430}, {"loss": 1.5336, "grad_norm": 0.5124667286872864, "learning_rate": 0.0002, "epoch": 2.451063829787234, "step": 1440}, {"loss": 1.5811, "grad_norm": 0.45264801383018494, "learning_rate": 0.0002, "epoch": 2.4680851063829787, "step": 1450}, {"loss": 1.604, "grad_norm": 0.5456924438476562, "learning_rate": 0.0002, "epoch": 2.4851063829787234, "step": 1460}, {"loss": 1.6063, "grad_norm": 0.44656285643577576, "learning_rate": 0.0002, "epoch": 2.502127659574468, "step": 1470}, {"loss": 1.6336, "grad_norm": 0.5939419865608215, "learning_rate": 0.0002, "epoch": 2.519148936170213, "step": 1480}, {"loss": 1.5481, "grad_norm": 0.47853362560272217, "learning_rate": 0.0002, "epoch": 2.5361702127659576, "step": 1490}, {"loss": 1.6543, "grad_norm": 0.47643396258354187, "learning_rate": 0.0002, "epoch": 2.5531914893617023, "step": 1500}, {"loss": 1.5085, "grad_norm": 0.4939501881599426, "learning_rate": 0.0002, "epoch": 2.570212765957447, "step": 1510}, {"loss": 1.5659, "grad_norm": 0.502055287361145, "learning_rate": 0.0002, "epoch": 2.5872340425531917, "step": 1520}, {"loss": 1.6519, "grad_norm": 0.463250994682312, "learning_rate": 0.0002, "epoch": 2.604255319148936, "step": 1530}, {"loss": 1.5698, "grad_norm": 0.4761098623275757, "learning_rate": 0.0002, "epoch": 2.621276595744681, "step": 1540}, {"loss": 1.6044, "grad_norm": 0.4687299132347107, "learning_rate": 0.0002, "epoch": 2.6382978723404253, "step": 1550}, {"loss": 1.632, "grad_norm": 0.5536078810691833, "learning_rate": 0.0002, "epoch": 2.65531914893617, "step": 1560}, {"loss": 1.586, "grad_norm": 0.581320583820343, "learning_rate": 0.0002, "epoch": 2.6723404255319148, "step": 1570}, {"loss": 1.508, "grad_norm": 0.45952868461608887, "learning_rate": 0.0002, "epoch": 2.6893617021276595, "step": 1580}, {"loss": 1.5905, "grad_norm": 0.4602586328983307, "learning_rate": 0.0002, "epoch": 2.706382978723404, "step": 1590}, {"loss": 1.6008, "grad_norm": 0.5276554226875305, "learning_rate": 0.0002, "epoch": 2.723404255319149, "step": 1600}, {"loss": 1.7216, "grad_norm": 0.5750249624252319, "learning_rate": 0.0002, "epoch": 2.7404255319148936, "step": 1610}, {"loss": 1.5506, "grad_norm": 0.468723863363266, "learning_rate": 0.0002, "epoch": 2.7574468085106383, "step": 1620}, {"loss": 1.4499, "grad_norm": 0.44649943709373474, "learning_rate": 0.0002, "epoch": 2.774468085106383, "step": 1630}, {"loss": 1.5106, "grad_norm": 0.5097237825393677, "learning_rate": 0.0002, "epoch": 2.7914893617021277, "step": 1640}, {"loss": 1.5948, "grad_norm": 0.46384191513061523, "learning_rate": 0.0002, "epoch": 2.8085106382978724, "step": 1650}, {"loss": 1.5828, "grad_norm": 0.4885474443435669, "learning_rate": 0.0002, "epoch": 2.825531914893617, "step": 1660}, {"loss": 1.5675, "grad_norm": 0.45621681213378906, "learning_rate": 0.0002, "epoch": 2.842553191489362, "step": 1670}, {"loss": 1.6042, "grad_norm": 0.4797150194644928, "learning_rate": 0.0002, "epoch": 2.8595744680851065, "step": 1680}, {"loss": 1.5601, "grad_norm": 0.5142032504081726, "learning_rate": 0.0002, "epoch": 2.876595744680851, "step": 1690}, {"loss": 1.5984, "grad_norm": 0.48939862847328186, "learning_rate": 0.0002, "epoch": 2.8936170212765955, "step": 1700}, {"loss": 1.6333, "grad_norm": 0.4575578272342682, "learning_rate": 0.0002, "epoch": 2.9106382978723406, "step": 1710}, {"loss": 1.5396, "grad_norm": 0.5589063763618469, "learning_rate": 0.0002, "epoch": 2.927659574468085, "step": 1720}, {"loss": 1.6096, "grad_norm": 0.48508813977241516, "learning_rate": 0.0002, "epoch": 2.94468085106383, "step": 1730}, {"loss": 1.5686, "grad_norm": 0.42786726355552673, "learning_rate": 0.0002, "epoch": 2.9617021276595743, "step": 1740}, {"loss": 1.5555, "grad_norm": 0.5598229765892029, "learning_rate": 0.0002, "epoch": 2.978723404255319, "step": 1750}, {"loss": 1.5035, "grad_norm": 0.4779253602027893, "learning_rate": 0.0002, "epoch": 2.9957446808510637, "step": 1760}, {"eval_loss": 1.8543579578399658, "eval_runtime": 107.2363, "eval_samples_per_second": 4.802, "eval_steps_per_second": 0.606, "epoch": 2.999148936170213, "step": 1762}, {"loss": 1.4767, "grad_norm": 0.48810940980911255, "learning_rate": 0.0002, "epoch": 3.0127659574468084, "step": 1770}, {"loss": 1.5385, "grad_norm": 0.6194920539855957, "learning_rate": 0.0002, "epoch": 3.029787234042553, "step": 1780}, {"loss": 1.4012, "grad_norm": 0.5875462293624878, "learning_rate": 0.0002, "epoch": 3.046808510638298, "step": 1790}, {"loss": 1.4727, "grad_norm": 0.5775138139724731, "learning_rate": 0.0002, "epoch": 3.0638297872340425, "step": 1800}, {"loss": 1.493, "grad_norm": 0.5445981621742249, "learning_rate": 0.0002, "epoch": 3.0808510638297872, "step": 1810}, {"loss": 1.4247, "grad_norm": 0.6728862524032593, "learning_rate": 0.0002, "epoch": 3.097872340425532, "step": 1820}, {"loss": 1.4303, "grad_norm": 0.6105490326881409, "learning_rate": 0.0002, "epoch": 3.1148936170212767, "step": 1830}, {"loss": 1.5214, "grad_norm": 0.5771165490150452, "learning_rate": 0.0002, "epoch": 3.1319148936170214, "step": 1840}, {"loss": 1.4359, "grad_norm": 0.5778449773788452, "learning_rate": 0.0002, "epoch": 3.148936170212766, "step": 1850}, {"loss": 1.4121, "grad_norm": 0.7141990661621094, "learning_rate": 0.0002, "epoch": 3.1659574468085108, "step": 1860}, {"loss": 1.4904, "grad_norm": 0.5882705450057983, "learning_rate": 0.0002, "epoch": 3.1829787234042555, "step": 1870}, {"loss": 1.4941, "grad_norm": 0.5996195077896118, "learning_rate": 0.0002, "epoch": 3.2, "step": 1880}, {"loss": 1.4519, "grad_norm": 0.6121219396591187, "learning_rate": 0.0002, "epoch": 3.217021276595745, "step": 1890}, {"loss": 1.4586, "grad_norm": 0.6402981281280518, "learning_rate": 0.0002, "epoch": 3.2340425531914896, "step": 1900}, {"loss": 1.3766, "grad_norm": 0.6111783981323242, "learning_rate": 0.0002, "epoch": 3.251063829787234, "step": 1910}, {"loss": 1.4863, "grad_norm": 0.6682435274124146, "learning_rate": 0.0002, "epoch": 3.2680851063829786, "step": 1920}, {"loss": 1.4608, "grad_norm": 0.6530760526657104, "learning_rate": 0.0002, "epoch": 3.2851063829787233, "step": 1930}, {"loss": 1.4422, "grad_norm": 0.6481217741966248, "learning_rate": 0.0002, "epoch": 3.302127659574468, "step": 1940}, {"loss": 1.5158, "grad_norm": 0.6270697116851807, "learning_rate": 0.0002, "epoch": 3.3191489361702127, "step": 1950}, {"loss": 1.4116, "grad_norm": 0.5924492478370667, "learning_rate": 0.0002, "epoch": 3.3361702127659574, "step": 1960}, {"loss": 1.4578, "grad_norm": 0.5803806781768799, "learning_rate": 0.0002, "epoch": 3.353191489361702, "step": 1970}, {"loss": 1.4689, "grad_norm": 0.5754119157791138, "learning_rate": 0.0002, "epoch": 3.370212765957447, "step": 1980}, {"loss": 1.4605, "grad_norm": 0.6717178821563721, "learning_rate": 0.0002, "epoch": 3.3872340425531915, "step": 1990}, {"loss": 1.486, "grad_norm": 0.5955582857131958, "learning_rate": 0.0002, "epoch": 3.404255319148936, "step": 2000}, {"loss": 1.4445, "grad_norm": 0.6965329647064209, "learning_rate": 0.0002, "epoch": 3.421276595744681, "step": 2010}, {"loss": 1.4543, "grad_norm": 0.6321573257446289, "learning_rate": 0.0002, "epoch": 3.4382978723404256, "step": 2020}, {"loss": 1.5383, "grad_norm": 0.5952608585357666, "learning_rate": 0.0002, "epoch": 3.4553191489361703, "step": 2030}, {"loss": 1.4531, "grad_norm": 0.7718905806541443, "learning_rate": 0.0002, "epoch": 3.472340425531915, "step": 2040}, {"loss": 1.4678, "grad_norm": 0.6850892305374146, "learning_rate": 0.0002, "epoch": 3.4893617021276597, "step": 2050}, {"loss": 1.4956, "grad_norm": 0.5638895630836487, "learning_rate": 0.0002, "epoch": 3.506382978723404, "step": 2060}, {"loss": 1.4586, "grad_norm": 0.6148294806480408, "learning_rate": 0.0002, "epoch": 3.523404255319149, "step": 2070}, {"loss": 1.4622, "grad_norm": 0.5895810723304749, "learning_rate": 0.0002, "epoch": 3.5404255319148934, "step": 2080}, {"loss": 1.4341, "grad_norm": 0.6377319693565369, "learning_rate": 0.0002, "epoch": 3.5574468085106385, "step": 2090}, {"loss": 1.5056, "grad_norm": 0.6047691702842712, "learning_rate": 0.0002, "epoch": 3.574468085106383, "step": 2100}, {"loss": 1.4748, "grad_norm": 0.6049593687057495, "learning_rate": 0.0002, "epoch": 3.5914893617021275, "step": 2110}, {"loss": 1.391, "grad_norm": 0.6358312368392944, "learning_rate": 0.0002, "epoch": 3.608510638297872, "step": 2120}, {"loss": 1.4419, "grad_norm": 0.612119197845459, "learning_rate": 0.0002, "epoch": 3.625531914893617, "step": 2130}, {"loss": 1.438, "grad_norm": 0.6788054704666138, "learning_rate": 0.0002, "epoch": 3.6425531914893616, "step": 2140}, {"loss": 1.4295, "grad_norm": 0.6191043853759766, "learning_rate": 0.0002, "epoch": 3.6595744680851063, "step": 2150}, {"loss": 1.4383, "grad_norm": 0.6660051941871643, "learning_rate": 0.0002, "epoch": 3.676595744680851, "step": 2160}, {"loss": 1.4954, "grad_norm": 0.652692973613739, "learning_rate": 0.0002, "epoch": 3.6936170212765957, "step": 2170}, {"loss": 1.5245, "grad_norm": 0.6123467087745667, "learning_rate": 0.0002, "epoch": 3.7106382978723405, "step": 2180}, {"loss": 1.4686, "grad_norm": 0.640021562576294, "learning_rate": 0.0002, "epoch": 3.727659574468085, "step": 2190}, {"loss": 1.4277, "grad_norm": 0.6809179782867432, "learning_rate": 0.0002, "epoch": 3.74468085106383, "step": 2200}, {"loss": 1.4705, "grad_norm": 0.5978420376777649, "learning_rate": 0.0002, "epoch": 3.7617021276595746, "step": 2210}, {"loss": 1.5559, "grad_norm": 0.7038803100585938, "learning_rate": 0.0002, "epoch": 3.7787234042553193, "step": 2220}, {"loss": 1.4691, "grad_norm": 0.5324276089668274, "learning_rate": 0.0002, "epoch": 3.795744680851064, "step": 2230}, {"loss": 1.4696, "grad_norm": 0.6264132857322693, "learning_rate": 0.0002, "epoch": 3.8127659574468087, "step": 2240}, {"loss": 1.4856, "grad_norm": 0.6143888831138611, "learning_rate": 0.0002, "epoch": 3.829787234042553, "step": 2250}, {"loss": 1.535, "grad_norm": 0.6338503360748291, "learning_rate": 0.0002, "epoch": 3.846808510638298, "step": 2260}, {"loss": 1.456, "grad_norm": 0.556882381439209, "learning_rate": 0.0002, "epoch": 3.8638297872340424, "step": 2270}, {"loss": 1.4701, "grad_norm": 0.6323680281639099, "learning_rate": 0.0002, "epoch": 3.8808510638297875, "step": 2280}, {"loss": 1.5333, "grad_norm": 0.7105869054794312, "learning_rate": 0.0002, "epoch": 3.8978723404255318, "step": 2290}, {"loss": 1.4462, "grad_norm": 0.825415849685669, "learning_rate": 0.0002, "epoch": 3.9148936170212765, "step": 2300}, {"loss": 1.5023, "grad_norm": 0.6412091851234436, "learning_rate": 0.0002, "epoch": 3.931914893617021, "step": 2310}, {"loss": 1.3709, "grad_norm": 0.6286490559577942, "learning_rate": 0.0002, "epoch": 3.948936170212766, "step": 2320}, {"loss": 1.4693, "grad_norm": 0.636021077632904, "learning_rate": 0.0002, "epoch": 3.9659574468085106, "step": 2330}, {"loss": 1.4265, "grad_norm": 0.6032362580299377, "learning_rate": 0.0002, "epoch": 3.9829787234042553, "step": 2340}, {"loss": 1.377, "grad_norm": 0.6497282385826111, "learning_rate": 0.0002, "epoch": 4.0, "step": 2350}, {"eval_loss": 1.9081238508224487, "eval_runtime": 106.6404, "eval_samples_per_second": 4.829, "eval_steps_per_second": 0.61, "epoch": 4.0, "step": 2350}, {"loss": 1.317, "grad_norm": 0.6278848648071289, "learning_rate": 0.0002, "epoch": 4.017021276595744, "step": 2360}, {"loss": 1.3229, "grad_norm": 0.8259812593460083, "learning_rate": 0.0002, "epoch": 4.034042553191489, "step": 2370}, {"loss": 1.2776, "grad_norm": 0.7269589304924011, "learning_rate": 0.0002, "epoch": 4.051063829787234, "step": 2380}, {"loss": 1.3668, "grad_norm": 0.7460662126541138, "learning_rate": 0.0002, "epoch": 4.068085106382979, "step": 2390}, {"loss": 1.3096, "grad_norm": 1.2362046241760254, "learning_rate": 0.0002, "epoch": 4.085106382978723, "step": 2400}, {"loss": 1.2906, "grad_norm": 0.7699568867683411, "learning_rate": 0.0002, "epoch": 4.102127659574468, "step": 2410}, {"loss": 1.3005, "grad_norm": 0.8732489347457886, "learning_rate": 0.0002, "epoch": 4.1191489361702125, "step": 2420}, {"loss": 1.2741, "grad_norm": 0.8331889510154724, "learning_rate": 0.0002, "epoch": 4.136170212765958, "step": 2430}, {"loss": 1.1861, "grad_norm": 0.6686427593231201, "learning_rate": 0.0002, "epoch": 4.153191489361702, "step": 2440}, {"loss": 1.316, "grad_norm": 0.906380832195282, "learning_rate": 0.0002, "epoch": 4.170212765957447, "step": 2450}, {"loss": 1.3134, "grad_norm": 0.7269753813743591, "learning_rate": 0.0002, "epoch": 4.187234042553191, "step": 2460}, {"loss": 1.299, "grad_norm": 0.8556067943572998, "learning_rate": 0.0002, "epoch": 4.2042553191489365, "step": 2470}, {"loss": 1.2935, "grad_norm": 0.7076917886734009, "learning_rate": 0.0002, "epoch": 4.221276595744681, "step": 2480}, {"loss": 1.2608, "grad_norm": 0.7596837282180786, "learning_rate": 0.0002, "epoch": 4.238297872340426, "step": 2490}, {"loss": 1.2747, "grad_norm": 0.7790552377700806, "learning_rate": 0.0002, "epoch": 4.25531914893617, "step": 2500}, {"loss": 1.3438, "grad_norm": 0.8205534219741821, "learning_rate": 0.0002, "epoch": 4.272340425531915, "step": 2510}, {"loss": 1.3058, "grad_norm": 0.7892114520072937, "learning_rate": 0.0002, "epoch": 4.2893617021276595, "step": 2520}, {"loss": 1.3662, "grad_norm": 0.8907270431518555, "learning_rate": 0.0002, "epoch": 4.306382978723404, "step": 2530}, {"loss": 1.3168, "grad_norm": 0.821794331073761, "learning_rate": 0.0002, "epoch": 4.323404255319149, "step": 2540}, {"loss": 1.2467, "grad_norm": 0.7305247783660889, "learning_rate": 0.0002, "epoch": 4.340425531914893, "step": 2550}, {"loss": 1.3446, "grad_norm": 0.8639982342720032, "learning_rate": 0.0002, "epoch": 4.357446808510638, "step": 2560}, {"loss": 1.3863, "grad_norm": 0.8883494138717651, "learning_rate": 0.0002, "epoch": 4.374468085106383, "step": 2570}, {"loss": 1.3693, "grad_norm": 0.7611730098724365, "learning_rate": 0.0002, "epoch": 4.391489361702128, "step": 2580}, {"loss": 1.2814, "grad_norm": 0.7793022394180298, "learning_rate": 0.0002, "epoch": 4.408510638297872, "step": 2590}, {"loss": 1.3014, "grad_norm": 0.979060173034668, "learning_rate": 0.0002, "epoch": 4.425531914893617, "step": 2600}, {"loss": 1.3625, "grad_norm": 0.8320847749710083, "learning_rate": 0.0002, "epoch": 4.4425531914893615, "step": 2610}, {"loss": 1.3362, "grad_norm": 0.7481992244720459, "learning_rate": 0.0002, "epoch": 4.459574468085107, "step": 2620}, {"loss": 1.4037, "grad_norm": 0.783770740032196, "learning_rate": 0.0002, "epoch": 4.476595744680851, "step": 2630}, {"loss": 1.3049, "grad_norm": 0.773295521736145, "learning_rate": 0.0002, "epoch": 4.493617021276596, "step": 2640}, {"loss": 1.2739, "grad_norm": 0.9206840991973877, "learning_rate": 0.0002, "epoch": 4.51063829787234, "step": 2650}, {"loss": 1.3248, "grad_norm": 0.8803266882896423, "learning_rate": 0.0002, "epoch": 4.527659574468085, "step": 2660}, {"loss": 1.2898, "grad_norm": 0.9315535426139832, "learning_rate": 0.0002, "epoch": 4.54468085106383, "step": 2670}, {"loss": 1.316, "grad_norm": 0.8610678315162659, "learning_rate": 0.0002, "epoch": 4.561702127659575, "step": 2680}, {"loss": 1.2633, "grad_norm": 0.7405551671981812, "learning_rate": 0.0002, "epoch": 4.578723404255319, "step": 2690}, {"loss": 1.3136, "grad_norm": 1.0238394737243652, "learning_rate": 0.0002, "epoch": 4.595744680851064, "step": 2700}, {"loss": 1.4847, "grad_norm": 0.7814345955848694, "learning_rate": 0.0002, "epoch": 4.6127659574468085, "step": 2710}, {"loss": 1.295, "grad_norm": 0.8436329364776611, "learning_rate": 0.0002, "epoch": 4.629787234042553, "step": 2720}, {"loss": 1.3525, "grad_norm": 0.727214515209198, "learning_rate": 0.0002, "epoch": 4.646808510638298, "step": 2730}, {"loss": 1.3878, "grad_norm": 0.8465878367424011, "learning_rate": 0.0002, "epoch": 4.663829787234042, "step": 2740}, {"loss": 1.278, "grad_norm": 0.8218137621879578, "learning_rate": 0.0002, "epoch": 4.680851063829787, "step": 2750}, {"loss": 1.3628, "grad_norm": 0.7900442481040955, "learning_rate": 0.0002, "epoch": 4.697872340425532, "step": 2760}, {"loss": 1.3494, "grad_norm": 0.8214074969291687, "learning_rate": 0.0002, "epoch": 4.714893617021277, "step": 2770}, {"loss": 1.3954, "grad_norm": 0.7509574890136719, "learning_rate": 0.0002, "epoch": 4.731914893617021, "step": 2780}, {"loss": 1.3693, "grad_norm": 0.7416139245033264, "learning_rate": 0.0002, "epoch": 4.748936170212766, "step": 2790}, {"loss": 1.3045, "grad_norm": 0.8629977107048035, "learning_rate": 0.0002, "epoch": 4.76595744680851, "step": 2800}, {"loss": 1.3164, "grad_norm": 0.8056505918502808, "learning_rate": 0.0002, "epoch": 4.782978723404256, "step": 2810}, {"loss": 1.3056, "grad_norm": 0.7705401182174683, "learning_rate": 0.0002, "epoch": 4.8, "step": 2820}, {"loss": 1.3771, "grad_norm": 1.0173288583755493, "learning_rate": 0.0002, "epoch": 4.817021276595745, "step": 2830}, {"loss": 1.3494, "grad_norm": 0.8375823497772217, "learning_rate": 0.0002, "epoch": 4.834042553191489, "step": 2840}, {"loss": 1.3238, "grad_norm": 0.857073187828064, "learning_rate": 0.0002, "epoch": 4.851063829787234, "step": 2850}, {"loss": 1.2964, "grad_norm": 0.8672189712524414, "learning_rate": 0.0002, "epoch": 4.868085106382979, "step": 2860}, {"loss": 1.3646, "grad_norm": 0.8599910140037537, "learning_rate": 0.0002, "epoch": 4.885106382978723, "step": 2870}, {"loss": 1.3575, "grad_norm": 0.8844674229621887, "learning_rate": 0.0002, "epoch": 4.902127659574468, "step": 2880}, {"loss": 1.285, "grad_norm": 0.8246751427650452, "learning_rate": 0.0002, "epoch": 4.919148936170213, "step": 2890}, {"loss": 1.4116, "grad_norm": 0.8648163676261902, "learning_rate": 0.0002, "epoch": 4.9361702127659575, "step": 2900}, {"loss": 1.2614, "grad_norm": 0.9477900266647339, "learning_rate": 0.0002, "epoch": 4.953191489361702, "step": 2910}, {"loss": 1.3519, "grad_norm": 0.8047965168952942, "learning_rate": 0.0002, "epoch": 4.970212765957447, "step": 2920}, {"loss": 1.3889, "grad_norm": 0.9872494339942932, "learning_rate": 0.0002, "epoch": 4.987234042553191, "step": 2930}, {"eval_loss": 1.9836769104003906, "eval_runtime": 106.4655, "eval_samples_per_second": 4.837, "eval_steps_per_second": 0.611, "epoch": 4.999148936170212, "step": 2937}, {"loss": 1.2574, "grad_norm": 0.7292938828468323, "learning_rate": 0.0002, "epoch": 5.004255319148936, "step": 2940}, {"loss": 1.1312, "grad_norm": 0.8610548973083496, "learning_rate": 0.0002, "epoch": 5.0212765957446805, "step": 2950}, {"loss": 1.1105, "grad_norm": 0.8384576439857483, "learning_rate": 0.0002, "epoch": 5.038297872340426, "step": 2960}, {"loss": 1.1412, "grad_norm": 0.9746620059013367, "learning_rate": 0.0002, "epoch": 5.05531914893617, "step": 2970}, {"loss": 1.1687, "grad_norm": 0.8879048228263855, "learning_rate": 0.0002, "epoch": 5.072340425531915, "step": 2980}, {"loss": 1.1333, "grad_norm": 0.9006168246269226, "learning_rate": 0.0002, "epoch": 5.089361702127659, "step": 2990}, {"loss": 1.1659, "grad_norm": 0.9770249128341675, "learning_rate": 0.0002, "epoch": 5.1063829787234045, "step": 3000}, {"loss": 1.1334, "grad_norm": 1.267967939376831, "learning_rate": 0.0002, "epoch": 5.123404255319149, "step": 3010}, {"loss": 1.2095, "grad_norm": 0.9857587218284607, "learning_rate": 0.0002, "epoch": 5.140425531914894, "step": 3020}, {"loss": 1.0889, "grad_norm": 1.2938690185546875, "learning_rate": 0.0002, "epoch": 5.157446808510638, "step": 3030}, {"loss": 1.1645, "grad_norm": 0.8928244113922119, "learning_rate": 0.0002, "epoch": 5.174468085106383, "step": 3040}, {"loss": 1.1553, "grad_norm": 1.1087630987167358, "learning_rate": 0.0002, "epoch": 5.191489361702128, "step": 3050}, {"loss": 1.1416, "grad_norm": 0.9431360960006714, "learning_rate": 0.0002, "epoch": 5.208510638297873, "step": 3060}, {"loss": 1.1635, "grad_norm": 1.2048338651657104, "learning_rate": 0.0002, "epoch": 5.225531914893617, "step": 3070}, {"loss": 1.171, "grad_norm": 1.0017054080963135, "learning_rate": 0.0002, "epoch": 5.242553191489361, "step": 3080}, {"loss": 1.2212, "grad_norm": 1.2771434783935547, "learning_rate": 0.0002, "epoch": 5.259574468085106, "step": 3090}, {"loss": 1.1478, "grad_norm": 1.4307383298873901, "learning_rate": 0.0002, "epoch": 5.276595744680851, "step": 3100}, {"loss": 1.2132, "grad_norm": 1.2460752725601196, "learning_rate": 0.0002, "epoch": 5.293617021276596, "step": 3110}, {"loss": 1.235, "grad_norm": 1.693974494934082, "learning_rate": 0.0002, "epoch": 5.31063829787234, "step": 3120}, {"loss": 1.1961, "grad_norm": 0.9855408668518066, "learning_rate": 0.0002, "epoch": 5.327659574468085, "step": 3130}, {"loss": 1.2068, "grad_norm": 1.307521104812622, "learning_rate": 0.0002, "epoch": 5.3446808510638295, "step": 3140}, {"loss": 1.2144, "grad_norm": 0.957661509513855, "learning_rate": 0.0002, "epoch": 5.361702127659575, "step": 3150}, {"loss": 1.1305, "grad_norm": 0.870373010635376, "learning_rate": 0.0002, "epoch": 5.378723404255319, "step": 3160}, {"loss": 1.2196, "grad_norm": 0.9324309229850769, "learning_rate": 0.0002, "epoch": 5.395744680851064, "step": 3170}, {"loss": 1.1691, "grad_norm": 1.0142403841018677, "learning_rate": 0.0002, "epoch": 5.412765957446808, "step": 3180}, {"loss": 1.1788, "grad_norm": 0.9759578704833984, "learning_rate": 0.0002, "epoch": 5.4297872340425535, "step": 3190}, {"loss": 1.1321, "grad_norm": 0.9021993279457092, "learning_rate": 0.0002, "epoch": 5.446808510638298, "step": 3200}, {"loss": 1.2222, "grad_norm": 1.007728934288025, "learning_rate": 0.0002, "epoch": 5.463829787234043, "step": 3210}, {"loss": 1.1517, "grad_norm": 0.8969265222549438, "learning_rate": 0.0002, "epoch": 5.480851063829787, "step": 3220}, {"loss": 1.2061, "grad_norm": 0.9672483801841736, "learning_rate": 0.0002, "epoch": 5.497872340425532, "step": 3230}, {"loss": 1.1454, "grad_norm": 1.1417138576507568, "learning_rate": 0.0002, "epoch": 5.514893617021277, "step": 3240}, {"loss": 1.1871, "grad_norm": 0.9669530391693115, "learning_rate": 0.0002, "epoch": 5.531914893617021, "step": 3250}, {"loss": 1.1382, "grad_norm": 1.0161820650100708, "learning_rate": 0.0002, "epoch": 5.548936170212766, "step": 3260}, {"loss": 1.1708, "grad_norm": 0.9935774803161621, "learning_rate": 0.0002, "epoch": 5.565957446808511, "step": 3270}, {"loss": 1.1384, "grad_norm": 1.2572048902511597, "learning_rate": 0.0002, "epoch": 5.582978723404255, "step": 3280}, {"loss": 1.1711, "grad_norm": 0.9614662528038025, "learning_rate": 0.0002, "epoch": 5.6, "step": 3290}, {"loss": 1.219, "grad_norm": 0.9835584163665771, "learning_rate": 0.0002, "epoch": 5.617021276595745, "step": 3300}, {"loss": 1.2074, "grad_norm": 0.9387389421463013, "learning_rate": 0.0002, "epoch": 5.634042553191489, "step": 3310}, {"loss": 1.1148, "grad_norm": 0.9348428249359131, "learning_rate": 0.0002, "epoch": 5.651063829787234, "step": 3320}, {"loss": 1.2378, "grad_norm": 0.9636440873146057, "learning_rate": 0.0002, "epoch": 5.6680851063829785, "step": 3330}, {"loss": 1.2068, "grad_norm": 0.995894193649292, "learning_rate": 0.0002, "epoch": 5.685106382978724, "step": 3340}, {"loss": 1.1443, "grad_norm": 1.0357023477554321, "learning_rate": 0.0002, "epoch": 5.702127659574468, "step": 3350}, {"loss": 1.2209, "grad_norm": 1.0254428386688232, "learning_rate": 0.0002, "epoch": 5.719148936170213, "step": 3360}, {"loss": 1.1987, "grad_norm": 0.8993342518806458, "learning_rate": 0.0002, "epoch": 5.736170212765957, "step": 3370}, {"loss": 1.1527, "grad_norm": 0.9104585647583008, "learning_rate": 0.0002, "epoch": 5.753191489361702, "step": 3380}, {"loss": 1.2268, "grad_norm": 0.9555654525756836, "learning_rate": 0.0002, "epoch": 5.770212765957447, "step": 3390}, {"loss": 1.193, "grad_norm": 0.920124351978302, "learning_rate": 0.0002, "epoch": 5.787234042553192, "step": 3400}, {"loss": 1.2263, "grad_norm": 0.999706506729126, "learning_rate": 0.0002, "epoch": 5.804255319148936, "step": 3410}, {"loss": 1.1411, "grad_norm": 0.9292707443237305, "learning_rate": 0.0002, "epoch": 5.821276595744681, "step": 3420}, {"loss": 1.1507, "grad_norm": 1.0074706077575684, "learning_rate": 0.0002, "epoch": 5.8382978723404255, "step": 3430}, {"loss": 1.2709, "grad_norm": 1.0279479026794434, "learning_rate": 0.0002, "epoch": 5.85531914893617, "step": 3440}, {"loss": 1.1992, "grad_norm": 1.0026037693023682, "learning_rate": 0.0002, "epoch": 5.872340425531915, "step": 3450}, {"loss": 1.1416, "grad_norm": 1.0356525182724, "learning_rate": 0.0002, "epoch": 5.889361702127659, "step": 3460}, {"loss": 1.224, "grad_norm": 1.1106643676757812, "learning_rate": 0.0002, "epoch": 5.906382978723404, "step": 3470}, {"loss": 1.1955, "grad_norm": 0.9578408002853394, "learning_rate": 0.0002, "epoch": 5.923404255319149, "step": 3480}, {"loss": 1.2133, "grad_norm": 1.0225932598114014, "learning_rate": 0.0002, "epoch": 5.940425531914894, "step": 3490}, {"loss": 1.157, "grad_norm": 0.9677667021751404, "learning_rate": 0.0002, "epoch": 5.957446808510638, "step": 3500}, {"loss": 1.2196, "grad_norm": 1.0967241525650024, "learning_rate": 0.0002, "epoch": 5.974468085106383, "step": 3510}, {"loss": 1.1807, "grad_norm": 1.2497339248657227, "learning_rate": 0.0002, "epoch": 5.991489361702127, "step": 3520}, {"eval_loss": 2.0976572036743164, "eval_runtime": 105.9679, "eval_samples_per_second": 4.86, "eval_steps_per_second": 0.613, "epoch": 6.0, "step": 3525}, {"loss": 1.0827, "grad_norm": 0.9660930037498474, "learning_rate": 0.0002, "epoch": 6.008510638297873, "step": 3530}, {"loss": 1.0043, "grad_norm": 0.9462300539016724, "learning_rate": 0.0002, "epoch": 6.025531914893617, "step": 3540}, {"loss": 1.0102, "grad_norm": 0.9312542676925659, "learning_rate": 0.0002, "epoch": 6.042553191489362, "step": 3550}, {"loss": 1.0356, "grad_norm": 1.3502222299575806, "learning_rate": 0.0002, "epoch": 6.059574468085106, "step": 3560}, {"loss": 0.9167, "grad_norm": 1.2838709354400635, "learning_rate": 0.0002, "epoch": 6.076595744680851, "step": 3570}, {"loss": 0.9381, "grad_norm": 1.1399385929107666, "learning_rate": 0.0002, "epoch": 6.093617021276596, "step": 3580}, {"loss": 0.9416, "grad_norm": 1.1763123273849487, "learning_rate": 0.0002, "epoch": 6.110638297872341, "step": 3590}, {"loss": 0.9782, "grad_norm": 1.113002061843872, "learning_rate": 0.0002, "epoch": 6.127659574468085, "step": 3600}, {"loss": 0.9521, "grad_norm": 1.0322953462600708, "learning_rate": 0.0002, "epoch": 6.14468085106383, "step": 3610}, {"loss": 0.9114, "grad_norm": 1.2678894996643066, "learning_rate": 0.0002, "epoch": 6.1617021276595745, "step": 3620}, {"loss": 0.9934, "grad_norm": 1.2370864152908325, "learning_rate": 0.0002, "epoch": 6.178723404255319, "step": 3630}, {"loss": 0.9753, "grad_norm": 1.1930763721466064, "learning_rate": 0.0002, "epoch": 6.195744680851064, "step": 3640}, {"loss": 0.9448, "grad_norm": 1.3608582019805908, "learning_rate": 0.0002, "epoch": 6.212765957446808, "step": 3650}, {"loss": 1.0201, "grad_norm": 1.2158547639846802, "learning_rate": 0.0002, "epoch": 6.229787234042553, "step": 3660}, {"loss": 0.9896, "grad_norm": 1.1505420207977295, "learning_rate": 0.0002, "epoch": 6.246808510638298, "step": 3670}, {"loss": 1.0088, "grad_norm": 1.3038114309310913, "learning_rate": 0.0002, "epoch": 6.263829787234043, "step": 3680}, {"loss": 1.0416, "grad_norm": 1.3900057077407837, "learning_rate": 0.0002, "epoch": 6.280851063829787, "step": 3690}, {"loss": 0.9832, "grad_norm": 1.196964144706726, "learning_rate": 0.0002, "epoch": 6.297872340425532, "step": 3700}, {"loss": 1.0778, "grad_norm": 1.205865740776062, "learning_rate": 0.0002, "epoch": 6.314893617021276, "step": 3710}, {"loss": 1.0358, "grad_norm": 1.2710838317871094, "learning_rate": 0.0002, "epoch": 6.3319148936170215, "step": 3720}, {"loss": 1.0271, "grad_norm": 1.285942554473877, "learning_rate": 0.0002, "epoch": 6.348936170212766, "step": 3730}, {"loss": 1.0164, "grad_norm": 1.1717636585235596, "learning_rate": 0.0002, "epoch": 6.365957446808511, "step": 3740}, {"loss": 1.0557, "grad_norm": 1.190883994102478, "learning_rate": 0.0002, "epoch": 6.382978723404255, "step": 3750}, {"loss": 1.0319, "grad_norm": 1.1623435020446777, "learning_rate": 0.0002, "epoch": 6.4, "step": 3760}, {"loss": 1.0633, "grad_norm": 1.2285547256469727, "learning_rate": 0.0002, "epoch": 6.417021276595745, "step": 3770}, {"loss": 1.0593, "grad_norm": 1.1142666339874268, "learning_rate": 0.0002, "epoch": 6.43404255319149, "step": 3780}, {"loss": 1.0418, "grad_norm": 1.333337664604187, "learning_rate": 0.0002, "epoch": 6.451063829787234, "step": 3790}, {"loss": 1.0, "grad_norm": 1.350474238395691, "learning_rate": 0.0002, "epoch": 6.468085106382979, "step": 3800}, {"loss": 1.1152, "grad_norm": 1.2439061403274536, "learning_rate": 0.0002, "epoch": 6.485106382978723, "step": 3810}, {"loss": 1.0915, "grad_norm": 1.2488664388656616, "learning_rate": 0.0002, "epoch": 6.502127659574468, "step": 3820}, {"loss": 1.0571, "grad_norm": 1.1990735530853271, "learning_rate": 0.0002, "epoch": 6.519148936170213, "step": 3830}, {"loss": 0.9895, "grad_norm": 1.5180301666259766, "learning_rate": 0.0002, "epoch": 6.536170212765957, "step": 3840}, {"loss": 0.9955, "grad_norm": 1.1273280382156372, "learning_rate": 0.0002, "epoch": 6.553191489361702, "step": 3850}, {"loss": 1.0516, "grad_norm": 1.2778105735778809, "learning_rate": 0.0002, "epoch": 6.5702127659574465, "step": 3860}, {"loss": 1.0039, "grad_norm": 1.1789685487747192, "learning_rate": 0.0002, "epoch": 6.587234042553192, "step": 3870}, {"loss": 1.0381, "grad_norm": 1.2061398029327393, "learning_rate": 0.0002, "epoch": 6.604255319148936, "step": 3880}, {"loss": 1.0775, "grad_norm": 1.104092001914978, "learning_rate": 0.0002, "epoch": 6.621276595744681, "step": 3890}, {"loss": 1.0591, "grad_norm": 1.2648544311523438, "learning_rate": 0.0002, "epoch": 6.638297872340425, "step": 3900}, {"loss": 1.0535, "grad_norm": 1.2267687320709229, "learning_rate": 0.0002, "epoch": 6.6553191489361705, "step": 3910}, {"loss": 1.0654, "grad_norm": 1.3252530097961426, "learning_rate": 0.0002, "epoch": 6.672340425531915, "step": 3920}, {"loss": 1.0301, "grad_norm": 1.284563660621643, "learning_rate": 0.0002, "epoch": 6.68936170212766, "step": 3930}, {"loss": 1.102, "grad_norm": 1.293845534324646, "learning_rate": 0.0002, "epoch": 6.706382978723404, "step": 3940}, {"loss": 1.1526, "grad_norm": 1.2290467023849487, "learning_rate": 0.0002, "epoch": 6.723404255319149, "step": 3950}, {"loss": 1.0474, "grad_norm": 1.1712737083435059, "learning_rate": 0.0002, "epoch": 6.740425531914894, "step": 3960}, {"loss": 1.0149, "grad_norm": 1.1728616952896118, "learning_rate": 0.0002, "epoch": 6.757446808510638, "step": 3970}, {"loss": 1.0824, "grad_norm": 1.154922604560852, "learning_rate": 0.0002, "epoch": 6.774468085106383, "step": 3980}, {"loss": 1.0961, "grad_norm": 1.4673690795898438, "learning_rate": 0.0002, "epoch": 6.791489361702128, "step": 3990}, {"loss": 0.9784, "grad_norm": 1.2338067293167114, "learning_rate": 0.0002, "epoch": 6.808510638297872, "step": 4000}, {"loss": 1.0975, "grad_norm": 1.0775316953659058, "learning_rate": 0.0002, "epoch": 6.825531914893617, "step": 4010}, {"loss": 1.0204, "grad_norm": 1.2518454790115356, "learning_rate": 0.0002, "epoch": 6.842553191489362, "step": 4020}, {"loss": 1.1425, "grad_norm": 1.3534432649612427, "learning_rate": 0.0002, "epoch": 6.859574468085106, "step": 4030}, {"loss": 1.1212, "grad_norm": 1.1217902898788452, "learning_rate": 0.0002, "epoch": 6.876595744680851, "step": 4040}, {"loss": 1.0823, "grad_norm": 1.2672910690307617, "learning_rate": 0.0002, "epoch": 6.8936170212765955, "step": 4050}, {"loss": 1.0817, "grad_norm": 1.3807674646377563, "learning_rate": 0.0002, "epoch": 6.910638297872341, "step": 4060}, {"loss": 1.0576, "grad_norm": 1.064530849456787, "learning_rate": 0.0002, "epoch": 6.927659574468085, "step": 4070}, {"loss": 1.0718, "grad_norm": 1.1286897659301758, "learning_rate": 0.0002, "epoch": 6.94468085106383, "step": 4080}, {"loss": 1.0574, "grad_norm": 1.3736463785171509, "learning_rate": 0.0002, "epoch": 6.961702127659574, "step": 4090}, {"loss": 1.0621, "grad_norm": 1.3167431354522705, "learning_rate": 0.0002, "epoch": 6.9787234042553195, "step": 4100}, {"loss": 0.9754, "grad_norm": 1.2784067392349243, "learning_rate": 0.0002, "epoch": 6.995744680851064, "step": 4110}]} +{"epoch": 7.993191489361702, "step": 4696, "epoch_duration": 2033.8039877414703, "total_accumulated_duration": 16327.17719578743, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-10000/checkpoint-1175", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5996, "grad_norm": 0.7596228122711182, "learning_rate": 0.0002, "epoch": 0.01702127659574468, "step": 10}, {"loss": 2.2941, "grad_norm": 0.4860903322696686, "learning_rate": 0.0002, "epoch": 0.03404255319148936, "step": 20}, {"loss": 2.0018, "grad_norm": 0.4953401982784271, "learning_rate": 0.0002, "epoch": 0.05106382978723404, "step": 30}, {"loss": 1.9318, "grad_norm": 0.5086901783943176, "learning_rate": 0.0002, "epoch": 0.06808510638297872, "step": 40}, {"loss": 1.8519, "grad_norm": 0.49050021171569824, "learning_rate": 0.0002, "epoch": 0.0851063829787234, "step": 50}, {"loss": 1.8786, "grad_norm": 0.4922358989715576, "learning_rate": 0.0002, "epoch": 0.10212765957446808, "step": 60}, {"loss": 1.8812, "grad_norm": 0.4621541202068329, "learning_rate": 0.0002, "epoch": 0.11914893617021277, "step": 70}, {"loss": 1.8004, "grad_norm": 0.4416729807853699, "learning_rate": 0.0002, "epoch": 0.13617021276595745, "step": 80}, {"loss": 1.9298, "grad_norm": 0.526258111000061, "learning_rate": 0.0002, "epoch": 0.15319148936170213, "step": 90}, {"loss": 1.8339, "grad_norm": 0.44022637605667114, "learning_rate": 0.0002, "epoch": 0.1702127659574468, "step": 100}, {"loss": 1.7681, "grad_norm": 0.4647711515426636, "learning_rate": 0.0002, "epoch": 0.18723404255319148, "step": 110}, {"loss": 1.6953, "grad_norm": 0.4136318564414978, "learning_rate": 0.0002, "epoch": 0.20425531914893616, "step": 120}, {"loss": 1.8491, "grad_norm": 0.39707672595977783, "learning_rate": 0.0002, "epoch": 0.22127659574468084, "step": 130}, {"loss": 1.8017, "grad_norm": 0.4478105306625366, "learning_rate": 0.0002, "epoch": 0.23829787234042554, "step": 140}, {"loss": 1.8067, "grad_norm": 0.4699741303920746, "learning_rate": 0.0002, "epoch": 0.2553191489361702, "step": 150}, {"loss": 1.8161, "grad_norm": 0.4568363130092621, "learning_rate": 0.0002, "epoch": 0.2723404255319149, "step": 160}, {"loss": 1.7491, "grad_norm": 0.45078757405281067, "learning_rate": 0.0002, "epoch": 0.28936170212765955, "step": 170}, {"loss": 1.8115, "grad_norm": 0.4127245843410492, "learning_rate": 0.0002, "epoch": 0.30638297872340425, "step": 180}, {"loss": 1.7707, "grad_norm": 0.4042493402957916, "learning_rate": 0.0002, "epoch": 0.32340425531914896, "step": 190}, {"loss": 1.7807, "grad_norm": 0.401487797498703, "learning_rate": 0.0002, "epoch": 0.3404255319148936, "step": 200}, {"loss": 1.8355, "grad_norm": 0.3959457576274872, "learning_rate": 0.0002, "epoch": 0.3574468085106383, "step": 210}, {"loss": 1.7798, "grad_norm": 0.39865636825561523, "learning_rate": 0.0002, "epoch": 0.37446808510638296, "step": 220}, {"loss": 1.9012, "grad_norm": 0.7225169539451599, "learning_rate": 0.0002, "epoch": 0.39148936170212767, "step": 230}, {"loss": 1.7212, "grad_norm": 0.412801593542099, "learning_rate": 0.0002, "epoch": 0.4085106382978723, "step": 240}, {"loss": 1.8523, "grad_norm": 0.40951448678970337, "learning_rate": 0.0002, "epoch": 0.425531914893617, "step": 250}, {"loss": 1.7283, "grad_norm": 0.42788130044937134, "learning_rate": 0.0002, "epoch": 0.4425531914893617, "step": 260}, {"loss": 1.8051, "grad_norm": 0.41069576144218445, "learning_rate": 0.0002, "epoch": 0.4595744680851064, "step": 270}, {"loss": 1.7861, "grad_norm": 0.3745323717594147, "learning_rate": 0.0002, "epoch": 0.4765957446808511, "step": 280}, {"loss": 1.8484, "grad_norm": 0.3771323263645172, "learning_rate": 0.0002, "epoch": 0.49361702127659574, "step": 290}, {"loss": 1.7417, "grad_norm": 0.34368929266929626, "learning_rate": 0.0002, "epoch": 0.5106382978723404, "step": 300}, {"loss": 1.7394, "grad_norm": 0.4299296736717224, "learning_rate": 0.0002, "epoch": 0.5276595744680851, "step": 310}, {"loss": 1.8255, "grad_norm": 0.4133922755718231, "learning_rate": 0.0002, "epoch": 0.5446808510638298, "step": 320}, {"loss": 1.7043, "grad_norm": 0.3984859585762024, "learning_rate": 0.0002, "epoch": 0.5617021276595745, "step": 330}, {"loss": 1.7873, "grad_norm": 0.3822788894176483, "learning_rate": 0.0002, "epoch": 0.5787234042553191, "step": 340}, {"loss": 1.8082, "grad_norm": 0.4550061821937561, "learning_rate": 0.0002, "epoch": 0.5957446808510638, "step": 350}, {"loss": 1.7179, "grad_norm": 0.36571192741394043, "learning_rate": 0.0002, "epoch": 0.6127659574468085, "step": 360}, {"loss": 1.8196, "grad_norm": 0.32942914962768555, "learning_rate": 0.0002, "epoch": 0.6297872340425532, "step": 370}, {"loss": 1.7118, "grad_norm": 0.39299526810646057, "learning_rate": 0.0002, "epoch": 0.6468085106382979, "step": 380}, {"loss": 1.8179, "grad_norm": 0.3817657232284546, "learning_rate": 0.0002, "epoch": 0.6638297872340425, "step": 390}, {"loss": 1.8174, "grad_norm": 0.3650810122489929, "learning_rate": 0.0002, "epoch": 0.6808510638297872, "step": 400}, {"loss": 1.8026, "grad_norm": 0.3736686408519745, "learning_rate": 0.0002, "epoch": 0.6978723404255319, "step": 410}, {"loss": 1.802, "grad_norm": 0.45680564641952515, "learning_rate": 0.0002, "epoch": 0.7148936170212766, "step": 420}, {"loss": 1.7844, "grad_norm": 0.4154510200023651, "learning_rate": 0.0002, "epoch": 0.7319148936170212, "step": 430}, {"loss": 1.7801, "grad_norm": 0.3701167106628418, "learning_rate": 0.0002, "epoch": 0.7489361702127659, "step": 440}, {"loss": 1.7689, "grad_norm": 0.3869531750679016, "learning_rate": 0.0002, "epoch": 0.7659574468085106, "step": 450}, {"loss": 1.7671, "grad_norm": 0.4391495883464813, "learning_rate": 0.0002, "epoch": 0.7829787234042553, "step": 460}, {"loss": 1.7534, "grad_norm": 0.39652755856513977, "learning_rate": 0.0002, "epoch": 0.8, "step": 470}, {"loss": 1.8076, "grad_norm": 0.4096752107143402, "learning_rate": 0.0002, "epoch": 0.8170212765957446, "step": 480}, {"loss": 1.796, "grad_norm": 0.3857504427433014, "learning_rate": 0.0002, "epoch": 0.8340425531914893, "step": 490}, {"loss": 1.7379, "grad_norm": 0.4105374217033386, "learning_rate": 0.0002, "epoch": 0.851063829787234, "step": 500}, {"loss": 1.6391, "grad_norm": 0.3723328113555908, "learning_rate": 0.0002, "epoch": 0.8680851063829788, "step": 510}, {"loss": 1.7122, "grad_norm": 0.36099690198898315, "learning_rate": 0.0002, "epoch": 0.8851063829787233, "step": 520}, {"loss": 1.7632, "grad_norm": 0.3715187907218933, "learning_rate": 0.0002, "epoch": 0.902127659574468, "step": 530}, {"loss": 1.7004, "grad_norm": 0.4932813048362732, "learning_rate": 0.0002, "epoch": 0.9191489361702128, "step": 540}, {"loss": 1.679, "grad_norm": 0.3493495285511017, "learning_rate": 0.0002, "epoch": 0.9361702127659575, "step": 550}, {"loss": 1.7758, "grad_norm": 0.3598061800003052, "learning_rate": 0.0002, "epoch": 0.9531914893617022, "step": 560}, {"loss": 1.7686, "grad_norm": 0.3521560728549957, "learning_rate": 0.0002, "epoch": 0.9702127659574468, "step": 570}, {"loss": 1.7391, "grad_norm": 0.34150034189224243, "learning_rate": 0.0002, "epoch": 0.9872340425531915, "step": 580}, {"eval_loss": 1.8388911485671997, "eval_runtime": 106.6788, "eval_samples_per_second": 4.828, "eval_steps_per_second": 0.609, "epoch": 0.9991489361702127, "step": 587}, {"loss": 1.6115, "grad_norm": 0.33556103706359863, "learning_rate": 0.0002, "epoch": 1.004255319148936, "step": 590}, {"loss": 1.8346, "grad_norm": 0.4333398640155792, "learning_rate": 0.0002, "epoch": 1.0212765957446808, "step": 600}, {"loss": 1.6656, "grad_norm": 0.38488736748695374, "learning_rate": 0.0002, "epoch": 1.0382978723404255, "step": 610}, {"loss": 1.7778, "grad_norm": 0.44454529881477356, "learning_rate": 0.0002, "epoch": 1.0553191489361702, "step": 620}, {"loss": 1.7143, "grad_norm": 0.3735603392124176, "learning_rate": 0.0002, "epoch": 1.0723404255319149, "step": 630}, {"loss": 1.74, "grad_norm": 0.38912704586982727, "learning_rate": 0.0002, "epoch": 1.0893617021276596, "step": 640}, {"loss": 1.607, "grad_norm": 0.4411826431751251, "learning_rate": 0.0002, "epoch": 1.1063829787234043, "step": 650}, {"loss": 1.6901, "grad_norm": 0.4163050353527069, "learning_rate": 0.0002, "epoch": 1.123404255319149, "step": 660}, {"loss": 1.6722, "grad_norm": 0.4187192916870117, "learning_rate": 0.0002, "epoch": 1.1404255319148937, "step": 670}, {"loss": 1.7061, "grad_norm": 0.3797093629837036, "learning_rate": 0.0002, "epoch": 1.1574468085106382, "step": 680}, {"loss": 1.6409, "grad_norm": 0.4210026264190674, "learning_rate": 0.0002, "epoch": 1.174468085106383, "step": 690}, {"loss": 1.6343, "grad_norm": 0.4701998829841614, "learning_rate": 0.0002, "epoch": 1.1914893617021276, "step": 700}, {"loss": 1.6773, "grad_norm": 0.6331578493118286, "learning_rate": 0.0002, "epoch": 1.2085106382978723, "step": 710}, {"loss": 1.6776, "grad_norm": 0.41908255219459534, "learning_rate": 0.0002, "epoch": 1.225531914893617, "step": 720}, {"loss": 1.6468, "grad_norm": 0.36158403754234314, "learning_rate": 0.0002, "epoch": 1.2425531914893617, "step": 730}, {"loss": 1.746, "grad_norm": 0.387300580739975, "learning_rate": 0.0002, "epoch": 1.2595744680851064, "step": 740}, {"loss": 1.7089, "grad_norm": 0.38899728655815125, "learning_rate": 0.0002, "epoch": 1.2765957446808511, "step": 750}, {"loss": 1.6376, "grad_norm": 0.4549255073070526, "learning_rate": 0.0002, "epoch": 1.2936170212765958, "step": 760}, {"loss": 1.6754, "grad_norm": 0.4052349328994751, "learning_rate": 0.0002, "epoch": 1.3106382978723405, "step": 770}, {"loss": 1.6483, "grad_norm": 0.38934215903282166, "learning_rate": 0.0002, "epoch": 1.327659574468085, "step": 780}, {"loss": 1.7813, "grad_norm": 0.38688382506370544, "learning_rate": 0.0002, "epoch": 1.3446808510638297, "step": 790}, {"loss": 1.6374, "grad_norm": 0.3825705051422119, "learning_rate": 0.0002, "epoch": 1.3617021276595744, "step": 800}, {"loss": 1.655, "grad_norm": 0.37331756949424744, "learning_rate": 0.0002, "epoch": 1.3787234042553191, "step": 810}, {"loss": 1.6321, "grad_norm": 0.38826408982276917, "learning_rate": 0.0002, "epoch": 1.3957446808510638, "step": 820}, {"loss": 1.6107, "grad_norm": 0.4213569164276123, "learning_rate": 0.0002, "epoch": 1.4127659574468086, "step": 830}, {"loss": 1.6792, "grad_norm": 0.3976684808731079, "learning_rate": 0.0002, "epoch": 1.4297872340425533, "step": 840}, {"loss": 1.7029, "grad_norm": 0.390009343624115, "learning_rate": 0.0002, "epoch": 1.4468085106382977, "step": 850}, {"loss": 1.7489, "grad_norm": 0.4462052583694458, "learning_rate": 0.0002, "epoch": 1.4638297872340424, "step": 860}, {"loss": 1.7513, "grad_norm": 0.42129236459732056, "learning_rate": 0.0002, "epoch": 1.4808510638297872, "step": 870}, {"loss": 1.6009, "grad_norm": 0.41489893198013306, "learning_rate": 0.0002, "epoch": 1.4978723404255319, "step": 880}, {"loss": 1.7129, "grad_norm": 0.41451677680015564, "learning_rate": 0.0002, "epoch": 1.5148936170212766, "step": 890}, {"loss": 1.6427, "grad_norm": 0.4477299749851227, "learning_rate": 0.0002, "epoch": 1.5319148936170213, "step": 900}, {"loss": 1.6543, "grad_norm": 0.38476648926734924, "learning_rate": 0.0002, "epoch": 1.548936170212766, "step": 910}, {"loss": 1.7103, "grad_norm": 0.42755743861198425, "learning_rate": 0.0002, "epoch": 1.5659574468085107, "step": 920}, {"loss": 1.7014, "grad_norm": 0.39372023940086365, "learning_rate": 0.0002, "epoch": 1.5829787234042554, "step": 930}, {"loss": 1.7042, "grad_norm": 0.42778754234313965, "learning_rate": 0.0002, "epoch": 1.6, "step": 940}, {"loss": 1.7488, "grad_norm": 0.4217268228530884, "learning_rate": 0.0002, "epoch": 1.6170212765957448, "step": 950}, {"loss": 1.6556, "grad_norm": 0.40452107787132263, "learning_rate": 0.0002, "epoch": 1.6340425531914895, "step": 960}, {"loss": 1.6734, "grad_norm": 0.4259980022907257, "learning_rate": 0.0002, "epoch": 1.6510638297872342, "step": 970}, {"loss": 1.6464, "grad_norm": 0.4089849591255188, "learning_rate": 0.0002, "epoch": 1.6680851063829787, "step": 980}, {"loss": 1.6588, "grad_norm": 0.38276049494743347, "learning_rate": 0.0002, "epoch": 1.6851063829787234, "step": 990}, {"loss": 1.6846, "grad_norm": 0.40361565351486206, "learning_rate": 0.0002, "epoch": 1.702127659574468, "step": 1000}, {"loss": 1.698, "grad_norm": 0.3537807762622833, "learning_rate": 0.0002, "epoch": 1.7191489361702128, "step": 1010}, {"loss": 1.6001, "grad_norm": 0.40288347005844116, "learning_rate": 0.0002, "epoch": 1.7361702127659573, "step": 1020}, {"loss": 1.6375, "grad_norm": 0.4003616273403168, "learning_rate": 0.0002, "epoch": 1.753191489361702, "step": 1030}, {"loss": 1.6067, "grad_norm": 0.3931669592857361, "learning_rate": 0.0002, "epoch": 1.7702127659574467, "step": 1040}, {"loss": 1.6814, "grad_norm": 0.4001635015010834, "learning_rate": 0.0002, "epoch": 1.7872340425531914, "step": 1050}, {"loss": 1.6158, "grad_norm": 0.4139048457145691, "learning_rate": 0.0002, "epoch": 1.804255319148936, "step": 1060}, {"loss": 1.7513, "grad_norm": 0.5044458508491516, "learning_rate": 0.0002, "epoch": 1.8212765957446808, "step": 1070}, {"loss": 1.7317, "grad_norm": 0.4827095568180084, "learning_rate": 0.0002, "epoch": 1.8382978723404255, "step": 1080}, {"loss": 1.6586, "grad_norm": 0.3750515282154083, "learning_rate": 0.0002, "epoch": 1.8553191489361702, "step": 1090}, {"loss": 1.653, "grad_norm": 0.4024597704410553, "learning_rate": 0.0002, "epoch": 1.872340425531915, "step": 1100}, {"loss": 1.7138, "grad_norm": 0.36747241020202637, "learning_rate": 0.0002, "epoch": 1.8893617021276596, "step": 1110}, {"loss": 1.652, "grad_norm": 0.41397711634635925, "learning_rate": 0.0002, "epoch": 1.9063829787234043, "step": 1120}, {"loss": 1.7071, "grad_norm": 0.3960763216018677, "learning_rate": 0.0002, "epoch": 1.923404255319149, "step": 1130}, {"loss": 1.6857, "grad_norm": 0.4533233344554901, "learning_rate": 0.0002, "epoch": 1.9404255319148938, "step": 1140}, {"loss": 1.7168, "grad_norm": 0.38433438539505005, "learning_rate": 0.0002, "epoch": 1.9574468085106385, "step": 1150}, {"loss": 1.7444, "grad_norm": 0.3648812174797058, "learning_rate": 0.0002, "epoch": 1.974468085106383, "step": 1160}, {"loss": 1.6521, "grad_norm": 0.3887176215648651, "learning_rate": 0.0002, "epoch": 1.9914893617021276, "step": 1170}, {"eval_loss": 1.8328146934509277, "eval_runtime": 107.2842, "eval_samples_per_second": 4.8, "eval_steps_per_second": 0.606, "epoch": 2.0, "step": 1175}, {"loss": 1.6184, "grad_norm": 0.40444880723953247, "learning_rate": 0.0002, "epoch": 2.008510638297872, "step": 1180}, {"loss": 1.5221, "grad_norm": 0.3997816741466522, "learning_rate": 0.0002, "epoch": 2.025531914893617, "step": 1190}, {"loss": 1.6022, "grad_norm": 0.4516718089580536, "learning_rate": 0.0002, "epoch": 2.0425531914893615, "step": 1200}, {"loss": 1.5458, "grad_norm": 0.6645553708076477, "learning_rate": 0.0002, "epoch": 2.0595744680851062, "step": 1210}, {"loss": 1.5985, "grad_norm": 0.4181990921497345, "learning_rate": 0.0002, "epoch": 2.076595744680851, "step": 1220}, {"loss": 1.5768, "grad_norm": 0.45681431889533997, "learning_rate": 0.0002, "epoch": 2.0936170212765957, "step": 1230}, {"loss": 1.5598, "grad_norm": 0.48914700746536255, "learning_rate": 0.0002, "epoch": 2.1106382978723404, "step": 1240}, {"loss": 1.6031, "grad_norm": 0.43265485763549805, "learning_rate": 0.0002, "epoch": 2.127659574468085, "step": 1250}, {"loss": 1.5812, "grad_norm": 0.4641207754611969, "learning_rate": 0.0002, "epoch": 2.1446808510638298, "step": 1260}, {"loss": 1.5686, "grad_norm": 0.4840783476829529, "learning_rate": 0.0002, "epoch": 2.1617021276595745, "step": 1270}, {"loss": 1.5969, "grad_norm": 0.4974595308303833, "learning_rate": 0.0002, "epoch": 2.178723404255319, "step": 1280}, {"loss": 1.5512, "grad_norm": 0.5133475661277771, "learning_rate": 0.0002, "epoch": 2.195744680851064, "step": 1290}, {"loss": 1.6467, "grad_norm": 0.5030052065849304, "learning_rate": 0.0002, "epoch": 2.2127659574468086, "step": 1300}, {"loss": 1.6845, "grad_norm": 0.46602481603622437, "learning_rate": 0.0002, "epoch": 2.2297872340425533, "step": 1310}, {"loss": 1.6132, "grad_norm": 0.43662378191947937, "learning_rate": 0.0002, "epoch": 2.246808510638298, "step": 1320}, {"loss": 1.6223, "grad_norm": 0.5137454867362976, "learning_rate": 0.0002, "epoch": 2.2638297872340427, "step": 1330}, {"loss": 1.5702, "grad_norm": 0.4750335216522217, "learning_rate": 0.0002, "epoch": 2.2808510638297874, "step": 1340}, {"loss": 1.6479, "grad_norm": 0.43691426515579224, "learning_rate": 0.0002, "epoch": 2.297872340425532, "step": 1350}, {"loss": 1.5561, "grad_norm": 0.49752047657966614, "learning_rate": 0.0002, "epoch": 2.3148936170212764, "step": 1360}, {"loss": 1.5789, "grad_norm": 0.45101815462112427, "learning_rate": 0.0002, "epoch": 2.331914893617021, "step": 1370}, {"loss": 1.5858, "grad_norm": 0.4427817761898041, "learning_rate": 0.0002, "epoch": 2.348936170212766, "step": 1380}, {"loss": 1.5537, "grad_norm": 0.4802311062812805, "learning_rate": 0.0002, "epoch": 2.3659574468085105, "step": 1390}, {"loss": 1.5846, "grad_norm": 0.4512513279914856, "learning_rate": 0.0002, "epoch": 2.382978723404255, "step": 1400}, {"loss": 1.627, "grad_norm": 0.4878857135772705, "learning_rate": 0.0002, "epoch": 2.4, "step": 1410}, {"loss": 1.5781, "grad_norm": 0.4741315543651581, "learning_rate": 0.0002, "epoch": 2.4170212765957446, "step": 1420}, {"loss": 1.5595, "grad_norm": 0.4770931601524353, "learning_rate": 0.0002, "epoch": 2.4340425531914893, "step": 1430}, {"loss": 1.5336, "grad_norm": 0.5124667286872864, "learning_rate": 0.0002, "epoch": 2.451063829787234, "step": 1440}, {"loss": 1.5811, "grad_norm": 0.45264801383018494, "learning_rate": 0.0002, "epoch": 2.4680851063829787, "step": 1450}, {"loss": 1.604, "grad_norm": 0.5456924438476562, "learning_rate": 0.0002, "epoch": 2.4851063829787234, "step": 1460}, {"loss": 1.6063, "grad_norm": 0.44656285643577576, "learning_rate": 0.0002, "epoch": 2.502127659574468, "step": 1470}, {"loss": 1.6336, "grad_norm": 0.5939419865608215, "learning_rate": 0.0002, "epoch": 2.519148936170213, "step": 1480}, {"loss": 1.5481, "grad_norm": 0.47853362560272217, "learning_rate": 0.0002, "epoch": 2.5361702127659576, "step": 1490}, {"loss": 1.6543, "grad_norm": 0.47643396258354187, "learning_rate": 0.0002, "epoch": 2.5531914893617023, "step": 1500}, {"loss": 1.5085, "grad_norm": 0.4939501881599426, "learning_rate": 0.0002, "epoch": 2.570212765957447, "step": 1510}, {"loss": 1.5659, "grad_norm": 0.502055287361145, "learning_rate": 0.0002, "epoch": 2.5872340425531917, "step": 1520}, {"loss": 1.6519, "grad_norm": 0.463250994682312, "learning_rate": 0.0002, "epoch": 2.604255319148936, "step": 1530}, {"loss": 1.5698, "grad_norm": 0.4761098623275757, "learning_rate": 0.0002, "epoch": 2.621276595744681, "step": 1540}, {"loss": 1.6044, "grad_norm": 0.4687299132347107, "learning_rate": 0.0002, "epoch": 2.6382978723404253, "step": 1550}, {"loss": 1.632, "grad_norm": 0.5536078810691833, "learning_rate": 0.0002, "epoch": 2.65531914893617, "step": 1560}, {"loss": 1.586, "grad_norm": 0.581320583820343, "learning_rate": 0.0002, "epoch": 2.6723404255319148, "step": 1570}, {"loss": 1.508, "grad_norm": 0.45952868461608887, "learning_rate": 0.0002, "epoch": 2.6893617021276595, "step": 1580}, {"loss": 1.5905, "grad_norm": 0.4602586328983307, "learning_rate": 0.0002, "epoch": 2.706382978723404, "step": 1590}, {"loss": 1.6008, "grad_norm": 0.5276554226875305, "learning_rate": 0.0002, "epoch": 2.723404255319149, "step": 1600}, {"loss": 1.7216, "grad_norm": 0.5750249624252319, "learning_rate": 0.0002, "epoch": 2.7404255319148936, "step": 1610}, {"loss": 1.5506, "grad_norm": 0.468723863363266, "learning_rate": 0.0002, "epoch": 2.7574468085106383, "step": 1620}, {"loss": 1.4499, "grad_norm": 0.44649943709373474, "learning_rate": 0.0002, "epoch": 2.774468085106383, "step": 1630}, {"loss": 1.5106, "grad_norm": 0.5097237825393677, "learning_rate": 0.0002, "epoch": 2.7914893617021277, "step": 1640}, {"loss": 1.5948, "grad_norm": 0.46384191513061523, "learning_rate": 0.0002, "epoch": 2.8085106382978724, "step": 1650}, {"loss": 1.5828, "grad_norm": 0.4885474443435669, "learning_rate": 0.0002, "epoch": 2.825531914893617, "step": 1660}, {"loss": 1.5675, "grad_norm": 0.45621681213378906, "learning_rate": 0.0002, "epoch": 2.842553191489362, "step": 1670}, {"loss": 1.6042, "grad_norm": 0.4797150194644928, "learning_rate": 0.0002, "epoch": 2.8595744680851065, "step": 1680}, {"loss": 1.5601, "grad_norm": 0.5142032504081726, "learning_rate": 0.0002, "epoch": 2.876595744680851, "step": 1690}, {"loss": 1.5984, "grad_norm": 0.48939862847328186, "learning_rate": 0.0002, "epoch": 2.8936170212765955, "step": 1700}, {"loss": 1.6333, "grad_norm": 0.4575578272342682, "learning_rate": 0.0002, "epoch": 2.9106382978723406, "step": 1710}, {"loss": 1.5396, "grad_norm": 0.5589063763618469, "learning_rate": 0.0002, "epoch": 2.927659574468085, "step": 1720}, {"loss": 1.6096, "grad_norm": 0.48508813977241516, "learning_rate": 0.0002, "epoch": 2.94468085106383, "step": 1730}, {"loss": 1.5686, "grad_norm": 0.42786726355552673, "learning_rate": 0.0002, "epoch": 2.9617021276595743, "step": 1740}, {"loss": 1.5555, "grad_norm": 0.5598229765892029, "learning_rate": 0.0002, "epoch": 2.978723404255319, "step": 1750}, {"loss": 1.5035, "grad_norm": 0.4779253602027893, "learning_rate": 0.0002, "epoch": 2.9957446808510637, "step": 1760}, {"eval_loss": 1.8543579578399658, "eval_runtime": 107.2363, "eval_samples_per_second": 4.802, "eval_steps_per_second": 0.606, "epoch": 2.999148936170213, "step": 1762}, {"loss": 1.4767, "grad_norm": 0.48810940980911255, "learning_rate": 0.0002, "epoch": 3.0127659574468084, "step": 1770}, {"loss": 1.5385, "grad_norm": 0.6194920539855957, "learning_rate": 0.0002, "epoch": 3.029787234042553, "step": 1780}, {"loss": 1.4012, "grad_norm": 0.5875462293624878, "learning_rate": 0.0002, "epoch": 3.046808510638298, "step": 1790}, {"loss": 1.4727, "grad_norm": 0.5775138139724731, "learning_rate": 0.0002, "epoch": 3.0638297872340425, "step": 1800}, {"loss": 1.493, "grad_norm": 0.5445981621742249, "learning_rate": 0.0002, "epoch": 3.0808510638297872, "step": 1810}, {"loss": 1.4247, "grad_norm": 0.6728862524032593, "learning_rate": 0.0002, "epoch": 3.097872340425532, "step": 1820}, {"loss": 1.4303, "grad_norm": 0.6105490326881409, "learning_rate": 0.0002, "epoch": 3.1148936170212767, "step": 1830}, {"loss": 1.5214, "grad_norm": 0.5771165490150452, "learning_rate": 0.0002, "epoch": 3.1319148936170214, "step": 1840}, {"loss": 1.4359, "grad_norm": 0.5778449773788452, "learning_rate": 0.0002, "epoch": 3.148936170212766, "step": 1850}, {"loss": 1.4121, "grad_norm": 0.7141990661621094, "learning_rate": 0.0002, "epoch": 3.1659574468085108, "step": 1860}, {"loss": 1.4904, "grad_norm": 0.5882705450057983, "learning_rate": 0.0002, "epoch": 3.1829787234042555, "step": 1870}, {"loss": 1.4941, "grad_norm": 0.5996195077896118, "learning_rate": 0.0002, "epoch": 3.2, "step": 1880}, {"loss": 1.4519, "grad_norm": 0.6121219396591187, "learning_rate": 0.0002, "epoch": 3.217021276595745, "step": 1890}, {"loss": 1.4586, "grad_norm": 0.6402981281280518, "learning_rate": 0.0002, "epoch": 3.2340425531914896, "step": 1900}, {"loss": 1.3766, "grad_norm": 0.6111783981323242, "learning_rate": 0.0002, "epoch": 3.251063829787234, "step": 1910}, {"loss": 1.4863, "grad_norm": 0.6682435274124146, "learning_rate": 0.0002, "epoch": 3.2680851063829786, "step": 1920}, {"loss": 1.4608, "grad_norm": 0.6530760526657104, "learning_rate": 0.0002, "epoch": 3.2851063829787233, "step": 1930}, {"loss": 1.4422, "grad_norm": 0.6481217741966248, "learning_rate": 0.0002, "epoch": 3.302127659574468, "step": 1940}, {"loss": 1.5158, "grad_norm": 0.6270697116851807, "learning_rate": 0.0002, "epoch": 3.3191489361702127, "step": 1950}, {"loss": 1.4116, "grad_norm": 0.5924492478370667, "learning_rate": 0.0002, "epoch": 3.3361702127659574, "step": 1960}, {"loss": 1.4578, "grad_norm": 0.5803806781768799, "learning_rate": 0.0002, "epoch": 3.353191489361702, "step": 1970}, {"loss": 1.4689, "grad_norm": 0.5754119157791138, "learning_rate": 0.0002, "epoch": 3.370212765957447, "step": 1980}, {"loss": 1.4605, "grad_norm": 0.6717178821563721, "learning_rate": 0.0002, "epoch": 3.3872340425531915, "step": 1990}, {"loss": 1.486, "grad_norm": 0.5955582857131958, "learning_rate": 0.0002, "epoch": 3.404255319148936, "step": 2000}, {"loss": 1.4445, "grad_norm": 0.6965329647064209, "learning_rate": 0.0002, "epoch": 3.421276595744681, "step": 2010}, {"loss": 1.4543, "grad_norm": 0.6321573257446289, "learning_rate": 0.0002, "epoch": 3.4382978723404256, "step": 2020}, {"loss": 1.5383, "grad_norm": 0.5952608585357666, "learning_rate": 0.0002, "epoch": 3.4553191489361703, "step": 2030}, {"loss": 1.4531, "grad_norm": 0.7718905806541443, "learning_rate": 0.0002, "epoch": 3.472340425531915, "step": 2040}, {"loss": 1.4678, "grad_norm": 0.6850892305374146, "learning_rate": 0.0002, "epoch": 3.4893617021276597, "step": 2050}, {"loss": 1.4956, "grad_norm": 0.5638895630836487, "learning_rate": 0.0002, "epoch": 3.506382978723404, "step": 2060}, {"loss": 1.4586, "grad_norm": 0.6148294806480408, "learning_rate": 0.0002, "epoch": 3.523404255319149, "step": 2070}, {"loss": 1.4622, "grad_norm": 0.5895810723304749, "learning_rate": 0.0002, "epoch": 3.5404255319148934, "step": 2080}, {"loss": 1.4341, "grad_norm": 0.6377319693565369, "learning_rate": 0.0002, "epoch": 3.5574468085106385, "step": 2090}, {"loss": 1.5056, "grad_norm": 0.6047691702842712, "learning_rate": 0.0002, "epoch": 3.574468085106383, "step": 2100}, {"loss": 1.4748, "grad_norm": 0.6049593687057495, "learning_rate": 0.0002, "epoch": 3.5914893617021275, "step": 2110}, {"loss": 1.391, "grad_norm": 0.6358312368392944, "learning_rate": 0.0002, "epoch": 3.608510638297872, "step": 2120}, {"loss": 1.4419, "grad_norm": 0.612119197845459, "learning_rate": 0.0002, "epoch": 3.625531914893617, "step": 2130}, {"loss": 1.438, "grad_norm": 0.6788054704666138, "learning_rate": 0.0002, "epoch": 3.6425531914893616, "step": 2140}, {"loss": 1.4295, "grad_norm": 0.6191043853759766, "learning_rate": 0.0002, "epoch": 3.6595744680851063, "step": 2150}, {"loss": 1.4383, "grad_norm": 0.6660051941871643, "learning_rate": 0.0002, "epoch": 3.676595744680851, "step": 2160}, {"loss": 1.4954, "grad_norm": 0.652692973613739, "learning_rate": 0.0002, "epoch": 3.6936170212765957, "step": 2170}, {"loss": 1.5245, "grad_norm": 0.6123467087745667, "learning_rate": 0.0002, "epoch": 3.7106382978723405, "step": 2180}, {"loss": 1.4686, "grad_norm": 0.640021562576294, "learning_rate": 0.0002, "epoch": 3.727659574468085, "step": 2190}, {"loss": 1.4277, "grad_norm": 0.6809179782867432, "learning_rate": 0.0002, "epoch": 3.74468085106383, "step": 2200}, {"loss": 1.4705, "grad_norm": 0.5978420376777649, "learning_rate": 0.0002, "epoch": 3.7617021276595746, "step": 2210}, {"loss": 1.5559, "grad_norm": 0.7038803100585938, "learning_rate": 0.0002, "epoch": 3.7787234042553193, "step": 2220}, {"loss": 1.4691, "grad_norm": 0.5324276089668274, "learning_rate": 0.0002, "epoch": 3.795744680851064, "step": 2230}, {"loss": 1.4696, "grad_norm": 0.6264132857322693, "learning_rate": 0.0002, "epoch": 3.8127659574468087, "step": 2240}, {"loss": 1.4856, "grad_norm": 0.6143888831138611, "learning_rate": 0.0002, "epoch": 3.829787234042553, "step": 2250}, {"loss": 1.535, "grad_norm": 0.6338503360748291, "learning_rate": 0.0002, "epoch": 3.846808510638298, "step": 2260}, {"loss": 1.456, "grad_norm": 0.556882381439209, "learning_rate": 0.0002, "epoch": 3.8638297872340424, "step": 2270}, {"loss": 1.4701, "grad_norm": 0.6323680281639099, "learning_rate": 0.0002, "epoch": 3.8808510638297875, "step": 2280}, {"loss": 1.5333, "grad_norm": 0.7105869054794312, "learning_rate": 0.0002, "epoch": 3.8978723404255318, "step": 2290}, {"loss": 1.4462, "grad_norm": 0.825415849685669, "learning_rate": 0.0002, "epoch": 3.9148936170212765, "step": 2300}, {"loss": 1.5023, "grad_norm": 0.6412091851234436, "learning_rate": 0.0002, "epoch": 3.931914893617021, "step": 2310}, {"loss": 1.3709, "grad_norm": 0.6286490559577942, "learning_rate": 0.0002, "epoch": 3.948936170212766, "step": 2320}, {"loss": 1.4693, "grad_norm": 0.636021077632904, "learning_rate": 0.0002, "epoch": 3.9659574468085106, "step": 2330}, {"loss": 1.4265, "grad_norm": 0.6032362580299377, "learning_rate": 0.0002, "epoch": 3.9829787234042553, "step": 2340}, {"loss": 1.377, "grad_norm": 0.6497282385826111, "learning_rate": 0.0002, "epoch": 4.0, "step": 2350}, {"eval_loss": 1.9081238508224487, "eval_runtime": 106.6404, "eval_samples_per_second": 4.829, "eval_steps_per_second": 0.61, "epoch": 4.0, "step": 2350}, {"loss": 1.317, "grad_norm": 0.6278848648071289, "learning_rate": 0.0002, "epoch": 4.017021276595744, "step": 2360}, {"loss": 1.3229, "grad_norm": 0.8259812593460083, "learning_rate": 0.0002, "epoch": 4.034042553191489, "step": 2370}, {"loss": 1.2776, "grad_norm": 0.7269589304924011, "learning_rate": 0.0002, "epoch": 4.051063829787234, "step": 2380}, {"loss": 1.3668, "grad_norm": 0.7460662126541138, "learning_rate": 0.0002, "epoch": 4.068085106382979, "step": 2390}, {"loss": 1.3096, "grad_norm": 1.2362046241760254, "learning_rate": 0.0002, "epoch": 4.085106382978723, "step": 2400}, {"loss": 1.2906, "grad_norm": 0.7699568867683411, "learning_rate": 0.0002, "epoch": 4.102127659574468, "step": 2410}, {"loss": 1.3005, "grad_norm": 0.8732489347457886, "learning_rate": 0.0002, "epoch": 4.1191489361702125, "step": 2420}, {"loss": 1.2741, "grad_norm": 0.8331889510154724, "learning_rate": 0.0002, "epoch": 4.136170212765958, "step": 2430}, {"loss": 1.1861, "grad_norm": 0.6686427593231201, "learning_rate": 0.0002, "epoch": 4.153191489361702, "step": 2440}, {"loss": 1.316, "grad_norm": 0.906380832195282, "learning_rate": 0.0002, "epoch": 4.170212765957447, "step": 2450}, {"loss": 1.3134, "grad_norm": 0.7269753813743591, "learning_rate": 0.0002, "epoch": 4.187234042553191, "step": 2460}, {"loss": 1.299, "grad_norm": 0.8556067943572998, "learning_rate": 0.0002, "epoch": 4.2042553191489365, "step": 2470}, {"loss": 1.2935, "grad_norm": 0.7076917886734009, "learning_rate": 0.0002, "epoch": 4.221276595744681, "step": 2480}, {"loss": 1.2608, "grad_norm": 0.7596837282180786, "learning_rate": 0.0002, "epoch": 4.238297872340426, "step": 2490}, {"loss": 1.2747, "grad_norm": 0.7790552377700806, "learning_rate": 0.0002, "epoch": 4.25531914893617, "step": 2500}, {"loss": 1.3438, "grad_norm": 0.8205534219741821, "learning_rate": 0.0002, "epoch": 4.272340425531915, "step": 2510}, {"loss": 1.3058, "grad_norm": 0.7892114520072937, "learning_rate": 0.0002, "epoch": 4.2893617021276595, "step": 2520}, {"loss": 1.3662, "grad_norm": 0.8907270431518555, "learning_rate": 0.0002, "epoch": 4.306382978723404, "step": 2530}, {"loss": 1.3168, "grad_norm": 0.821794331073761, "learning_rate": 0.0002, "epoch": 4.323404255319149, "step": 2540}, {"loss": 1.2467, "grad_norm": 0.7305247783660889, "learning_rate": 0.0002, "epoch": 4.340425531914893, "step": 2550}, {"loss": 1.3446, "grad_norm": 0.8639982342720032, "learning_rate": 0.0002, "epoch": 4.357446808510638, "step": 2560}, {"loss": 1.3863, "grad_norm": 0.8883494138717651, "learning_rate": 0.0002, "epoch": 4.374468085106383, "step": 2570}, {"loss": 1.3693, "grad_norm": 0.7611730098724365, "learning_rate": 0.0002, "epoch": 4.391489361702128, "step": 2580}, {"loss": 1.2814, "grad_norm": 0.7793022394180298, "learning_rate": 0.0002, "epoch": 4.408510638297872, "step": 2590}, {"loss": 1.3014, "grad_norm": 0.979060173034668, "learning_rate": 0.0002, "epoch": 4.425531914893617, "step": 2600}, {"loss": 1.3625, "grad_norm": 0.8320847749710083, "learning_rate": 0.0002, "epoch": 4.4425531914893615, "step": 2610}, {"loss": 1.3362, "grad_norm": 0.7481992244720459, "learning_rate": 0.0002, "epoch": 4.459574468085107, "step": 2620}, {"loss": 1.4037, "grad_norm": 0.783770740032196, "learning_rate": 0.0002, "epoch": 4.476595744680851, "step": 2630}, {"loss": 1.3049, "grad_norm": 0.773295521736145, "learning_rate": 0.0002, "epoch": 4.493617021276596, "step": 2640}, {"loss": 1.2739, "grad_norm": 0.9206840991973877, "learning_rate": 0.0002, "epoch": 4.51063829787234, "step": 2650}, {"loss": 1.3248, "grad_norm": 0.8803266882896423, "learning_rate": 0.0002, "epoch": 4.527659574468085, "step": 2660}, {"loss": 1.2898, "grad_norm": 0.9315535426139832, "learning_rate": 0.0002, "epoch": 4.54468085106383, "step": 2670}, {"loss": 1.316, "grad_norm": 0.8610678315162659, "learning_rate": 0.0002, "epoch": 4.561702127659575, "step": 2680}, {"loss": 1.2633, "grad_norm": 0.7405551671981812, "learning_rate": 0.0002, "epoch": 4.578723404255319, "step": 2690}, {"loss": 1.3136, "grad_norm": 1.0238394737243652, "learning_rate": 0.0002, "epoch": 4.595744680851064, "step": 2700}, {"loss": 1.4847, "grad_norm": 0.7814345955848694, "learning_rate": 0.0002, "epoch": 4.6127659574468085, "step": 2710}, {"loss": 1.295, "grad_norm": 0.8436329364776611, "learning_rate": 0.0002, "epoch": 4.629787234042553, "step": 2720}, {"loss": 1.3525, "grad_norm": 0.727214515209198, "learning_rate": 0.0002, "epoch": 4.646808510638298, "step": 2730}, {"loss": 1.3878, "grad_norm": 0.8465878367424011, "learning_rate": 0.0002, "epoch": 4.663829787234042, "step": 2740}, {"loss": 1.278, "grad_norm": 0.8218137621879578, "learning_rate": 0.0002, "epoch": 4.680851063829787, "step": 2750}, {"loss": 1.3628, "grad_norm": 0.7900442481040955, "learning_rate": 0.0002, "epoch": 4.697872340425532, "step": 2760}, {"loss": 1.3494, "grad_norm": 0.8214074969291687, "learning_rate": 0.0002, "epoch": 4.714893617021277, "step": 2770}, {"loss": 1.3954, "grad_norm": 0.7509574890136719, "learning_rate": 0.0002, "epoch": 4.731914893617021, "step": 2780}, {"loss": 1.3693, "grad_norm": 0.7416139245033264, "learning_rate": 0.0002, "epoch": 4.748936170212766, "step": 2790}, {"loss": 1.3045, "grad_norm": 0.8629977107048035, "learning_rate": 0.0002, "epoch": 4.76595744680851, "step": 2800}, {"loss": 1.3164, "grad_norm": 0.8056505918502808, "learning_rate": 0.0002, "epoch": 4.782978723404256, "step": 2810}, {"loss": 1.3056, "grad_norm": 0.7705401182174683, "learning_rate": 0.0002, "epoch": 4.8, "step": 2820}, {"loss": 1.3771, "grad_norm": 1.0173288583755493, "learning_rate": 0.0002, "epoch": 4.817021276595745, "step": 2830}, {"loss": 1.3494, "grad_norm": 0.8375823497772217, "learning_rate": 0.0002, "epoch": 4.834042553191489, "step": 2840}, {"loss": 1.3238, "grad_norm": 0.857073187828064, "learning_rate": 0.0002, "epoch": 4.851063829787234, "step": 2850}, {"loss": 1.2964, "grad_norm": 0.8672189712524414, "learning_rate": 0.0002, "epoch": 4.868085106382979, "step": 2860}, {"loss": 1.3646, "grad_norm": 0.8599910140037537, "learning_rate": 0.0002, "epoch": 4.885106382978723, "step": 2870}, {"loss": 1.3575, "grad_norm": 0.8844674229621887, "learning_rate": 0.0002, "epoch": 4.902127659574468, "step": 2880}, {"loss": 1.285, "grad_norm": 0.8246751427650452, "learning_rate": 0.0002, "epoch": 4.919148936170213, "step": 2890}, {"loss": 1.4116, "grad_norm": 0.8648163676261902, "learning_rate": 0.0002, "epoch": 4.9361702127659575, "step": 2900}, {"loss": 1.2614, "grad_norm": 0.9477900266647339, "learning_rate": 0.0002, "epoch": 4.953191489361702, "step": 2910}, {"loss": 1.3519, "grad_norm": 0.8047965168952942, "learning_rate": 0.0002, "epoch": 4.970212765957447, "step": 2920}, {"loss": 1.3889, "grad_norm": 0.9872494339942932, "learning_rate": 0.0002, "epoch": 4.987234042553191, "step": 2930}, {"eval_loss": 1.9836769104003906, "eval_runtime": 106.4655, "eval_samples_per_second": 4.837, "eval_steps_per_second": 0.611, "epoch": 4.999148936170212, "step": 2937}, {"loss": 1.2574, "grad_norm": 0.7292938828468323, "learning_rate": 0.0002, "epoch": 5.004255319148936, "step": 2940}, {"loss": 1.1312, "grad_norm": 0.8610548973083496, "learning_rate": 0.0002, "epoch": 5.0212765957446805, "step": 2950}, {"loss": 1.1105, "grad_norm": 0.8384576439857483, "learning_rate": 0.0002, "epoch": 5.038297872340426, "step": 2960}, {"loss": 1.1412, "grad_norm": 0.9746620059013367, "learning_rate": 0.0002, "epoch": 5.05531914893617, "step": 2970}, {"loss": 1.1687, "grad_norm": 0.8879048228263855, "learning_rate": 0.0002, "epoch": 5.072340425531915, "step": 2980}, {"loss": 1.1333, "grad_norm": 0.9006168246269226, "learning_rate": 0.0002, "epoch": 5.089361702127659, "step": 2990}, {"loss": 1.1659, "grad_norm": 0.9770249128341675, "learning_rate": 0.0002, "epoch": 5.1063829787234045, "step": 3000}, {"loss": 1.1334, "grad_norm": 1.267967939376831, "learning_rate": 0.0002, "epoch": 5.123404255319149, "step": 3010}, {"loss": 1.2095, "grad_norm": 0.9857587218284607, "learning_rate": 0.0002, "epoch": 5.140425531914894, "step": 3020}, {"loss": 1.0889, "grad_norm": 1.2938690185546875, "learning_rate": 0.0002, "epoch": 5.157446808510638, "step": 3030}, {"loss": 1.1645, "grad_norm": 0.8928244113922119, "learning_rate": 0.0002, "epoch": 5.174468085106383, "step": 3040}, {"loss": 1.1553, "grad_norm": 1.1087630987167358, "learning_rate": 0.0002, "epoch": 5.191489361702128, "step": 3050}, {"loss": 1.1416, "grad_norm": 0.9431360960006714, "learning_rate": 0.0002, "epoch": 5.208510638297873, "step": 3060}, {"loss": 1.1635, "grad_norm": 1.2048338651657104, "learning_rate": 0.0002, "epoch": 5.225531914893617, "step": 3070}, {"loss": 1.171, "grad_norm": 1.0017054080963135, "learning_rate": 0.0002, "epoch": 5.242553191489361, "step": 3080}, {"loss": 1.2212, "grad_norm": 1.2771434783935547, "learning_rate": 0.0002, "epoch": 5.259574468085106, "step": 3090}, {"loss": 1.1478, "grad_norm": 1.4307383298873901, "learning_rate": 0.0002, "epoch": 5.276595744680851, "step": 3100}, {"loss": 1.2132, "grad_norm": 1.2460752725601196, "learning_rate": 0.0002, "epoch": 5.293617021276596, "step": 3110}, {"loss": 1.235, "grad_norm": 1.693974494934082, "learning_rate": 0.0002, "epoch": 5.31063829787234, "step": 3120}, {"loss": 1.1961, "grad_norm": 0.9855408668518066, "learning_rate": 0.0002, "epoch": 5.327659574468085, "step": 3130}, {"loss": 1.2068, "grad_norm": 1.307521104812622, "learning_rate": 0.0002, "epoch": 5.3446808510638295, "step": 3140}, {"loss": 1.2144, "grad_norm": 0.957661509513855, "learning_rate": 0.0002, "epoch": 5.361702127659575, "step": 3150}, {"loss": 1.1305, "grad_norm": 0.870373010635376, "learning_rate": 0.0002, "epoch": 5.378723404255319, "step": 3160}, {"loss": 1.2196, "grad_norm": 0.9324309229850769, "learning_rate": 0.0002, "epoch": 5.395744680851064, "step": 3170}, {"loss": 1.1691, "grad_norm": 1.0142403841018677, "learning_rate": 0.0002, "epoch": 5.412765957446808, "step": 3180}, {"loss": 1.1788, "grad_norm": 0.9759578704833984, "learning_rate": 0.0002, "epoch": 5.4297872340425535, "step": 3190}, {"loss": 1.1321, "grad_norm": 0.9021993279457092, "learning_rate": 0.0002, "epoch": 5.446808510638298, "step": 3200}, {"loss": 1.2222, "grad_norm": 1.007728934288025, "learning_rate": 0.0002, "epoch": 5.463829787234043, "step": 3210}, {"loss": 1.1517, "grad_norm": 0.8969265222549438, "learning_rate": 0.0002, "epoch": 5.480851063829787, "step": 3220}, {"loss": 1.2061, "grad_norm": 0.9672483801841736, "learning_rate": 0.0002, "epoch": 5.497872340425532, "step": 3230}, {"loss": 1.1454, "grad_norm": 1.1417138576507568, "learning_rate": 0.0002, "epoch": 5.514893617021277, "step": 3240}, {"loss": 1.1871, "grad_norm": 0.9669530391693115, "learning_rate": 0.0002, "epoch": 5.531914893617021, "step": 3250}, {"loss": 1.1382, "grad_norm": 1.0161820650100708, "learning_rate": 0.0002, "epoch": 5.548936170212766, "step": 3260}, {"loss": 1.1708, "grad_norm": 0.9935774803161621, "learning_rate": 0.0002, "epoch": 5.565957446808511, "step": 3270}, {"loss": 1.1384, "grad_norm": 1.2572048902511597, "learning_rate": 0.0002, "epoch": 5.582978723404255, "step": 3280}, {"loss": 1.1711, "grad_norm": 0.9614662528038025, "learning_rate": 0.0002, "epoch": 5.6, "step": 3290}, {"loss": 1.219, "grad_norm": 0.9835584163665771, "learning_rate": 0.0002, "epoch": 5.617021276595745, "step": 3300}, {"loss": 1.2074, "grad_norm": 0.9387389421463013, "learning_rate": 0.0002, "epoch": 5.634042553191489, "step": 3310}, {"loss": 1.1148, "grad_norm": 0.9348428249359131, "learning_rate": 0.0002, "epoch": 5.651063829787234, "step": 3320}, {"loss": 1.2378, "grad_norm": 0.9636440873146057, "learning_rate": 0.0002, "epoch": 5.6680851063829785, "step": 3330}, {"loss": 1.2068, "grad_norm": 0.995894193649292, "learning_rate": 0.0002, "epoch": 5.685106382978724, "step": 3340}, {"loss": 1.1443, "grad_norm": 1.0357023477554321, "learning_rate": 0.0002, "epoch": 5.702127659574468, "step": 3350}, {"loss": 1.2209, "grad_norm": 1.0254428386688232, "learning_rate": 0.0002, "epoch": 5.719148936170213, "step": 3360}, {"loss": 1.1987, "grad_norm": 0.8993342518806458, "learning_rate": 0.0002, "epoch": 5.736170212765957, "step": 3370}, {"loss": 1.1527, "grad_norm": 0.9104585647583008, "learning_rate": 0.0002, "epoch": 5.753191489361702, "step": 3380}, {"loss": 1.2268, "grad_norm": 0.9555654525756836, "learning_rate": 0.0002, "epoch": 5.770212765957447, "step": 3390}, {"loss": 1.193, "grad_norm": 0.920124351978302, "learning_rate": 0.0002, "epoch": 5.787234042553192, "step": 3400}, {"loss": 1.2263, "grad_norm": 0.999706506729126, "learning_rate": 0.0002, "epoch": 5.804255319148936, "step": 3410}, {"loss": 1.1411, "grad_norm": 0.9292707443237305, "learning_rate": 0.0002, "epoch": 5.821276595744681, "step": 3420}, {"loss": 1.1507, "grad_norm": 1.0074706077575684, "learning_rate": 0.0002, "epoch": 5.8382978723404255, "step": 3430}, {"loss": 1.2709, "grad_norm": 1.0279479026794434, "learning_rate": 0.0002, "epoch": 5.85531914893617, "step": 3440}, {"loss": 1.1992, "grad_norm": 1.0026037693023682, "learning_rate": 0.0002, "epoch": 5.872340425531915, "step": 3450}, {"loss": 1.1416, "grad_norm": 1.0356525182724, "learning_rate": 0.0002, "epoch": 5.889361702127659, "step": 3460}, {"loss": 1.224, "grad_norm": 1.1106643676757812, "learning_rate": 0.0002, "epoch": 5.906382978723404, "step": 3470}, {"loss": 1.1955, "grad_norm": 0.9578408002853394, "learning_rate": 0.0002, "epoch": 5.923404255319149, "step": 3480}, {"loss": 1.2133, "grad_norm": 1.0225932598114014, "learning_rate": 0.0002, "epoch": 5.940425531914894, "step": 3490}, {"loss": 1.157, "grad_norm": 0.9677667021751404, "learning_rate": 0.0002, "epoch": 5.957446808510638, "step": 3500}, {"loss": 1.2196, "grad_norm": 1.0967241525650024, "learning_rate": 0.0002, "epoch": 5.974468085106383, "step": 3510}, {"loss": 1.1807, "grad_norm": 1.2497339248657227, "learning_rate": 0.0002, "epoch": 5.991489361702127, "step": 3520}, {"eval_loss": 2.0976572036743164, "eval_runtime": 105.9679, "eval_samples_per_second": 4.86, "eval_steps_per_second": 0.613, "epoch": 6.0, "step": 3525}, {"loss": 1.0827, "grad_norm": 0.9660930037498474, "learning_rate": 0.0002, "epoch": 6.008510638297873, "step": 3530}, {"loss": 1.0043, "grad_norm": 0.9462300539016724, "learning_rate": 0.0002, "epoch": 6.025531914893617, "step": 3540}, {"loss": 1.0102, "grad_norm": 0.9312542676925659, "learning_rate": 0.0002, "epoch": 6.042553191489362, "step": 3550}, {"loss": 1.0356, "grad_norm": 1.3502222299575806, "learning_rate": 0.0002, "epoch": 6.059574468085106, "step": 3560}, {"loss": 0.9167, "grad_norm": 1.2838709354400635, "learning_rate": 0.0002, "epoch": 6.076595744680851, "step": 3570}, {"loss": 0.9381, "grad_norm": 1.1399385929107666, "learning_rate": 0.0002, "epoch": 6.093617021276596, "step": 3580}, {"loss": 0.9416, "grad_norm": 1.1763123273849487, "learning_rate": 0.0002, "epoch": 6.110638297872341, "step": 3590}, {"loss": 0.9782, "grad_norm": 1.113002061843872, "learning_rate": 0.0002, "epoch": 6.127659574468085, "step": 3600}, {"loss": 0.9521, "grad_norm": 1.0322953462600708, "learning_rate": 0.0002, "epoch": 6.14468085106383, "step": 3610}, {"loss": 0.9114, "grad_norm": 1.2678894996643066, "learning_rate": 0.0002, "epoch": 6.1617021276595745, "step": 3620}, {"loss": 0.9934, "grad_norm": 1.2370864152908325, "learning_rate": 0.0002, "epoch": 6.178723404255319, "step": 3630}, {"loss": 0.9753, "grad_norm": 1.1930763721466064, "learning_rate": 0.0002, "epoch": 6.195744680851064, "step": 3640}, {"loss": 0.9448, "grad_norm": 1.3608582019805908, "learning_rate": 0.0002, "epoch": 6.212765957446808, "step": 3650}, {"loss": 1.0201, "grad_norm": 1.2158547639846802, "learning_rate": 0.0002, "epoch": 6.229787234042553, "step": 3660}, {"loss": 0.9896, "grad_norm": 1.1505420207977295, "learning_rate": 0.0002, "epoch": 6.246808510638298, "step": 3670}, {"loss": 1.0088, "grad_norm": 1.3038114309310913, "learning_rate": 0.0002, "epoch": 6.263829787234043, "step": 3680}, {"loss": 1.0416, "grad_norm": 1.3900057077407837, "learning_rate": 0.0002, "epoch": 6.280851063829787, "step": 3690}, {"loss": 0.9832, "grad_norm": 1.196964144706726, "learning_rate": 0.0002, "epoch": 6.297872340425532, "step": 3700}, {"loss": 1.0778, "grad_norm": 1.205865740776062, "learning_rate": 0.0002, "epoch": 6.314893617021276, "step": 3710}, {"loss": 1.0358, "grad_norm": 1.2710838317871094, "learning_rate": 0.0002, "epoch": 6.3319148936170215, "step": 3720}, {"loss": 1.0271, "grad_norm": 1.285942554473877, "learning_rate": 0.0002, "epoch": 6.348936170212766, "step": 3730}, {"loss": 1.0164, "grad_norm": 1.1717636585235596, "learning_rate": 0.0002, "epoch": 6.365957446808511, "step": 3740}, {"loss": 1.0557, "grad_norm": 1.190883994102478, "learning_rate": 0.0002, "epoch": 6.382978723404255, "step": 3750}, {"loss": 1.0319, "grad_norm": 1.1623435020446777, "learning_rate": 0.0002, "epoch": 6.4, "step": 3760}, {"loss": 1.0633, "grad_norm": 1.2285547256469727, "learning_rate": 0.0002, "epoch": 6.417021276595745, "step": 3770}, {"loss": 1.0593, "grad_norm": 1.1142666339874268, "learning_rate": 0.0002, "epoch": 6.43404255319149, "step": 3780}, {"loss": 1.0418, "grad_norm": 1.333337664604187, "learning_rate": 0.0002, "epoch": 6.451063829787234, "step": 3790}, {"loss": 1.0, "grad_norm": 1.350474238395691, "learning_rate": 0.0002, "epoch": 6.468085106382979, "step": 3800}, {"loss": 1.1152, "grad_norm": 1.2439061403274536, "learning_rate": 0.0002, "epoch": 6.485106382978723, "step": 3810}, {"loss": 1.0915, "grad_norm": 1.2488664388656616, "learning_rate": 0.0002, "epoch": 6.502127659574468, "step": 3820}, {"loss": 1.0571, "grad_norm": 1.1990735530853271, "learning_rate": 0.0002, "epoch": 6.519148936170213, "step": 3830}, {"loss": 0.9895, "grad_norm": 1.5180301666259766, "learning_rate": 0.0002, "epoch": 6.536170212765957, "step": 3840}, {"loss": 0.9955, "grad_norm": 1.1273280382156372, "learning_rate": 0.0002, "epoch": 6.553191489361702, "step": 3850}, {"loss": 1.0516, "grad_norm": 1.2778105735778809, "learning_rate": 0.0002, "epoch": 6.5702127659574465, "step": 3860}, {"loss": 1.0039, "grad_norm": 1.1789685487747192, "learning_rate": 0.0002, "epoch": 6.587234042553192, "step": 3870}, {"loss": 1.0381, "grad_norm": 1.2061398029327393, "learning_rate": 0.0002, "epoch": 6.604255319148936, "step": 3880}, {"loss": 1.0775, "grad_norm": 1.104092001914978, "learning_rate": 0.0002, "epoch": 6.621276595744681, "step": 3890}, {"loss": 1.0591, "grad_norm": 1.2648544311523438, "learning_rate": 0.0002, "epoch": 6.638297872340425, "step": 3900}, {"loss": 1.0535, "grad_norm": 1.2267687320709229, "learning_rate": 0.0002, "epoch": 6.6553191489361705, "step": 3910}, {"loss": 1.0654, "grad_norm": 1.3252530097961426, "learning_rate": 0.0002, "epoch": 6.672340425531915, "step": 3920}, {"loss": 1.0301, "grad_norm": 1.284563660621643, "learning_rate": 0.0002, "epoch": 6.68936170212766, "step": 3930}, {"loss": 1.102, "grad_norm": 1.293845534324646, "learning_rate": 0.0002, "epoch": 6.706382978723404, "step": 3940}, {"loss": 1.1526, "grad_norm": 1.2290467023849487, "learning_rate": 0.0002, "epoch": 6.723404255319149, "step": 3950}, {"loss": 1.0474, "grad_norm": 1.1712737083435059, "learning_rate": 0.0002, "epoch": 6.740425531914894, "step": 3960}, {"loss": 1.0149, "grad_norm": 1.1728616952896118, "learning_rate": 0.0002, "epoch": 6.757446808510638, "step": 3970}, {"loss": 1.0824, "grad_norm": 1.154922604560852, "learning_rate": 0.0002, "epoch": 6.774468085106383, "step": 3980}, {"loss": 1.0961, "grad_norm": 1.4673690795898438, "learning_rate": 0.0002, "epoch": 6.791489361702128, "step": 3990}, {"loss": 0.9784, "grad_norm": 1.2338067293167114, "learning_rate": 0.0002, "epoch": 6.808510638297872, "step": 4000}, {"loss": 1.0975, "grad_norm": 1.0775316953659058, "learning_rate": 0.0002, "epoch": 6.825531914893617, "step": 4010}, {"loss": 1.0204, "grad_norm": 1.2518454790115356, "learning_rate": 0.0002, "epoch": 6.842553191489362, "step": 4020}, {"loss": 1.1425, "grad_norm": 1.3534432649612427, "learning_rate": 0.0002, "epoch": 6.859574468085106, "step": 4030}, {"loss": 1.1212, "grad_norm": 1.1217902898788452, "learning_rate": 0.0002, "epoch": 6.876595744680851, "step": 4040}, {"loss": 1.0823, "grad_norm": 1.2672910690307617, "learning_rate": 0.0002, "epoch": 6.8936170212765955, "step": 4050}, {"loss": 1.0817, "grad_norm": 1.3807674646377563, "learning_rate": 0.0002, "epoch": 6.910638297872341, "step": 4060}, {"loss": 1.0576, "grad_norm": 1.064530849456787, "learning_rate": 0.0002, "epoch": 6.927659574468085, "step": 4070}, {"loss": 1.0718, "grad_norm": 1.1286897659301758, "learning_rate": 0.0002, "epoch": 6.94468085106383, "step": 4080}, {"loss": 1.0574, "grad_norm": 1.3736463785171509, "learning_rate": 0.0002, "epoch": 6.961702127659574, "step": 4090}, {"loss": 1.0621, "grad_norm": 1.3167431354522705, "learning_rate": 0.0002, "epoch": 6.9787234042553195, "step": 4100}, {"loss": 0.9754, "grad_norm": 1.2784067392349243, "learning_rate": 0.0002, "epoch": 6.995744680851064, "step": 4110}, {"eval_loss": 2.260930299758911, "eval_runtime": 106.0392, "eval_samples_per_second": 4.857, "eval_steps_per_second": 0.613, "epoch": 6.999148936170212, "step": 4112}, {"loss": 0.8211, "grad_norm": 1.1155035495758057, "learning_rate": 0.0002, "epoch": 7.012765957446809, "step": 4120}, {"loss": 0.8606, "grad_norm": 1.4007865190505981, "learning_rate": 0.0002, "epoch": 7.029787234042553, "step": 4130}, {"loss": 0.8303, "grad_norm": 1.4097480773925781, "learning_rate": 0.0002, "epoch": 7.046808510638298, "step": 4140}, {"loss": 0.8095, "grad_norm": 1.5067437887191772, "learning_rate": 0.0002, "epoch": 7.0638297872340425, "step": 4150}, {"loss": 0.8314, "grad_norm": 1.8971672058105469, "learning_rate": 0.0002, "epoch": 7.080851063829787, "step": 4160}, {"loss": 0.7893, "grad_norm": 1.257439136505127, "learning_rate": 0.0002, "epoch": 7.097872340425532, "step": 4170}, {"loss": 0.8113, "grad_norm": 1.3088364601135254, "learning_rate": 0.0002, "epoch": 7.114893617021276, "step": 4180}, {"loss": 0.8555, "grad_norm": 1.224184274673462, "learning_rate": 0.0002, "epoch": 7.131914893617021, "step": 4190}, {"loss": 0.8493, "grad_norm": 1.5408329963684082, "learning_rate": 0.0002, "epoch": 7.148936170212766, "step": 4200}, {"loss": 0.8345, "grad_norm": 1.6859279870986938, "learning_rate": 0.0002, "epoch": 7.165957446808511, "step": 4210}, {"loss": 0.8519, "grad_norm": 1.4212250709533691, "learning_rate": 0.0002, "epoch": 7.182978723404255, "step": 4220}, {"loss": 0.9346, "grad_norm": 1.5859991312026978, "learning_rate": 0.0002, "epoch": 7.2, "step": 4230}, {"loss": 0.8674, "grad_norm": 1.4653054475784302, "learning_rate": 0.0002, "epoch": 7.217021276595744, "step": 4240}, {"loss": 0.913, "grad_norm": 1.567806363105774, "learning_rate": 0.0002, "epoch": 7.23404255319149, "step": 4250}, {"loss": 0.9355, "grad_norm": 1.470809817314148, "learning_rate": 0.0002, "epoch": 7.251063829787234, "step": 4260}, {"loss": 0.8575, "grad_norm": 1.326292634010315, "learning_rate": 0.0002, "epoch": 7.268085106382979, "step": 4270}, {"loss": 0.8593, "grad_norm": 1.4706473350524902, "learning_rate": 0.0002, "epoch": 7.285106382978723, "step": 4280}, {"loss": 0.8788, "grad_norm": 1.9928194284439087, "learning_rate": 0.0002, "epoch": 7.302127659574468, "step": 4290}, {"loss": 0.8759, "grad_norm": 1.2895413637161255, "learning_rate": 0.0002, "epoch": 7.319148936170213, "step": 4300}, {"loss": 0.8887, "grad_norm": 1.5898326635360718, "learning_rate": 0.0002, "epoch": 7.336170212765958, "step": 4310}, {"loss": 0.8632, "grad_norm": 1.4953527450561523, "learning_rate": 0.0002, "epoch": 7.353191489361702, "step": 4320}, {"loss": 0.8744, "grad_norm": 1.465372085571289, "learning_rate": 0.0002, "epoch": 7.370212765957447, "step": 4330}, {"loss": 0.8155, "grad_norm": 1.5092062950134277, "learning_rate": 0.0002, "epoch": 7.3872340425531915, "step": 4340}, {"loss": 0.9551, "grad_norm": 1.3567780256271362, "learning_rate": 0.0002, "epoch": 7.404255319148936, "step": 4350}, {"loss": 0.8667, "grad_norm": 1.5023396015167236, "learning_rate": 0.0002, "epoch": 7.421276595744681, "step": 4360}, {"loss": 0.8515, "grad_norm": 1.6369168758392334, "learning_rate": 0.0002, "epoch": 7.438297872340425, "step": 4370}, {"loss": 0.9313, "grad_norm": 1.4093835353851318, "learning_rate": 0.0002, "epoch": 7.45531914893617, "step": 4380}, {"loss": 0.861, "grad_norm": 1.2725355625152588, "learning_rate": 0.0002, "epoch": 7.472340425531915, "step": 4390}, {"loss": 0.9065, "grad_norm": 1.455870509147644, "learning_rate": 0.0002, "epoch": 7.48936170212766, "step": 4400}, {"loss": 0.8397, "grad_norm": 1.2592545747756958, "learning_rate": 0.0002, "epoch": 7.506382978723404, "step": 4410}, {"loss": 0.9038, "grad_norm": 1.614005208015442, "learning_rate": 0.0002, "epoch": 7.523404255319149, "step": 4420}, {"loss": 0.9177, "grad_norm": 1.4367144107818604, "learning_rate": 0.0002, "epoch": 7.540425531914893, "step": 4430}, {"loss": 0.8685, "grad_norm": 1.3691469430923462, "learning_rate": 0.0002, "epoch": 7.5574468085106385, "step": 4440}, {"loss": 0.8757, "grad_norm": 1.6138449907302856, "learning_rate": 0.0002, "epoch": 7.574468085106383, "step": 4450}, {"loss": 0.8597, "grad_norm": 1.3140075206756592, "learning_rate": 0.0002, "epoch": 7.591489361702128, "step": 4460}, {"loss": 0.9237, "grad_norm": 1.482589602470398, "learning_rate": 0.0002, "epoch": 7.608510638297872, "step": 4470}, {"loss": 0.9249, "grad_norm": 1.404107928276062, "learning_rate": 0.0002, "epoch": 7.625531914893617, "step": 4480}, {"loss": 0.9213, "grad_norm": 1.6977661848068237, "learning_rate": 0.0002, "epoch": 7.642553191489362, "step": 4490}, {"loss": 0.8681, "grad_norm": 1.4678088426589966, "learning_rate": 0.0002, "epoch": 7.659574468085106, "step": 4500}, {"loss": 0.9467, "grad_norm": 1.7297770977020264, "learning_rate": 0.0002, "epoch": 7.676595744680851, "step": 4510}, {"loss": 0.93, "grad_norm": 1.5900875329971313, "learning_rate": 0.0002, "epoch": 7.693617021276596, "step": 4520}, {"loss": 0.9499, "grad_norm": 1.620308756828308, "learning_rate": 0.0002, "epoch": 7.7106382978723405, "step": 4530}, {"loss": 1.002, "grad_norm": 1.4710882902145386, "learning_rate": 0.0002, "epoch": 7.727659574468085, "step": 4540}, {"loss": 0.9126, "grad_norm": 1.51741361618042, "learning_rate": 0.0002, "epoch": 7.74468085106383, "step": 4550}, {"loss": 0.9209, "grad_norm": 1.5683188438415527, "learning_rate": 0.0002, "epoch": 7.761702127659574, "step": 4560}, {"loss": 0.9852, "grad_norm": 1.387294888496399, "learning_rate": 0.0002, "epoch": 7.778723404255319, "step": 4570}, {"loss": 0.9205, "grad_norm": 1.3634133338928223, "learning_rate": 0.0002, "epoch": 7.7957446808510635, "step": 4580}, {"loss": 0.9959, "grad_norm": 1.469403624534607, "learning_rate": 0.0002, "epoch": 7.812765957446809, "step": 4590}, {"loss": 0.8934, "grad_norm": 1.5683388710021973, "learning_rate": 0.0002, "epoch": 7.829787234042553, "step": 4600}, {"loss": 0.8806, "grad_norm": 1.3234552145004272, "learning_rate": 0.0002, "epoch": 7.846808510638298, "step": 4610}, {"loss": 0.9353, "grad_norm": 1.2532844543457031, "learning_rate": 0.0002, "epoch": 7.863829787234042, "step": 4620}, {"loss": 0.8865, "grad_norm": 1.3591208457946777, "learning_rate": 0.0002, "epoch": 7.8808510638297875, "step": 4630}, {"loss": 0.9419, "grad_norm": 1.366128921508789, "learning_rate": 0.0002, "epoch": 7.897872340425532, "step": 4640}, {"loss": 0.9076, "grad_norm": 1.3230071067810059, "learning_rate": 0.0002, "epoch": 7.914893617021277, "step": 4650}, {"loss": 0.9076, "grad_norm": 1.3713736534118652, "learning_rate": 0.0002, "epoch": 7.931914893617021, "step": 4660}, {"loss": 0.9455, "grad_norm": 1.4915863275527954, "learning_rate": 0.0002, "epoch": 7.948936170212766, "step": 4670}, {"loss": 0.8768, "grad_norm": 1.1782197952270508, "learning_rate": 0.0002, "epoch": 7.965957446808511, "step": 4680}, {"loss": 0.93, "grad_norm": 1.3456854820251465, "learning_rate": 0.0002, "epoch": 7.982978723404255, "step": 4690}]}