diff --git a/.gitattributes b/.gitattributes index d2c6ce4d687a1434d18ca816ac2039389fc5adeb..974238265ba065d9d05cc4e1c59de286d807d07f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -5883,3 +5883,12 @@ gemma-2b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0 gemma-2b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2-9b-it_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-102380-sd-42/checkpoint-12181/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/README.md b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..50b97dc2b19a417f587ca72cb75ef4f5211bb432 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8080228e091f8054a4eeac78af9455c42c732e862c553466df0eb9e5892a233c +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/README.md b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..50b97dc2b19a417f587ca72cb75ef4f5211bb432 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8080228e091f8054a4eeac78af9455c42c732e862c553466df0eb9e5892a233c +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d4271579e8ee17d9ba9ecd3cb4228921a5056216 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba20c22469618b5f3eff8283728ac9b9cb3c6fcaaa73246c3c5bec24f5e6be77 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..0e4598c75b0731d75fcd7614cc20c6338acebb45 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5188d8bf5a2a21f558561b800b2985bdc037935631b8c33c74c53b697abb5804 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..39daab3c48f4b4f592e2ca13dd7728ac9acd3699 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77c92af9a68ff3c98e18da78676846d498b325ebd119fdf897d319d101ac4e39 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..933d26ca542bb959d1250a7f699cd088fc8c6d99 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/trainer_state.json @@ -0,0 +1,1106 @@ +{ + "best_metric": 1.1874967813491821, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 1516, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013192612137203167, + "grad_norm": 0.7545632123947144, + "learning_rate": 0.0002, + "loss": 1.9466, + "step": 10 + }, + { + "epoch": 0.026385224274406333, + "grad_norm": 0.5787661075592041, + "learning_rate": 0.0002, + "loss": 1.4909, + "step": 20 + }, + { + "epoch": 0.0395778364116095, + "grad_norm": 0.8616093993186951, + "learning_rate": 0.0002, + "loss": 1.3906, + "step": 30 + }, + { + "epoch": 0.052770448548812667, + "grad_norm": 0.42088547348976135, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 40 + }, + { + "epoch": 0.06596306068601583, + "grad_norm": 0.47704678773880005, + "learning_rate": 0.0002, + "loss": 1.3446, + "step": 50 + }, + { + "epoch": 0.079155672823219, + "grad_norm": 0.5763994455337524, + "learning_rate": 0.0002, + "loss": 1.2476, + "step": 60 + }, + { + "epoch": 0.09234828496042216, + "grad_norm": 0.4579846262931824, + "learning_rate": 0.0002, + "loss": 1.2268, + "step": 70 + }, + { + "epoch": 0.10554089709762533, + "grad_norm": 0.46623846888542175, + "learning_rate": 0.0002, + "loss": 1.415, + "step": 80 + }, + { + "epoch": 0.11873350923482849, + "grad_norm": 0.4206956624984741, + "learning_rate": 0.0002, + "loss": 1.2849, + "step": 90 + }, + { + "epoch": 0.13192612137203166, + "grad_norm": 0.41896629333496094, + "learning_rate": 0.0002, + "loss": 1.1608, + "step": 100 + }, + { + "epoch": 0.14511873350923482, + "grad_norm": 0.3459089398384094, + "learning_rate": 0.0002, + "loss": 1.2617, + "step": 110 + }, + { + "epoch": 0.158311345646438, + "grad_norm": 0.4587327837944031, + "learning_rate": 0.0002, + "loss": 1.2858, + "step": 120 + }, + { + "epoch": 0.17150395778364116, + "grad_norm": 0.433525413274765, + "learning_rate": 0.0002, + "loss": 1.2241, + "step": 130 + }, + { + "epoch": 0.18469656992084432, + "grad_norm": 0.39253175258636475, + "learning_rate": 0.0002, + "loss": 1.2269, + "step": 140 + }, + { + "epoch": 0.19788918205804748, + "grad_norm": 0.3602290749549866, + "learning_rate": 0.0002, + "loss": 1.2224, + "step": 150 + }, + { + "epoch": 0.21108179419525067, + "grad_norm": 0.41160839796066284, + "learning_rate": 0.0002, + "loss": 1.2834, + "step": 160 + }, + { + "epoch": 0.22427440633245382, + "grad_norm": 0.7213630080223083, + "learning_rate": 0.0002, + "loss": 1.1986, + "step": 170 + }, + { + "epoch": 0.23746701846965698, + "grad_norm": 0.39086055755615234, + "learning_rate": 0.0002, + "loss": 1.2215, + "step": 180 + }, + { + "epoch": 0.25065963060686014, + "grad_norm": 0.4465520977973938, + "learning_rate": 0.0002, + "loss": 1.3315, + "step": 190 + }, + { + "epoch": 0.2638522427440633, + "grad_norm": 1.814679741859436, + "learning_rate": 0.0002, + "loss": 1.3798, + "step": 200 + }, + { + "epoch": 0.2770448548812665, + "grad_norm": 0.5026423931121826, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 210 + }, + { + "epoch": 0.29023746701846964, + "grad_norm": 0.4156292974948883, + "learning_rate": 0.0002, + "loss": 1.3306, + "step": 220 + }, + { + "epoch": 0.3034300791556728, + "grad_norm": 0.40813493728637695, + "learning_rate": 0.0002, + "loss": 1.266, + "step": 230 + }, + { + "epoch": 0.316622691292876, + "grad_norm": 0.3304787874221802, + "learning_rate": 0.0002, + "loss": 1.1533, + "step": 240 + }, + { + "epoch": 0.32981530343007914, + "grad_norm": 0.46139976382255554, + "learning_rate": 0.0002, + "loss": 1.3154, + "step": 250 + }, + { + "epoch": 0.34300791556728233, + "grad_norm": 0.37518271803855896, + "learning_rate": 0.0002, + "loss": 1.1365, + "step": 260 + }, + { + "epoch": 0.3562005277044855, + "grad_norm": 0.35586467385292053, + "learning_rate": 0.0002, + "loss": 1.2004, + "step": 270 + }, + { + "epoch": 0.36939313984168864, + "grad_norm": 0.32441186904907227, + "learning_rate": 0.0002, + "loss": 1.2609, + "step": 280 + }, + { + "epoch": 0.38258575197889183, + "grad_norm": 0.3198683261871338, + "learning_rate": 0.0002, + "loss": 1.212, + "step": 290 + }, + { + "epoch": 0.39577836411609496, + "grad_norm": 0.33663108944892883, + "learning_rate": 0.0002, + "loss": 1.2465, + "step": 300 + }, + { + "epoch": 0.40897097625329815, + "grad_norm": 0.3711244761943817, + "learning_rate": 0.0002, + "loss": 1.1702, + "step": 310 + }, + { + "epoch": 0.42216358839050133, + "grad_norm": 0.3209651708602905, + "learning_rate": 0.0002, + "loss": 1.0871, + "step": 320 + }, + { + "epoch": 0.43535620052770446, + "grad_norm": 0.5152716040611267, + "learning_rate": 0.0002, + "loss": 1.2728, + "step": 330 + }, + { + "epoch": 0.44854881266490765, + "grad_norm": 0.5431376695632935, + "learning_rate": 0.0002, + "loss": 1.1833, + "step": 340 + }, + { + "epoch": 0.46174142480211083, + "grad_norm": 0.3069997727870941, + "learning_rate": 0.0002, + "loss": 1.1344, + "step": 350 + }, + { + "epoch": 0.47493403693931396, + "grad_norm": 0.34260064363479614, + "learning_rate": 0.0002, + "loss": 1.2742, + "step": 360 + }, + { + "epoch": 0.48812664907651715, + "grad_norm": 0.345653235912323, + "learning_rate": 0.0002, + "loss": 1.3161, + "step": 370 + }, + { + "epoch": 0.5013192612137203, + "grad_norm": 0.46222734451293945, + "learning_rate": 0.0002, + "loss": 1.1552, + "step": 380 + }, + { + "epoch": 0.5145118733509235, + "grad_norm": 0.27301734685897827, + "learning_rate": 0.0002, + "loss": 1.0616, + "step": 390 + }, + { + "epoch": 0.5277044854881267, + "grad_norm": 0.29048439860343933, + "learning_rate": 0.0002, + "loss": 1.1944, + "step": 400 + }, + { + "epoch": 0.5408970976253298, + "grad_norm": 0.32927802205085754, + "learning_rate": 0.0002, + "loss": 1.0956, + "step": 410 + }, + { + "epoch": 0.554089709762533, + "grad_norm": 0.3336397409439087, + "learning_rate": 0.0002, + "loss": 1.2362, + "step": 420 + }, + { + "epoch": 0.5672823218997362, + "grad_norm": 0.4007597267627716, + "learning_rate": 0.0002, + "loss": 1.1445, + "step": 430 + }, + { + "epoch": 0.5804749340369393, + "grad_norm": 0.36144956946372986, + "learning_rate": 0.0002, + "loss": 1.2731, + "step": 440 + }, + { + "epoch": 0.5936675461741425, + "grad_norm": 0.6331009864807129, + "learning_rate": 0.0002, + "loss": 1.1604, + "step": 450 + }, + { + "epoch": 0.6068601583113457, + "grad_norm": 0.41469088196754456, + "learning_rate": 0.0002, + "loss": 1.1692, + "step": 460 + }, + { + "epoch": 0.6200527704485488, + "grad_norm": 0.4388185143470764, + "learning_rate": 0.0002, + "loss": 1.2372, + "step": 470 + }, + { + "epoch": 0.633245382585752, + "grad_norm": 0.3738141655921936, + "learning_rate": 0.0002, + "loss": 1.2541, + "step": 480 + }, + { + "epoch": 0.6464379947229552, + "grad_norm": 0.7212023138999939, + "learning_rate": 0.0002, + "loss": 1.1265, + "step": 490 + }, + { + "epoch": 0.6596306068601583, + "grad_norm": 0.2972351014614105, + "learning_rate": 0.0002, + "loss": 1.2786, + "step": 500 + }, + { + "epoch": 0.6728232189973615, + "grad_norm": 0.45293179154396057, + "learning_rate": 0.0002, + "loss": 1.1739, + "step": 510 + }, + { + "epoch": 0.6860158311345647, + "grad_norm": 0.4319860637187958, + "learning_rate": 0.0002, + "loss": 1.255, + "step": 520 + }, + { + "epoch": 0.6992084432717678, + "grad_norm": 0.3050215542316437, + "learning_rate": 0.0002, + "loss": 1.0959, + "step": 530 + }, + { + "epoch": 0.712401055408971, + "grad_norm": 0.3552611172199249, + "learning_rate": 0.0002, + "loss": 1.1608, + "step": 540 + }, + { + "epoch": 0.7255936675461742, + "grad_norm": 0.3631151020526886, + "learning_rate": 0.0002, + "loss": 1.2546, + "step": 550 + }, + { + "epoch": 0.7387862796833773, + "grad_norm": 0.28177931904792786, + "learning_rate": 0.0002, + "loss": 1.1371, + "step": 560 + }, + { + "epoch": 0.7519788918205804, + "grad_norm": 0.359764039516449, + "learning_rate": 0.0002, + "loss": 1.2081, + "step": 570 + }, + { + "epoch": 0.7651715039577837, + "grad_norm": 0.3970327377319336, + "learning_rate": 0.0002, + "loss": 1.1356, + "step": 580 + }, + { + "epoch": 0.7783641160949868, + "grad_norm": 0.3541001081466675, + "learning_rate": 0.0002, + "loss": 1.309, + "step": 590 + }, + { + "epoch": 0.7915567282321899, + "grad_norm": 0.3478573262691498, + "learning_rate": 0.0002, + "loss": 1.2061, + "step": 600 + }, + { + "epoch": 0.8047493403693932, + "grad_norm": 0.3900321424007416, + "learning_rate": 0.0002, + "loss": 1.1864, + "step": 610 + }, + { + "epoch": 0.8179419525065963, + "grad_norm": 0.3443238437175751, + "learning_rate": 0.0002, + "loss": 1.1358, + "step": 620 + }, + { + "epoch": 0.8311345646437994, + "grad_norm": 0.44238781929016113, + "learning_rate": 0.0002, + "loss": 1.3232, + "step": 630 + }, + { + "epoch": 0.8443271767810027, + "grad_norm": 0.36339467763900757, + "learning_rate": 0.0002, + "loss": 1.1247, + "step": 640 + }, + { + "epoch": 0.8575197889182058, + "grad_norm": 0.6243070363998413, + "learning_rate": 0.0002, + "loss": 1.1049, + "step": 650 + }, + { + "epoch": 0.8707124010554089, + "grad_norm": 0.3209173381328583, + "learning_rate": 0.0002, + "loss": 1.1943, + "step": 660 + }, + { + "epoch": 0.8839050131926122, + "grad_norm": 0.35017991065979004, + "learning_rate": 0.0002, + "loss": 1.1927, + "step": 670 + }, + { + "epoch": 0.8970976253298153, + "grad_norm": 0.3247159421443939, + "learning_rate": 0.0002, + "loss": 1.1603, + "step": 680 + }, + { + "epoch": 0.9102902374670184, + "grad_norm": 0.4091894030570984, + "learning_rate": 0.0002, + "loss": 1.2298, + "step": 690 + }, + { + "epoch": 0.9234828496042217, + "grad_norm": 0.3975585997104645, + "learning_rate": 0.0002, + "loss": 1.2756, + "step": 700 + }, + { + "epoch": 0.9366754617414248, + "grad_norm": 0.3666245937347412, + "learning_rate": 0.0002, + "loss": 1.281, + "step": 710 + }, + { + "epoch": 0.9498680738786279, + "grad_norm": 0.45216917991638184, + "learning_rate": 0.0002, + "loss": 1.2855, + "step": 720 + }, + { + "epoch": 0.9630606860158312, + "grad_norm": 0.36108118295669556, + "learning_rate": 0.0002, + "loss": 1.2217, + "step": 730 + }, + { + "epoch": 0.9762532981530343, + "grad_norm": 0.44550251960754395, + "learning_rate": 0.0002, + "loss": 1.17, + "step": 740 + }, + { + "epoch": 0.9894459102902374, + "grad_norm": 0.29801255464553833, + "learning_rate": 0.0002, + "loss": 1.0672, + "step": 750 + }, + { + "epoch": 1.0, + "eval_loss": 1.203244686126709, + "eval_runtime": 76.0457, + "eval_samples_per_second": 5.668, + "eval_steps_per_second": 0.71, + "step": 758 + }, + { + "epoch": 1.0026385224274406, + "grad_norm": 0.4096551239490509, + "learning_rate": 0.0002, + "loss": 1.0748, + "step": 760 + }, + { + "epoch": 1.0158311345646438, + "grad_norm": 0.2649582326412201, + "learning_rate": 0.0002, + "loss": 1.1537, + "step": 770 + }, + { + "epoch": 1.029023746701847, + "grad_norm": 0.3100722134113312, + "learning_rate": 0.0002, + "loss": 1.1676, + "step": 780 + }, + { + "epoch": 1.04221635883905, + "grad_norm": 0.3911755383014679, + "learning_rate": 0.0002, + "loss": 1.1736, + "step": 790 + }, + { + "epoch": 1.0554089709762533, + "grad_norm": 0.4600953757762909, + "learning_rate": 0.0002, + "loss": 1.0354, + "step": 800 + }, + { + "epoch": 1.0686015831134565, + "grad_norm": 0.28671619296073914, + "learning_rate": 0.0002, + "loss": 1.2869, + "step": 810 + }, + { + "epoch": 1.0817941952506596, + "grad_norm": 0.47282642126083374, + "learning_rate": 0.0002, + "loss": 1.1441, + "step": 820 + }, + { + "epoch": 1.0949868073878628, + "grad_norm": 0.690073549747467, + "learning_rate": 0.0002, + "loss": 1.1368, + "step": 830 + }, + { + "epoch": 1.108179419525066, + "grad_norm": 0.7317902445793152, + "learning_rate": 0.0002, + "loss": 0.9944, + "step": 840 + }, + { + "epoch": 1.121372031662269, + "grad_norm": 0.44215938448905945, + "learning_rate": 0.0002, + "loss": 1.108, + "step": 850 + }, + { + "epoch": 1.1345646437994723, + "grad_norm": 0.33875149488449097, + "learning_rate": 0.0002, + "loss": 1.0558, + "step": 860 + }, + { + "epoch": 1.1477572559366755, + "grad_norm": 0.3700002431869507, + "learning_rate": 0.0002, + "loss": 1.0471, + "step": 870 + }, + { + "epoch": 1.1609498680738786, + "grad_norm": 0.41173291206359863, + "learning_rate": 0.0002, + "loss": 1.0121, + "step": 880 + }, + { + "epoch": 1.1741424802110818, + "grad_norm": 0.5253589749336243, + "learning_rate": 0.0002, + "loss": 1.0312, + "step": 890 + }, + { + "epoch": 1.187335092348285, + "grad_norm": 0.3912237286567688, + "learning_rate": 0.0002, + "loss": 1.2238, + "step": 900 + }, + { + "epoch": 1.200527704485488, + "grad_norm": 0.40990331768989563, + "learning_rate": 0.0002, + "loss": 0.9434, + "step": 910 + }, + { + "epoch": 1.2137203166226913, + "grad_norm": 0.40377968549728394, + "learning_rate": 0.0002, + "loss": 1.0493, + "step": 920 + }, + { + "epoch": 1.2269129287598945, + "grad_norm": 0.4605846405029297, + "learning_rate": 0.0002, + "loss": 1.1496, + "step": 930 + }, + { + "epoch": 1.2401055408970976, + "grad_norm": 0.31564897298812866, + "learning_rate": 0.0002, + "loss": 1.1173, + "step": 940 + }, + { + "epoch": 1.2532981530343008, + "grad_norm": 0.39808550477027893, + "learning_rate": 0.0002, + "loss": 1.0547, + "step": 950 + }, + { + "epoch": 1.266490765171504, + "grad_norm": 0.3762115240097046, + "learning_rate": 0.0002, + "loss": 1.1367, + "step": 960 + }, + { + "epoch": 1.279683377308707, + "grad_norm": 0.4174984097480774, + "learning_rate": 0.0002, + "loss": 1.1596, + "step": 970 + }, + { + "epoch": 1.2928759894459103, + "grad_norm": 0.5263054966926575, + "learning_rate": 0.0002, + "loss": 1.1327, + "step": 980 + }, + { + "epoch": 1.3060686015831133, + "grad_norm": 0.41673699021339417, + "learning_rate": 0.0002, + "loss": 1.0339, + "step": 990 + }, + { + "epoch": 1.3192612137203166, + "grad_norm": 0.9613684415817261, + "learning_rate": 0.0002, + "loss": 1.1198, + "step": 1000 + }, + { + "epoch": 1.3324538258575198, + "grad_norm": 0.3690216839313507, + "learning_rate": 0.0002, + "loss": 1.0444, + "step": 1010 + }, + { + "epoch": 1.345646437994723, + "grad_norm": 0.521821141242981, + "learning_rate": 0.0002, + "loss": 1.0473, + "step": 1020 + }, + { + "epoch": 1.358839050131926, + "grad_norm": 0.3353094160556793, + "learning_rate": 0.0002, + "loss": 1.1065, + "step": 1030 + }, + { + "epoch": 1.3720316622691293, + "grad_norm": 0.3843843936920166, + "learning_rate": 0.0002, + "loss": 1.1286, + "step": 1040 + }, + { + "epoch": 1.3852242744063323, + "grad_norm": 0.372514545917511, + "learning_rate": 0.0002, + "loss": 1.1369, + "step": 1050 + }, + { + "epoch": 1.3984168865435356, + "grad_norm": 0.34537771344184875, + "learning_rate": 0.0002, + "loss": 1.0041, + "step": 1060 + }, + { + "epoch": 1.4116094986807388, + "grad_norm": 0.45349085330963135, + "learning_rate": 0.0002, + "loss": 1.0251, + "step": 1070 + }, + { + "epoch": 1.424802110817942, + "grad_norm": 0.5120177268981934, + "learning_rate": 0.0002, + "loss": 1.1873, + "step": 1080 + }, + { + "epoch": 1.437994722955145, + "grad_norm": 0.42800238728523254, + "learning_rate": 0.0002, + "loss": 1.0872, + "step": 1090 + }, + { + "epoch": 1.4511873350923483, + "grad_norm": 0.343832790851593, + "learning_rate": 0.0002, + "loss": 1.0734, + "step": 1100 + }, + { + "epoch": 1.4643799472295513, + "grad_norm": 0.3829841911792755, + "learning_rate": 0.0002, + "loss": 1.1286, + "step": 1110 + }, + { + "epoch": 1.4775725593667546, + "grad_norm": 0.4289931058883667, + "learning_rate": 0.0002, + "loss": 1.1268, + "step": 1120 + }, + { + "epoch": 1.4907651715039578, + "grad_norm": 0.42750850319862366, + "learning_rate": 0.0002, + "loss": 1.0676, + "step": 1130 + }, + { + "epoch": 1.503957783641161, + "grad_norm": 0.34328413009643555, + "learning_rate": 0.0002, + "loss": 1.072, + "step": 1140 + }, + { + "epoch": 1.517150395778364, + "grad_norm": 0.349096417427063, + "learning_rate": 0.0002, + "loss": 1.0863, + "step": 1150 + }, + { + "epoch": 1.5303430079155673, + "grad_norm": 0.7700717449188232, + "learning_rate": 0.0002, + "loss": 1.1462, + "step": 1160 + }, + { + "epoch": 1.5435356200527703, + "grad_norm": 0.39294949173927307, + "learning_rate": 0.0002, + "loss": 0.9528, + "step": 1170 + }, + { + "epoch": 1.5567282321899736, + "grad_norm": 0.36173608899116516, + "learning_rate": 0.0002, + "loss": 1.1603, + "step": 1180 + }, + { + "epoch": 1.5699208443271768, + "grad_norm": 0.6034277677536011, + "learning_rate": 0.0002, + "loss": 1.1508, + "step": 1190 + }, + { + "epoch": 1.58311345646438, + "grad_norm": 0.36694103479385376, + "learning_rate": 0.0002, + "loss": 1.0105, + "step": 1200 + }, + { + "epoch": 1.596306068601583, + "grad_norm": 0.4727209508419037, + "learning_rate": 0.0002, + "loss": 1.1479, + "step": 1210 + }, + { + "epoch": 1.6094986807387863, + "grad_norm": 0.6482883095741272, + "learning_rate": 0.0002, + "loss": 1.0689, + "step": 1220 + }, + { + "epoch": 1.6226912928759893, + "grad_norm": 0.5238035917282104, + "learning_rate": 0.0002, + "loss": 1.1405, + "step": 1230 + }, + { + "epoch": 1.6358839050131926, + "grad_norm": 0.4812222421169281, + "learning_rate": 0.0002, + "loss": 1.0596, + "step": 1240 + }, + { + "epoch": 1.6490765171503958, + "grad_norm": 0.7131702303886414, + "learning_rate": 0.0002, + "loss": 1.2729, + "step": 1250 + }, + { + "epoch": 1.662269129287599, + "grad_norm": 0.3803327977657318, + "learning_rate": 0.0002, + "loss": 0.9832, + "step": 1260 + }, + { + "epoch": 1.675461741424802, + "grad_norm": 0.3745088577270508, + "learning_rate": 0.0002, + "loss": 1.1433, + "step": 1270 + }, + { + "epoch": 1.6886543535620053, + "grad_norm": 0.4427378475666046, + "learning_rate": 0.0002, + "loss": 1.1018, + "step": 1280 + }, + { + "epoch": 1.7018469656992083, + "grad_norm": 0.797478973865509, + "learning_rate": 0.0002, + "loss": 1.0619, + "step": 1290 + }, + { + "epoch": 1.7150395778364116, + "grad_norm": 0.503620982170105, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 1300 + }, + { + "epoch": 1.7282321899736148, + "grad_norm": 0.4132426381111145, + "learning_rate": 0.0002, + "loss": 1.1324, + "step": 1310 + }, + { + "epoch": 1.741424802110818, + "grad_norm": 0.41811656951904297, + "learning_rate": 0.0002, + "loss": 1.172, + "step": 1320 + }, + { + "epoch": 1.754617414248021, + "grad_norm": 0.40647849440574646, + "learning_rate": 0.0002, + "loss": 1.0903, + "step": 1330 + }, + { + "epoch": 1.767810026385224, + "grad_norm": 0.42138347029685974, + "learning_rate": 0.0002, + "loss": 1.1316, + "step": 1340 + }, + { + "epoch": 1.7810026385224274, + "grad_norm": 0.46523579955101013, + "learning_rate": 0.0002, + "loss": 0.9955, + "step": 1350 + }, + { + "epoch": 1.7941952506596306, + "grad_norm": 0.39760419726371765, + "learning_rate": 0.0002, + "loss": 1.0307, + "step": 1360 + }, + { + "epoch": 1.8073878627968338, + "grad_norm": 0.37993717193603516, + "learning_rate": 0.0002, + "loss": 1.1218, + "step": 1370 + }, + { + "epoch": 1.820580474934037, + "grad_norm": 0.5404181480407715, + "learning_rate": 0.0002, + "loss": 1.0921, + "step": 1380 + }, + { + "epoch": 1.83377308707124, + "grad_norm": 0.4385245740413666, + "learning_rate": 0.0002, + "loss": 1.0903, + "step": 1390 + }, + { + "epoch": 1.8469656992084431, + "grad_norm": 0.529797375202179, + "learning_rate": 0.0002, + "loss": 1.1175, + "step": 1400 + }, + { + "epoch": 1.8601583113456464, + "grad_norm": 0.481567919254303, + "learning_rate": 0.0002, + "loss": 1.0577, + "step": 1410 + }, + { + "epoch": 1.8733509234828496, + "grad_norm": 0.34787362813949585, + "learning_rate": 0.0002, + "loss": 1.0581, + "step": 1420 + }, + { + "epoch": 1.8865435356200528, + "grad_norm": 0.6402362585067749, + "learning_rate": 0.0002, + "loss": 1.0536, + "step": 1430 + }, + { + "epoch": 1.899736147757256, + "grad_norm": 0.3461322784423828, + "learning_rate": 0.0002, + "loss": 1.0787, + "step": 1440 + }, + { + "epoch": 1.912928759894459, + "grad_norm": 0.44005653262138367, + "learning_rate": 0.0002, + "loss": 1.0925, + "step": 1450 + }, + { + "epoch": 1.9261213720316621, + "grad_norm": 0.4064280688762665, + "learning_rate": 0.0002, + "loss": 1.0414, + "step": 1460 + }, + { + "epoch": 1.9393139841688654, + "grad_norm": 0.5236523151397705, + "learning_rate": 0.0002, + "loss": 1.0608, + "step": 1470 + }, + { + "epoch": 1.9525065963060686, + "grad_norm": 0.41030219197273254, + "learning_rate": 0.0002, + "loss": 1.0572, + "step": 1480 + }, + { + "epoch": 1.9656992084432718, + "grad_norm": 0.39805835485458374, + "learning_rate": 0.0002, + "loss": 1.1204, + "step": 1490 + }, + { + "epoch": 1.978891820580475, + "grad_norm": 0.42974501848220825, + "learning_rate": 0.0002, + "loss": 1.0364, + "step": 1500 + }, + { + "epoch": 1.992084432717678, + "grad_norm": 0.4688243865966797, + "learning_rate": 0.0002, + "loss": 1.1169, + "step": 1510 + }, + { + "epoch": 2.0, + "eval_loss": 1.1874967813491821, + "eval_runtime": 71.9523, + "eval_samples_per_second": 5.99, + "eval_steps_per_second": 0.75, + "step": 1516 + } + ], + "logging_steps": 10, + "max_steps": 6064, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.015708553851699e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..0c0289389bcb2236bf804b0a82c50e67c11a1379 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e04722e246164eebe6b4f70dd50fe946c936a2bd25edd75bcedcfd4c7ff9474e +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/README.md b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ff7935098848a614b70da345a49272c0e49edbc2 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b4d81d7256726d2a15984dbc1440edd1bf4df949ad24ff1587acdbb4f43c46e +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..814ddd1852bee8d06a2837287953d7748739ce58 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:554a6e8c10fe6dadea2f77aec1f22f76b2e7c08edc93a58b6d881da141caf33a +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..9dd74078dac9f17884ac7e7fbaf3b39ba7eb1a57 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37b989171d122b4cdd970d745ba042f12a5ab7aba4f8cb713a66da51f2fe1722 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..87b95deb879270214a211cd26d9c8971af205c76 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b93f15eec18327be530ccb8601bd0dfe23cbdc7849bea310aa5348d218c0915 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e48651aea8ee5fad2955453c745d3260475d2ec7 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/trainer_state.json @@ -0,0 +1,1646 @@ +{ + "best_metric": 1.1874967813491821, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516", + "epoch": 3.0, + "eval_steps": 10, + "global_step": 2274, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013192612137203167, + "grad_norm": 0.7545632123947144, + "learning_rate": 0.0002, + "loss": 1.9466, + "step": 10 + }, + { + "epoch": 0.026385224274406333, + "grad_norm": 0.5787661075592041, + "learning_rate": 0.0002, + "loss": 1.4909, + "step": 20 + }, + { + "epoch": 0.0395778364116095, + "grad_norm": 0.8616093993186951, + "learning_rate": 0.0002, + "loss": 1.3906, + "step": 30 + }, + { + "epoch": 0.052770448548812667, + "grad_norm": 0.42088547348976135, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 40 + }, + { + "epoch": 0.06596306068601583, + "grad_norm": 0.47704678773880005, + "learning_rate": 0.0002, + "loss": 1.3446, + "step": 50 + }, + { + "epoch": 0.079155672823219, + "grad_norm": 0.5763994455337524, + "learning_rate": 0.0002, + "loss": 1.2476, + "step": 60 + }, + { + "epoch": 0.09234828496042216, + "grad_norm": 0.4579846262931824, + "learning_rate": 0.0002, + "loss": 1.2268, + "step": 70 + }, + { + "epoch": 0.10554089709762533, + "grad_norm": 0.46623846888542175, + "learning_rate": 0.0002, + "loss": 1.415, + "step": 80 + }, + { + "epoch": 0.11873350923482849, + "grad_norm": 0.4206956624984741, + "learning_rate": 0.0002, + "loss": 1.2849, + "step": 90 + }, + { + "epoch": 0.13192612137203166, + "grad_norm": 0.41896629333496094, + "learning_rate": 0.0002, + "loss": 1.1608, + "step": 100 + }, + { + "epoch": 0.14511873350923482, + "grad_norm": 0.3459089398384094, + "learning_rate": 0.0002, + "loss": 1.2617, + "step": 110 + }, + { + "epoch": 0.158311345646438, + "grad_norm": 0.4587327837944031, + "learning_rate": 0.0002, + "loss": 1.2858, + "step": 120 + }, + { + "epoch": 0.17150395778364116, + "grad_norm": 0.433525413274765, + "learning_rate": 0.0002, + "loss": 1.2241, + "step": 130 + }, + { + "epoch": 0.18469656992084432, + "grad_norm": 0.39253175258636475, + "learning_rate": 0.0002, + "loss": 1.2269, + "step": 140 + }, + { + "epoch": 0.19788918205804748, + "grad_norm": 0.3602290749549866, + "learning_rate": 0.0002, + "loss": 1.2224, + "step": 150 + }, + { + "epoch": 0.21108179419525067, + "grad_norm": 0.41160839796066284, + "learning_rate": 0.0002, + "loss": 1.2834, + "step": 160 + }, + { + "epoch": 0.22427440633245382, + "grad_norm": 0.7213630080223083, + "learning_rate": 0.0002, + "loss": 1.1986, + "step": 170 + }, + { + "epoch": 0.23746701846965698, + "grad_norm": 0.39086055755615234, + "learning_rate": 0.0002, + "loss": 1.2215, + "step": 180 + }, + { + "epoch": 0.25065963060686014, + "grad_norm": 0.4465520977973938, + "learning_rate": 0.0002, + "loss": 1.3315, + "step": 190 + }, + { + "epoch": 0.2638522427440633, + "grad_norm": 1.814679741859436, + "learning_rate": 0.0002, + "loss": 1.3798, + "step": 200 + }, + { + "epoch": 0.2770448548812665, + "grad_norm": 0.5026423931121826, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 210 + }, + { + "epoch": 0.29023746701846964, + "grad_norm": 0.4156292974948883, + "learning_rate": 0.0002, + "loss": 1.3306, + "step": 220 + }, + { + "epoch": 0.3034300791556728, + "grad_norm": 0.40813493728637695, + "learning_rate": 0.0002, + "loss": 1.266, + "step": 230 + }, + { + "epoch": 0.316622691292876, + "grad_norm": 0.3304787874221802, + "learning_rate": 0.0002, + "loss": 1.1533, + "step": 240 + }, + { + "epoch": 0.32981530343007914, + "grad_norm": 0.46139976382255554, + "learning_rate": 0.0002, + "loss": 1.3154, + "step": 250 + }, + { + "epoch": 0.34300791556728233, + "grad_norm": 0.37518271803855896, + "learning_rate": 0.0002, + "loss": 1.1365, + "step": 260 + }, + { + "epoch": 0.3562005277044855, + "grad_norm": 0.35586467385292053, + "learning_rate": 0.0002, + "loss": 1.2004, + "step": 270 + }, + { + "epoch": 0.36939313984168864, + "grad_norm": 0.32441186904907227, + "learning_rate": 0.0002, + "loss": 1.2609, + "step": 280 + }, + { + "epoch": 0.38258575197889183, + "grad_norm": 0.3198683261871338, + "learning_rate": 0.0002, + "loss": 1.212, + "step": 290 + }, + { + "epoch": 0.39577836411609496, + "grad_norm": 0.33663108944892883, + "learning_rate": 0.0002, + "loss": 1.2465, + "step": 300 + }, + { + "epoch": 0.40897097625329815, + "grad_norm": 0.3711244761943817, + "learning_rate": 0.0002, + "loss": 1.1702, + "step": 310 + }, + { + "epoch": 0.42216358839050133, + "grad_norm": 0.3209651708602905, + "learning_rate": 0.0002, + "loss": 1.0871, + "step": 320 + }, + { + "epoch": 0.43535620052770446, + "grad_norm": 0.5152716040611267, + "learning_rate": 0.0002, + "loss": 1.2728, + "step": 330 + }, + { + "epoch": 0.44854881266490765, + "grad_norm": 0.5431376695632935, + "learning_rate": 0.0002, + "loss": 1.1833, + "step": 340 + }, + { + "epoch": 0.46174142480211083, + "grad_norm": 0.3069997727870941, + "learning_rate": 0.0002, + "loss": 1.1344, + "step": 350 + }, + { + "epoch": 0.47493403693931396, + "grad_norm": 0.34260064363479614, + "learning_rate": 0.0002, + "loss": 1.2742, + "step": 360 + }, + { + "epoch": 0.48812664907651715, + "grad_norm": 0.345653235912323, + "learning_rate": 0.0002, + "loss": 1.3161, + "step": 370 + }, + { + "epoch": 0.5013192612137203, + "grad_norm": 0.46222734451293945, + "learning_rate": 0.0002, + "loss": 1.1552, + "step": 380 + }, + { + "epoch": 0.5145118733509235, + "grad_norm": 0.27301734685897827, + "learning_rate": 0.0002, + "loss": 1.0616, + "step": 390 + }, + { + "epoch": 0.5277044854881267, + "grad_norm": 0.29048439860343933, + "learning_rate": 0.0002, + "loss": 1.1944, + "step": 400 + }, + { + "epoch": 0.5408970976253298, + "grad_norm": 0.32927802205085754, + "learning_rate": 0.0002, + "loss": 1.0956, + "step": 410 + }, + { + "epoch": 0.554089709762533, + "grad_norm": 0.3336397409439087, + "learning_rate": 0.0002, + "loss": 1.2362, + "step": 420 + }, + { + "epoch": 0.5672823218997362, + "grad_norm": 0.4007597267627716, + "learning_rate": 0.0002, + "loss": 1.1445, + "step": 430 + }, + { + "epoch": 0.5804749340369393, + "grad_norm": 0.36144956946372986, + "learning_rate": 0.0002, + "loss": 1.2731, + "step": 440 + }, + { + "epoch": 0.5936675461741425, + "grad_norm": 0.6331009864807129, + "learning_rate": 0.0002, + "loss": 1.1604, + "step": 450 + }, + { + "epoch": 0.6068601583113457, + "grad_norm": 0.41469088196754456, + "learning_rate": 0.0002, + "loss": 1.1692, + "step": 460 + }, + { + "epoch": 0.6200527704485488, + "grad_norm": 0.4388185143470764, + "learning_rate": 0.0002, + "loss": 1.2372, + "step": 470 + }, + { + "epoch": 0.633245382585752, + "grad_norm": 0.3738141655921936, + "learning_rate": 0.0002, + "loss": 1.2541, + "step": 480 + }, + { + "epoch": 0.6464379947229552, + "grad_norm": 0.7212023138999939, + "learning_rate": 0.0002, + "loss": 1.1265, + "step": 490 + }, + { + "epoch": 0.6596306068601583, + "grad_norm": 0.2972351014614105, + "learning_rate": 0.0002, + "loss": 1.2786, + "step": 500 + }, + { + "epoch": 0.6728232189973615, + "grad_norm": 0.45293179154396057, + "learning_rate": 0.0002, + "loss": 1.1739, + "step": 510 + }, + { + "epoch": 0.6860158311345647, + "grad_norm": 0.4319860637187958, + "learning_rate": 0.0002, + "loss": 1.255, + "step": 520 + }, + { + "epoch": 0.6992084432717678, + "grad_norm": 0.3050215542316437, + "learning_rate": 0.0002, + "loss": 1.0959, + "step": 530 + }, + { + "epoch": 0.712401055408971, + "grad_norm": 0.3552611172199249, + "learning_rate": 0.0002, + "loss": 1.1608, + "step": 540 + }, + { + "epoch": 0.7255936675461742, + "grad_norm": 0.3631151020526886, + "learning_rate": 0.0002, + "loss": 1.2546, + "step": 550 + }, + { + "epoch": 0.7387862796833773, + "grad_norm": 0.28177931904792786, + "learning_rate": 0.0002, + "loss": 1.1371, + "step": 560 + }, + { + "epoch": 0.7519788918205804, + "grad_norm": 0.359764039516449, + "learning_rate": 0.0002, + "loss": 1.2081, + "step": 570 + }, + { + "epoch": 0.7651715039577837, + "grad_norm": 0.3970327377319336, + "learning_rate": 0.0002, + "loss": 1.1356, + "step": 580 + }, + { + "epoch": 0.7783641160949868, + "grad_norm": 0.3541001081466675, + "learning_rate": 0.0002, + "loss": 1.309, + "step": 590 + }, + { + "epoch": 0.7915567282321899, + "grad_norm": 0.3478573262691498, + "learning_rate": 0.0002, + "loss": 1.2061, + "step": 600 + }, + { + "epoch": 0.8047493403693932, + "grad_norm": 0.3900321424007416, + "learning_rate": 0.0002, + "loss": 1.1864, + "step": 610 + }, + { + "epoch": 0.8179419525065963, + "grad_norm": 0.3443238437175751, + "learning_rate": 0.0002, + "loss": 1.1358, + "step": 620 + }, + { + "epoch": 0.8311345646437994, + "grad_norm": 0.44238781929016113, + "learning_rate": 0.0002, + "loss": 1.3232, + "step": 630 + }, + { + "epoch": 0.8443271767810027, + "grad_norm": 0.36339467763900757, + "learning_rate": 0.0002, + "loss": 1.1247, + "step": 640 + }, + { + "epoch": 0.8575197889182058, + "grad_norm": 0.6243070363998413, + "learning_rate": 0.0002, + "loss": 1.1049, + "step": 650 + }, + { + "epoch": 0.8707124010554089, + "grad_norm": 0.3209173381328583, + "learning_rate": 0.0002, + "loss": 1.1943, + "step": 660 + }, + { + "epoch": 0.8839050131926122, + "grad_norm": 0.35017991065979004, + "learning_rate": 0.0002, + "loss": 1.1927, + "step": 670 + }, + { + "epoch": 0.8970976253298153, + "grad_norm": 0.3247159421443939, + "learning_rate": 0.0002, + "loss": 1.1603, + "step": 680 + }, + { + "epoch": 0.9102902374670184, + "grad_norm": 0.4091894030570984, + "learning_rate": 0.0002, + "loss": 1.2298, + "step": 690 + }, + { + "epoch": 0.9234828496042217, + "grad_norm": 0.3975585997104645, + "learning_rate": 0.0002, + "loss": 1.2756, + "step": 700 + }, + { + "epoch": 0.9366754617414248, + "grad_norm": 0.3666245937347412, + "learning_rate": 0.0002, + "loss": 1.281, + "step": 710 + }, + { + "epoch": 0.9498680738786279, + "grad_norm": 0.45216917991638184, + "learning_rate": 0.0002, + "loss": 1.2855, + "step": 720 + }, + { + "epoch": 0.9630606860158312, + "grad_norm": 0.36108118295669556, + "learning_rate": 0.0002, + "loss": 1.2217, + "step": 730 + }, + { + "epoch": 0.9762532981530343, + "grad_norm": 0.44550251960754395, + "learning_rate": 0.0002, + "loss": 1.17, + "step": 740 + }, + { + "epoch": 0.9894459102902374, + "grad_norm": 0.29801255464553833, + "learning_rate": 0.0002, + "loss": 1.0672, + "step": 750 + }, + { + "epoch": 1.0, + "eval_loss": 1.203244686126709, + "eval_runtime": 76.0457, + "eval_samples_per_second": 5.668, + "eval_steps_per_second": 0.71, + "step": 758 + }, + { + "epoch": 1.0026385224274406, + "grad_norm": 0.4096551239490509, + "learning_rate": 0.0002, + "loss": 1.0748, + "step": 760 + }, + { + "epoch": 1.0158311345646438, + "grad_norm": 0.2649582326412201, + "learning_rate": 0.0002, + "loss": 1.1537, + "step": 770 + }, + { + "epoch": 1.029023746701847, + "grad_norm": 0.3100722134113312, + "learning_rate": 0.0002, + "loss": 1.1676, + "step": 780 + }, + { + "epoch": 1.04221635883905, + "grad_norm": 0.3911755383014679, + "learning_rate": 0.0002, + "loss": 1.1736, + "step": 790 + }, + { + "epoch": 1.0554089709762533, + "grad_norm": 0.4600953757762909, + "learning_rate": 0.0002, + "loss": 1.0354, + "step": 800 + }, + { + "epoch": 1.0686015831134565, + "grad_norm": 0.28671619296073914, + "learning_rate": 0.0002, + "loss": 1.2869, + "step": 810 + }, + { + "epoch": 1.0817941952506596, + "grad_norm": 0.47282642126083374, + "learning_rate": 0.0002, + "loss": 1.1441, + "step": 820 + }, + { + "epoch": 1.0949868073878628, + "grad_norm": 0.690073549747467, + "learning_rate": 0.0002, + "loss": 1.1368, + "step": 830 + }, + { + "epoch": 1.108179419525066, + "grad_norm": 0.7317902445793152, + "learning_rate": 0.0002, + "loss": 0.9944, + "step": 840 + }, + { + "epoch": 1.121372031662269, + "grad_norm": 0.44215938448905945, + "learning_rate": 0.0002, + "loss": 1.108, + "step": 850 + }, + { + "epoch": 1.1345646437994723, + "grad_norm": 0.33875149488449097, + "learning_rate": 0.0002, + "loss": 1.0558, + "step": 860 + }, + { + "epoch": 1.1477572559366755, + "grad_norm": 0.3700002431869507, + "learning_rate": 0.0002, + "loss": 1.0471, + "step": 870 + }, + { + "epoch": 1.1609498680738786, + "grad_norm": 0.41173291206359863, + "learning_rate": 0.0002, + "loss": 1.0121, + "step": 880 + }, + { + "epoch": 1.1741424802110818, + "grad_norm": 0.5253589749336243, + "learning_rate": 0.0002, + "loss": 1.0312, + "step": 890 + }, + { + "epoch": 1.187335092348285, + "grad_norm": 0.3912237286567688, + "learning_rate": 0.0002, + "loss": 1.2238, + "step": 900 + }, + { + "epoch": 1.200527704485488, + "grad_norm": 0.40990331768989563, + "learning_rate": 0.0002, + "loss": 0.9434, + "step": 910 + }, + { + "epoch": 1.2137203166226913, + "grad_norm": 0.40377968549728394, + "learning_rate": 0.0002, + "loss": 1.0493, + "step": 920 + }, + { + "epoch": 1.2269129287598945, + "grad_norm": 0.4605846405029297, + "learning_rate": 0.0002, + "loss": 1.1496, + "step": 930 + }, + { + "epoch": 1.2401055408970976, + "grad_norm": 0.31564897298812866, + "learning_rate": 0.0002, + "loss": 1.1173, + "step": 940 + }, + { + "epoch": 1.2532981530343008, + "grad_norm": 0.39808550477027893, + "learning_rate": 0.0002, + "loss": 1.0547, + "step": 950 + }, + { + "epoch": 1.266490765171504, + "grad_norm": 0.3762115240097046, + "learning_rate": 0.0002, + "loss": 1.1367, + "step": 960 + }, + { + "epoch": 1.279683377308707, + "grad_norm": 0.4174984097480774, + "learning_rate": 0.0002, + "loss": 1.1596, + "step": 970 + }, + { + "epoch": 1.2928759894459103, + "grad_norm": 0.5263054966926575, + "learning_rate": 0.0002, + "loss": 1.1327, + "step": 980 + }, + { + "epoch": 1.3060686015831133, + "grad_norm": 0.41673699021339417, + "learning_rate": 0.0002, + "loss": 1.0339, + "step": 990 + }, + { + "epoch": 1.3192612137203166, + "grad_norm": 0.9613684415817261, + "learning_rate": 0.0002, + "loss": 1.1198, + "step": 1000 + }, + { + "epoch": 1.3324538258575198, + "grad_norm": 0.3690216839313507, + "learning_rate": 0.0002, + "loss": 1.0444, + "step": 1010 + }, + { + "epoch": 1.345646437994723, + "grad_norm": 0.521821141242981, + "learning_rate": 0.0002, + "loss": 1.0473, + "step": 1020 + }, + { + "epoch": 1.358839050131926, + "grad_norm": 0.3353094160556793, + "learning_rate": 0.0002, + "loss": 1.1065, + "step": 1030 + }, + { + "epoch": 1.3720316622691293, + "grad_norm": 0.3843843936920166, + "learning_rate": 0.0002, + "loss": 1.1286, + "step": 1040 + }, + { + "epoch": 1.3852242744063323, + "grad_norm": 0.372514545917511, + "learning_rate": 0.0002, + "loss": 1.1369, + "step": 1050 + }, + { + "epoch": 1.3984168865435356, + "grad_norm": 0.34537771344184875, + "learning_rate": 0.0002, + "loss": 1.0041, + "step": 1060 + }, + { + "epoch": 1.4116094986807388, + "grad_norm": 0.45349085330963135, + "learning_rate": 0.0002, + "loss": 1.0251, + "step": 1070 + }, + { + "epoch": 1.424802110817942, + "grad_norm": 0.5120177268981934, + "learning_rate": 0.0002, + "loss": 1.1873, + "step": 1080 + }, + { + "epoch": 1.437994722955145, + "grad_norm": 0.42800238728523254, + "learning_rate": 0.0002, + "loss": 1.0872, + "step": 1090 + }, + { + "epoch": 1.4511873350923483, + "grad_norm": 0.343832790851593, + "learning_rate": 0.0002, + "loss": 1.0734, + "step": 1100 + }, + { + "epoch": 1.4643799472295513, + "grad_norm": 0.3829841911792755, + "learning_rate": 0.0002, + "loss": 1.1286, + "step": 1110 + }, + { + "epoch": 1.4775725593667546, + "grad_norm": 0.4289931058883667, + "learning_rate": 0.0002, + "loss": 1.1268, + "step": 1120 + }, + { + "epoch": 1.4907651715039578, + "grad_norm": 0.42750850319862366, + "learning_rate": 0.0002, + "loss": 1.0676, + "step": 1130 + }, + { + "epoch": 1.503957783641161, + "grad_norm": 0.34328413009643555, + "learning_rate": 0.0002, + "loss": 1.072, + "step": 1140 + }, + { + "epoch": 1.517150395778364, + "grad_norm": 0.349096417427063, + "learning_rate": 0.0002, + "loss": 1.0863, + "step": 1150 + }, + { + "epoch": 1.5303430079155673, + "grad_norm": 0.7700717449188232, + "learning_rate": 0.0002, + "loss": 1.1462, + "step": 1160 + }, + { + "epoch": 1.5435356200527703, + "grad_norm": 0.39294949173927307, + "learning_rate": 0.0002, + "loss": 0.9528, + "step": 1170 + }, + { + "epoch": 1.5567282321899736, + "grad_norm": 0.36173608899116516, + "learning_rate": 0.0002, + "loss": 1.1603, + "step": 1180 + }, + { + "epoch": 1.5699208443271768, + "grad_norm": 0.6034277677536011, + "learning_rate": 0.0002, + "loss": 1.1508, + "step": 1190 + }, + { + "epoch": 1.58311345646438, + "grad_norm": 0.36694103479385376, + "learning_rate": 0.0002, + "loss": 1.0105, + "step": 1200 + }, + { + "epoch": 1.596306068601583, + "grad_norm": 0.4727209508419037, + "learning_rate": 0.0002, + "loss": 1.1479, + "step": 1210 + }, + { + "epoch": 1.6094986807387863, + "grad_norm": 0.6482883095741272, + "learning_rate": 0.0002, + "loss": 1.0689, + "step": 1220 + }, + { + "epoch": 1.6226912928759893, + "grad_norm": 0.5238035917282104, + "learning_rate": 0.0002, + "loss": 1.1405, + "step": 1230 + }, + { + "epoch": 1.6358839050131926, + "grad_norm": 0.4812222421169281, + "learning_rate": 0.0002, + "loss": 1.0596, + "step": 1240 + }, + { + "epoch": 1.6490765171503958, + "grad_norm": 0.7131702303886414, + "learning_rate": 0.0002, + "loss": 1.2729, + "step": 1250 + }, + { + "epoch": 1.662269129287599, + "grad_norm": 0.3803327977657318, + "learning_rate": 0.0002, + "loss": 0.9832, + "step": 1260 + }, + { + "epoch": 1.675461741424802, + "grad_norm": 0.3745088577270508, + "learning_rate": 0.0002, + "loss": 1.1433, + "step": 1270 + }, + { + "epoch": 1.6886543535620053, + "grad_norm": 0.4427378475666046, + "learning_rate": 0.0002, + "loss": 1.1018, + "step": 1280 + }, + { + "epoch": 1.7018469656992083, + "grad_norm": 0.797478973865509, + "learning_rate": 0.0002, + "loss": 1.0619, + "step": 1290 + }, + { + "epoch": 1.7150395778364116, + "grad_norm": 0.503620982170105, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 1300 + }, + { + "epoch": 1.7282321899736148, + "grad_norm": 0.4132426381111145, + "learning_rate": 0.0002, + "loss": 1.1324, + "step": 1310 + }, + { + "epoch": 1.741424802110818, + "grad_norm": 0.41811656951904297, + "learning_rate": 0.0002, + "loss": 1.172, + "step": 1320 + }, + { + "epoch": 1.754617414248021, + "grad_norm": 0.40647849440574646, + "learning_rate": 0.0002, + "loss": 1.0903, + "step": 1330 + }, + { + "epoch": 1.767810026385224, + "grad_norm": 0.42138347029685974, + "learning_rate": 0.0002, + "loss": 1.1316, + "step": 1340 + }, + { + "epoch": 1.7810026385224274, + "grad_norm": 0.46523579955101013, + "learning_rate": 0.0002, + "loss": 0.9955, + "step": 1350 + }, + { + "epoch": 1.7941952506596306, + "grad_norm": 0.39760419726371765, + "learning_rate": 0.0002, + "loss": 1.0307, + "step": 1360 + }, + { + "epoch": 1.8073878627968338, + "grad_norm": 0.37993717193603516, + "learning_rate": 0.0002, + "loss": 1.1218, + "step": 1370 + }, + { + "epoch": 1.820580474934037, + "grad_norm": 0.5404181480407715, + "learning_rate": 0.0002, + "loss": 1.0921, + "step": 1380 + }, + { + "epoch": 1.83377308707124, + "grad_norm": 0.4385245740413666, + "learning_rate": 0.0002, + "loss": 1.0903, + "step": 1390 + }, + { + "epoch": 1.8469656992084431, + "grad_norm": 0.529797375202179, + "learning_rate": 0.0002, + "loss": 1.1175, + "step": 1400 + }, + { + "epoch": 1.8601583113456464, + "grad_norm": 0.481567919254303, + "learning_rate": 0.0002, + "loss": 1.0577, + "step": 1410 + }, + { + "epoch": 1.8733509234828496, + "grad_norm": 0.34787362813949585, + "learning_rate": 0.0002, + "loss": 1.0581, + "step": 1420 + }, + { + "epoch": 1.8865435356200528, + "grad_norm": 0.6402362585067749, + "learning_rate": 0.0002, + "loss": 1.0536, + "step": 1430 + }, + { + "epoch": 1.899736147757256, + "grad_norm": 0.3461322784423828, + "learning_rate": 0.0002, + "loss": 1.0787, + "step": 1440 + }, + { + "epoch": 1.912928759894459, + "grad_norm": 0.44005653262138367, + "learning_rate": 0.0002, + "loss": 1.0925, + "step": 1450 + }, + { + "epoch": 1.9261213720316621, + "grad_norm": 0.4064280688762665, + "learning_rate": 0.0002, + "loss": 1.0414, + "step": 1460 + }, + { + "epoch": 1.9393139841688654, + "grad_norm": 0.5236523151397705, + "learning_rate": 0.0002, + "loss": 1.0608, + "step": 1470 + }, + { + "epoch": 1.9525065963060686, + "grad_norm": 0.41030219197273254, + "learning_rate": 0.0002, + "loss": 1.0572, + "step": 1480 + }, + { + "epoch": 1.9656992084432718, + "grad_norm": 0.39805835485458374, + "learning_rate": 0.0002, + "loss": 1.1204, + "step": 1490 + }, + { + "epoch": 1.978891820580475, + "grad_norm": 0.42974501848220825, + "learning_rate": 0.0002, + "loss": 1.0364, + "step": 1500 + }, + { + "epoch": 1.992084432717678, + "grad_norm": 0.4688243865966797, + "learning_rate": 0.0002, + "loss": 1.1169, + "step": 1510 + }, + { + "epoch": 2.0, + "eval_loss": 1.1874967813491821, + "eval_runtime": 71.9523, + "eval_samples_per_second": 5.99, + "eval_steps_per_second": 0.75, + "step": 1516 + }, + { + "epoch": 2.005277044854881, + "grad_norm": 0.4121631383895874, + "learning_rate": 0.0002, + "loss": 1.0401, + "step": 1520 + }, + { + "epoch": 2.0184696569920844, + "grad_norm": 0.4844197928905487, + "learning_rate": 0.0002, + "loss": 0.9384, + "step": 1530 + }, + { + "epoch": 2.0316622691292876, + "grad_norm": 0.45408546924591064, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 1540 + }, + { + "epoch": 2.044854881266491, + "grad_norm": 0.48662951588630676, + "learning_rate": 0.0002, + "loss": 1.0399, + "step": 1550 + }, + { + "epoch": 2.058047493403694, + "grad_norm": 0.7195899486541748, + "learning_rate": 0.0002, + "loss": 0.936, + "step": 1560 + }, + { + "epoch": 2.0712401055408973, + "grad_norm": 0.5071077346801758, + "learning_rate": 0.0002, + "loss": 0.9486, + "step": 1570 + }, + { + "epoch": 2.0844327176781, + "grad_norm": 0.7473958730697632, + "learning_rate": 0.0002, + "loss": 1.0055, + "step": 1580 + }, + { + "epoch": 2.0976253298153034, + "grad_norm": 0.5509232878684998, + "learning_rate": 0.0002, + "loss": 0.8309, + "step": 1590 + }, + { + "epoch": 2.1108179419525066, + "grad_norm": 0.5108042359352112, + "learning_rate": 0.0002, + "loss": 0.9181, + "step": 1600 + }, + { + "epoch": 2.12401055408971, + "grad_norm": 0.42331448197364807, + "learning_rate": 0.0002, + "loss": 0.9499, + "step": 1610 + }, + { + "epoch": 2.137203166226913, + "grad_norm": 0.46621623635292053, + "learning_rate": 0.0002, + "loss": 0.9359, + "step": 1620 + }, + { + "epoch": 2.150395778364116, + "grad_norm": 0.43802836537361145, + "learning_rate": 0.0002, + "loss": 0.9065, + "step": 1630 + }, + { + "epoch": 2.163588390501319, + "grad_norm": 0.49908021092414856, + "learning_rate": 0.0002, + "loss": 0.9375, + "step": 1640 + }, + { + "epoch": 2.1767810026385224, + "grad_norm": 0.4195636808872223, + "learning_rate": 0.0002, + "loss": 1.0389, + "step": 1650 + }, + { + "epoch": 2.1899736147757256, + "grad_norm": 0.49515822529792786, + "learning_rate": 0.0002, + "loss": 0.8501, + "step": 1660 + }, + { + "epoch": 2.203166226912929, + "grad_norm": 0.4607589542865753, + "learning_rate": 0.0002, + "loss": 1.0557, + "step": 1670 + }, + { + "epoch": 2.216358839050132, + "grad_norm": 0.4489196836948395, + "learning_rate": 0.0002, + "loss": 1.0206, + "step": 1680 + }, + { + "epoch": 2.229551451187335, + "grad_norm": 0.49300864338874817, + "learning_rate": 0.0002, + "loss": 0.9657, + "step": 1690 + }, + { + "epoch": 2.242744063324538, + "grad_norm": 0.6624954342842102, + "learning_rate": 0.0002, + "loss": 0.8371, + "step": 1700 + }, + { + "epoch": 2.2559366754617414, + "grad_norm": 0.8391500115394592, + "learning_rate": 0.0002, + "loss": 0.8555, + "step": 1710 + }, + { + "epoch": 2.2691292875989446, + "grad_norm": 0.5193073749542236, + "learning_rate": 0.0002, + "loss": 1.0113, + "step": 1720 + }, + { + "epoch": 2.282321899736148, + "grad_norm": 0.6180613040924072, + "learning_rate": 0.0002, + "loss": 0.9979, + "step": 1730 + }, + { + "epoch": 2.295514511873351, + "grad_norm": 0.591191291809082, + "learning_rate": 0.0002, + "loss": 0.9579, + "step": 1740 + }, + { + "epoch": 2.308707124010554, + "grad_norm": 0.546897828578949, + "learning_rate": 0.0002, + "loss": 0.8879, + "step": 1750 + }, + { + "epoch": 2.321899736147757, + "grad_norm": 0.5470401644706726, + "learning_rate": 0.0002, + "loss": 0.9321, + "step": 1760 + }, + { + "epoch": 2.3350923482849604, + "grad_norm": 0.4590282738208771, + "learning_rate": 0.0002, + "loss": 0.9104, + "step": 1770 + }, + { + "epoch": 2.3482849604221636, + "grad_norm": 0.622164785861969, + "learning_rate": 0.0002, + "loss": 1.0384, + "step": 1780 + }, + { + "epoch": 2.361477572559367, + "grad_norm": 0.5753812193870544, + "learning_rate": 0.0002, + "loss": 1.0297, + "step": 1790 + }, + { + "epoch": 2.37467018469657, + "grad_norm": 0.47958624362945557, + "learning_rate": 0.0002, + "loss": 0.9569, + "step": 1800 + }, + { + "epoch": 2.387862796833773, + "grad_norm": 0.48042672872543335, + "learning_rate": 0.0002, + "loss": 1.0519, + "step": 1810 + }, + { + "epoch": 2.401055408970976, + "grad_norm": 0.44586366415023804, + "learning_rate": 0.0002, + "loss": 0.9042, + "step": 1820 + }, + { + "epoch": 2.4142480211081794, + "grad_norm": 0.7239416837692261, + "learning_rate": 0.0002, + "loss": 0.9783, + "step": 1830 + }, + { + "epoch": 2.4274406332453826, + "grad_norm": 0.5515341758728027, + "learning_rate": 0.0002, + "loss": 0.8818, + "step": 1840 + }, + { + "epoch": 2.440633245382586, + "grad_norm": 0.6280064582824707, + "learning_rate": 0.0002, + "loss": 0.9503, + "step": 1850 + }, + { + "epoch": 2.453825857519789, + "grad_norm": 0.4832057058811188, + "learning_rate": 0.0002, + "loss": 0.8943, + "step": 1860 + }, + { + "epoch": 2.467018469656992, + "grad_norm": 0.5789321064949036, + "learning_rate": 0.0002, + "loss": 0.8744, + "step": 1870 + }, + { + "epoch": 2.480211081794195, + "grad_norm": 0.48491886258125305, + "learning_rate": 0.0002, + "loss": 0.9332, + "step": 1880 + }, + { + "epoch": 2.4934036939313984, + "grad_norm": 0.532365620136261, + "learning_rate": 0.0002, + "loss": 0.943, + "step": 1890 + }, + { + "epoch": 2.5065963060686016, + "grad_norm": 0.7087852954864502, + "learning_rate": 0.0002, + "loss": 0.9734, + "step": 1900 + }, + { + "epoch": 2.519788918205805, + "grad_norm": 0.48157402873039246, + "learning_rate": 0.0002, + "loss": 0.9767, + "step": 1910 + }, + { + "epoch": 2.532981530343008, + "grad_norm": 0.5886041522026062, + "learning_rate": 0.0002, + "loss": 0.9851, + "step": 1920 + }, + { + "epoch": 2.5461741424802113, + "grad_norm": 0.6332622766494751, + "learning_rate": 0.0002, + "loss": 1.0144, + "step": 1930 + }, + { + "epoch": 2.559366754617414, + "grad_norm": 0.5463117957115173, + "learning_rate": 0.0002, + "loss": 0.9516, + "step": 1940 + }, + { + "epoch": 2.5725593667546174, + "grad_norm": 0.5432228446006775, + "learning_rate": 0.0002, + "loss": 0.9373, + "step": 1950 + }, + { + "epoch": 2.5857519788918206, + "grad_norm": 0.5929186940193176, + "learning_rate": 0.0002, + "loss": 0.8974, + "step": 1960 + }, + { + "epoch": 2.598944591029024, + "grad_norm": 0.5120641589164734, + "learning_rate": 0.0002, + "loss": 1.0062, + "step": 1970 + }, + { + "epoch": 2.6121372031662267, + "grad_norm": 0.5372339487075806, + "learning_rate": 0.0002, + "loss": 0.9143, + "step": 1980 + }, + { + "epoch": 2.62532981530343, + "grad_norm": 0.5519838929176331, + "learning_rate": 0.0002, + "loss": 1.0786, + "step": 1990 + }, + { + "epoch": 2.638522427440633, + "grad_norm": 0.7304037809371948, + "learning_rate": 0.0002, + "loss": 1.021, + "step": 2000 + }, + { + "epoch": 2.6517150395778364, + "grad_norm": 0.6182340979576111, + "learning_rate": 0.0002, + "loss": 0.8708, + "step": 2010 + }, + { + "epoch": 2.6649076517150396, + "grad_norm": 0.4874444305896759, + "learning_rate": 0.0002, + "loss": 1.0501, + "step": 2020 + }, + { + "epoch": 2.678100263852243, + "grad_norm": 0.5850239396095276, + "learning_rate": 0.0002, + "loss": 0.9612, + "step": 2030 + }, + { + "epoch": 2.691292875989446, + "grad_norm": 0.6495311856269836, + "learning_rate": 0.0002, + "loss": 0.9808, + "step": 2040 + }, + { + "epoch": 2.7044854881266494, + "grad_norm": 1.002830147743225, + "learning_rate": 0.0002, + "loss": 1.0187, + "step": 2050 + }, + { + "epoch": 2.717678100263852, + "grad_norm": 0.49076753854751587, + "learning_rate": 0.0002, + "loss": 1.0386, + "step": 2060 + }, + { + "epoch": 2.7308707124010554, + "grad_norm": 0.4736326336860657, + "learning_rate": 0.0002, + "loss": 0.9416, + "step": 2070 + }, + { + "epoch": 2.7440633245382586, + "grad_norm": 0.5527601838111877, + "learning_rate": 0.0002, + "loss": 0.8424, + "step": 2080 + }, + { + "epoch": 2.757255936675462, + "grad_norm": 0.7295718193054199, + "learning_rate": 0.0002, + "loss": 0.9149, + "step": 2090 + }, + { + "epoch": 2.7704485488126647, + "grad_norm": 0.5437536835670471, + "learning_rate": 0.0002, + "loss": 0.9032, + "step": 2100 + }, + { + "epoch": 2.783641160949868, + "grad_norm": 0.5997128486633301, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 2110 + }, + { + "epoch": 2.796833773087071, + "grad_norm": 0.6498191356658936, + "learning_rate": 0.0002, + "loss": 0.976, + "step": 2120 + }, + { + "epoch": 2.8100263852242744, + "grad_norm": 0.5237268805503845, + "learning_rate": 0.0002, + "loss": 0.9543, + "step": 2130 + }, + { + "epoch": 2.8232189973614776, + "grad_norm": 0.6033027172088623, + "learning_rate": 0.0002, + "loss": 0.9302, + "step": 2140 + }, + { + "epoch": 2.836411609498681, + "grad_norm": 0.6077138781547546, + "learning_rate": 0.0002, + "loss": 0.9625, + "step": 2150 + }, + { + "epoch": 2.849604221635884, + "grad_norm": 0.4127797484397888, + "learning_rate": 0.0002, + "loss": 0.9347, + "step": 2160 + }, + { + "epoch": 2.862796833773087, + "grad_norm": 0.8448635339736938, + "learning_rate": 0.0002, + "loss": 1.0459, + "step": 2170 + }, + { + "epoch": 2.87598944591029, + "grad_norm": 0.5669729113578796, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 2180 + }, + { + "epoch": 2.8891820580474934, + "grad_norm": 0.510231077671051, + "learning_rate": 0.0002, + "loss": 0.8555, + "step": 2190 + }, + { + "epoch": 2.9023746701846966, + "grad_norm": 0.8072245121002197, + "learning_rate": 0.0002, + "loss": 0.9267, + "step": 2200 + }, + { + "epoch": 2.9155672823219, + "grad_norm": 0.6055923104286194, + "learning_rate": 0.0002, + "loss": 0.9685, + "step": 2210 + }, + { + "epoch": 2.9287598944591027, + "grad_norm": 0.7384416460990906, + "learning_rate": 0.0002, + "loss": 0.9157, + "step": 2220 + }, + { + "epoch": 2.941952506596306, + "grad_norm": 0.4922751784324646, + "learning_rate": 0.0002, + "loss": 0.9712, + "step": 2230 + }, + { + "epoch": 2.955145118733509, + "grad_norm": 0.6039906740188599, + "learning_rate": 0.0002, + "loss": 0.9487, + "step": 2240 + }, + { + "epoch": 2.9683377308707124, + "grad_norm": 0.4751701354980469, + "learning_rate": 0.0002, + "loss": 0.9776, + "step": 2250 + }, + { + "epoch": 2.9815303430079156, + "grad_norm": 0.5698353052139282, + "learning_rate": 0.0002, + "loss": 1.0619, + "step": 2260 + }, + { + "epoch": 2.994722955145119, + "grad_norm": 0.893563449382782, + "learning_rate": 0.0002, + "loss": 1.1184, + "step": 2270 + }, + { + "epoch": 3.0, + "eval_loss": 1.2046419382095337, + "eval_runtime": 71.5992, + "eval_samples_per_second": 6.02, + "eval_steps_per_second": 0.754, + "step": 2274 + } + ], + "logging_steps": 10, + "max_steps": 6064, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0523562830777549e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..0c0289389bcb2236bf804b0a82c50e67c11a1379 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-2274/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e04722e246164eebe6b4f70dd50fe946c936a2bd25edd75bcedcfd4c7ff9474e +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/README.md b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..68a9aa647ef9d5d0368d1be4701a7e99db1dce05 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec1cbc4628b3fee3f1829d92255ffddc551acb3e80520fa386a6a0dd03eeee5 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c125efae66552828ac437d8a833002572d1e9179 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f02935c5cf306e25d9ca300b5c56058666be776ec46e54e9941227d9a294b306 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..feeb6cceb904d9085716d72732ed849ecc10014f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f9653dfeb17315af1012d18ce6b4eb4ca95cc7a4f59f3b8dc429d007d4017c7 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5d8f9a9803cd54cdbbec1a9ed0033598c77b676 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2caab0cf3fae0f533d57c0fc3a50ce930197ee3c8baeeb49aacb2ff5ef6f469b +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c359d8282e4c07fa239200ceb1026e6093d4e6d4 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/trainer_state.json @@ -0,0 +1,2186 @@ +{ + "best_metric": 1.1874967813491821, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 3032, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013192612137203167, + "grad_norm": 0.7545632123947144, + "learning_rate": 0.0002, + "loss": 1.9466, + "step": 10 + }, + { + "epoch": 0.026385224274406333, + "grad_norm": 0.5787661075592041, + "learning_rate": 0.0002, + "loss": 1.4909, + "step": 20 + }, + { + "epoch": 0.0395778364116095, + "grad_norm": 0.8616093993186951, + "learning_rate": 0.0002, + "loss": 1.3906, + "step": 30 + }, + { + "epoch": 0.052770448548812667, + "grad_norm": 0.42088547348976135, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 40 + }, + { + "epoch": 0.06596306068601583, + "grad_norm": 0.47704678773880005, + "learning_rate": 0.0002, + "loss": 1.3446, + "step": 50 + }, + { + "epoch": 0.079155672823219, + "grad_norm": 0.5763994455337524, + "learning_rate": 0.0002, + "loss": 1.2476, + "step": 60 + }, + { + "epoch": 0.09234828496042216, + "grad_norm": 0.4579846262931824, + "learning_rate": 0.0002, + "loss": 1.2268, + "step": 70 + }, + { + "epoch": 0.10554089709762533, + "grad_norm": 0.46623846888542175, + "learning_rate": 0.0002, + "loss": 1.415, + "step": 80 + }, + { + "epoch": 0.11873350923482849, + "grad_norm": 0.4206956624984741, + "learning_rate": 0.0002, + "loss": 1.2849, + "step": 90 + }, + { + "epoch": 0.13192612137203166, + "grad_norm": 0.41896629333496094, + "learning_rate": 0.0002, + "loss": 1.1608, + "step": 100 + }, + { + "epoch": 0.14511873350923482, + "grad_norm": 0.3459089398384094, + "learning_rate": 0.0002, + "loss": 1.2617, + "step": 110 + }, + { + "epoch": 0.158311345646438, + "grad_norm": 0.4587327837944031, + "learning_rate": 0.0002, + "loss": 1.2858, + "step": 120 + }, + { + "epoch": 0.17150395778364116, + "grad_norm": 0.433525413274765, + "learning_rate": 0.0002, + "loss": 1.2241, + "step": 130 + }, + { + "epoch": 0.18469656992084432, + "grad_norm": 0.39253175258636475, + "learning_rate": 0.0002, + "loss": 1.2269, + "step": 140 + }, + { + "epoch": 0.19788918205804748, + "grad_norm": 0.3602290749549866, + "learning_rate": 0.0002, + "loss": 1.2224, + "step": 150 + }, + { + "epoch": 0.21108179419525067, + "grad_norm": 0.41160839796066284, + "learning_rate": 0.0002, + "loss": 1.2834, + "step": 160 + }, + { + "epoch": 0.22427440633245382, + "grad_norm": 0.7213630080223083, + "learning_rate": 0.0002, + "loss": 1.1986, + "step": 170 + }, + { + "epoch": 0.23746701846965698, + "grad_norm": 0.39086055755615234, + "learning_rate": 0.0002, + "loss": 1.2215, + "step": 180 + }, + { + "epoch": 0.25065963060686014, + "grad_norm": 0.4465520977973938, + "learning_rate": 0.0002, + "loss": 1.3315, + "step": 190 + }, + { + "epoch": 0.2638522427440633, + "grad_norm": 1.814679741859436, + "learning_rate": 0.0002, + "loss": 1.3798, + "step": 200 + }, + { + "epoch": 0.2770448548812665, + "grad_norm": 0.5026423931121826, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 210 + }, + { + "epoch": 0.29023746701846964, + "grad_norm": 0.4156292974948883, + "learning_rate": 0.0002, + "loss": 1.3306, + "step": 220 + }, + { + "epoch": 0.3034300791556728, + "grad_norm": 0.40813493728637695, + "learning_rate": 0.0002, + "loss": 1.266, + "step": 230 + }, + { + "epoch": 0.316622691292876, + "grad_norm": 0.3304787874221802, + "learning_rate": 0.0002, + "loss": 1.1533, + "step": 240 + }, + { + "epoch": 0.32981530343007914, + "grad_norm": 0.46139976382255554, + "learning_rate": 0.0002, + "loss": 1.3154, + "step": 250 + }, + { + "epoch": 0.34300791556728233, + "grad_norm": 0.37518271803855896, + "learning_rate": 0.0002, + "loss": 1.1365, + "step": 260 + }, + { + "epoch": 0.3562005277044855, + "grad_norm": 0.35586467385292053, + "learning_rate": 0.0002, + "loss": 1.2004, + "step": 270 + }, + { + "epoch": 0.36939313984168864, + "grad_norm": 0.32441186904907227, + "learning_rate": 0.0002, + "loss": 1.2609, + "step": 280 + }, + { + "epoch": 0.38258575197889183, + "grad_norm": 0.3198683261871338, + "learning_rate": 0.0002, + "loss": 1.212, + "step": 290 + }, + { + "epoch": 0.39577836411609496, + "grad_norm": 0.33663108944892883, + "learning_rate": 0.0002, + "loss": 1.2465, + "step": 300 + }, + { + "epoch": 0.40897097625329815, + "grad_norm": 0.3711244761943817, + "learning_rate": 0.0002, + "loss": 1.1702, + "step": 310 + }, + { + "epoch": 0.42216358839050133, + "grad_norm": 0.3209651708602905, + "learning_rate": 0.0002, + "loss": 1.0871, + "step": 320 + }, + { + "epoch": 0.43535620052770446, + "grad_norm": 0.5152716040611267, + "learning_rate": 0.0002, + "loss": 1.2728, + "step": 330 + }, + { + "epoch": 0.44854881266490765, + "grad_norm": 0.5431376695632935, + "learning_rate": 0.0002, + "loss": 1.1833, + "step": 340 + }, + { + "epoch": 0.46174142480211083, + "grad_norm": 0.3069997727870941, + "learning_rate": 0.0002, + "loss": 1.1344, + "step": 350 + }, + { + "epoch": 0.47493403693931396, + "grad_norm": 0.34260064363479614, + "learning_rate": 0.0002, + "loss": 1.2742, + "step": 360 + }, + { + "epoch": 0.48812664907651715, + "grad_norm": 0.345653235912323, + "learning_rate": 0.0002, + "loss": 1.3161, + "step": 370 + }, + { + "epoch": 0.5013192612137203, + "grad_norm": 0.46222734451293945, + "learning_rate": 0.0002, + "loss": 1.1552, + "step": 380 + }, + { + "epoch": 0.5145118733509235, + "grad_norm": 0.27301734685897827, + "learning_rate": 0.0002, + "loss": 1.0616, + "step": 390 + }, + { + "epoch": 0.5277044854881267, + "grad_norm": 0.29048439860343933, + "learning_rate": 0.0002, + "loss": 1.1944, + "step": 400 + }, + { + "epoch": 0.5408970976253298, + "grad_norm": 0.32927802205085754, + "learning_rate": 0.0002, + "loss": 1.0956, + "step": 410 + }, + { + "epoch": 0.554089709762533, + "grad_norm": 0.3336397409439087, + "learning_rate": 0.0002, + "loss": 1.2362, + "step": 420 + }, + { + "epoch": 0.5672823218997362, + "grad_norm": 0.4007597267627716, + "learning_rate": 0.0002, + "loss": 1.1445, + "step": 430 + }, + { + "epoch": 0.5804749340369393, + "grad_norm": 0.36144956946372986, + "learning_rate": 0.0002, + "loss": 1.2731, + "step": 440 + }, + { + "epoch": 0.5936675461741425, + "grad_norm": 0.6331009864807129, + "learning_rate": 0.0002, + "loss": 1.1604, + "step": 450 + }, + { + "epoch": 0.6068601583113457, + "grad_norm": 0.41469088196754456, + "learning_rate": 0.0002, + "loss": 1.1692, + "step": 460 + }, + { + "epoch": 0.6200527704485488, + "grad_norm": 0.4388185143470764, + "learning_rate": 0.0002, + "loss": 1.2372, + "step": 470 + }, + { + "epoch": 0.633245382585752, + "grad_norm": 0.3738141655921936, + "learning_rate": 0.0002, + "loss": 1.2541, + "step": 480 + }, + { + "epoch": 0.6464379947229552, + "grad_norm": 0.7212023138999939, + "learning_rate": 0.0002, + "loss": 1.1265, + "step": 490 + }, + { + "epoch": 0.6596306068601583, + "grad_norm": 0.2972351014614105, + "learning_rate": 0.0002, + "loss": 1.2786, + "step": 500 + }, + { + "epoch": 0.6728232189973615, + "grad_norm": 0.45293179154396057, + "learning_rate": 0.0002, + "loss": 1.1739, + "step": 510 + }, + { + "epoch": 0.6860158311345647, + "grad_norm": 0.4319860637187958, + "learning_rate": 0.0002, + "loss": 1.255, + "step": 520 + }, + { + "epoch": 0.6992084432717678, + "grad_norm": 0.3050215542316437, + "learning_rate": 0.0002, + "loss": 1.0959, + "step": 530 + }, + { + "epoch": 0.712401055408971, + "grad_norm": 0.3552611172199249, + "learning_rate": 0.0002, + "loss": 1.1608, + "step": 540 + }, + { + "epoch": 0.7255936675461742, + "grad_norm": 0.3631151020526886, + "learning_rate": 0.0002, + "loss": 1.2546, + "step": 550 + }, + { + "epoch": 0.7387862796833773, + "grad_norm": 0.28177931904792786, + "learning_rate": 0.0002, + "loss": 1.1371, + "step": 560 + }, + { + "epoch": 0.7519788918205804, + "grad_norm": 0.359764039516449, + "learning_rate": 0.0002, + "loss": 1.2081, + "step": 570 + }, + { + "epoch": 0.7651715039577837, + "grad_norm": 0.3970327377319336, + "learning_rate": 0.0002, + "loss": 1.1356, + "step": 580 + }, + { + "epoch": 0.7783641160949868, + "grad_norm": 0.3541001081466675, + "learning_rate": 0.0002, + "loss": 1.309, + "step": 590 + }, + { + "epoch": 0.7915567282321899, + "grad_norm": 0.3478573262691498, + "learning_rate": 0.0002, + "loss": 1.2061, + "step": 600 + }, + { + "epoch": 0.8047493403693932, + "grad_norm": 0.3900321424007416, + "learning_rate": 0.0002, + "loss": 1.1864, + "step": 610 + }, + { + "epoch": 0.8179419525065963, + "grad_norm": 0.3443238437175751, + "learning_rate": 0.0002, + "loss": 1.1358, + "step": 620 + }, + { + "epoch": 0.8311345646437994, + "grad_norm": 0.44238781929016113, + "learning_rate": 0.0002, + "loss": 1.3232, + "step": 630 + }, + { + "epoch": 0.8443271767810027, + "grad_norm": 0.36339467763900757, + "learning_rate": 0.0002, + "loss": 1.1247, + "step": 640 + }, + { + "epoch": 0.8575197889182058, + "grad_norm": 0.6243070363998413, + "learning_rate": 0.0002, + "loss": 1.1049, + "step": 650 + }, + { + "epoch": 0.8707124010554089, + "grad_norm": 0.3209173381328583, + "learning_rate": 0.0002, + "loss": 1.1943, + "step": 660 + }, + { + "epoch": 0.8839050131926122, + "grad_norm": 0.35017991065979004, + "learning_rate": 0.0002, + "loss": 1.1927, + "step": 670 + }, + { + "epoch": 0.8970976253298153, + "grad_norm": 0.3247159421443939, + "learning_rate": 0.0002, + "loss": 1.1603, + "step": 680 + }, + { + "epoch": 0.9102902374670184, + "grad_norm": 0.4091894030570984, + "learning_rate": 0.0002, + "loss": 1.2298, + "step": 690 + }, + { + "epoch": 0.9234828496042217, + "grad_norm": 0.3975585997104645, + "learning_rate": 0.0002, + "loss": 1.2756, + "step": 700 + }, + { + "epoch": 0.9366754617414248, + "grad_norm": 0.3666245937347412, + "learning_rate": 0.0002, + "loss": 1.281, + "step": 710 + }, + { + "epoch": 0.9498680738786279, + "grad_norm": 0.45216917991638184, + "learning_rate": 0.0002, + "loss": 1.2855, + "step": 720 + }, + { + "epoch": 0.9630606860158312, + "grad_norm": 0.36108118295669556, + "learning_rate": 0.0002, + "loss": 1.2217, + "step": 730 + }, + { + "epoch": 0.9762532981530343, + "grad_norm": 0.44550251960754395, + "learning_rate": 0.0002, + "loss": 1.17, + "step": 740 + }, + { + "epoch": 0.9894459102902374, + "grad_norm": 0.29801255464553833, + "learning_rate": 0.0002, + "loss": 1.0672, + "step": 750 + }, + { + "epoch": 1.0, + "eval_loss": 1.203244686126709, + "eval_runtime": 76.0457, + "eval_samples_per_second": 5.668, + "eval_steps_per_second": 0.71, + "step": 758 + }, + { + "epoch": 1.0026385224274406, + "grad_norm": 0.4096551239490509, + "learning_rate": 0.0002, + "loss": 1.0748, + "step": 760 + }, + { + "epoch": 1.0158311345646438, + "grad_norm": 0.2649582326412201, + "learning_rate": 0.0002, + "loss": 1.1537, + "step": 770 + }, + { + "epoch": 1.029023746701847, + "grad_norm": 0.3100722134113312, + "learning_rate": 0.0002, + "loss": 1.1676, + "step": 780 + }, + { + "epoch": 1.04221635883905, + "grad_norm": 0.3911755383014679, + "learning_rate": 0.0002, + "loss": 1.1736, + "step": 790 + }, + { + "epoch": 1.0554089709762533, + "grad_norm": 0.4600953757762909, + "learning_rate": 0.0002, + "loss": 1.0354, + "step": 800 + }, + { + "epoch": 1.0686015831134565, + "grad_norm": 0.28671619296073914, + "learning_rate": 0.0002, + "loss": 1.2869, + "step": 810 + }, + { + "epoch": 1.0817941952506596, + "grad_norm": 0.47282642126083374, + "learning_rate": 0.0002, + "loss": 1.1441, + "step": 820 + }, + { + "epoch": 1.0949868073878628, + "grad_norm": 0.690073549747467, + "learning_rate": 0.0002, + "loss": 1.1368, + "step": 830 + }, + { + "epoch": 1.108179419525066, + "grad_norm": 0.7317902445793152, + "learning_rate": 0.0002, + "loss": 0.9944, + "step": 840 + }, + { + "epoch": 1.121372031662269, + "grad_norm": 0.44215938448905945, + "learning_rate": 0.0002, + "loss": 1.108, + "step": 850 + }, + { + "epoch": 1.1345646437994723, + "grad_norm": 0.33875149488449097, + "learning_rate": 0.0002, + "loss": 1.0558, + "step": 860 + }, + { + "epoch": 1.1477572559366755, + "grad_norm": 0.3700002431869507, + "learning_rate": 0.0002, + "loss": 1.0471, + "step": 870 + }, + { + "epoch": 1.1609498680738786, + "grad_norm": 0.41173291206359863, + "learning_rate": 0.0002, + "loss": 1.0121, + "step": 880 + }, + { + "epoch": 1.1741424802110818, + "grad_norm": 0.5253589749336243, + "learning_rate": 0.0002, + "loss": 1.0312, + "step": 890 + }, + { + "epoch": 1.187335092348285, + "grad_norm": 0.3912237286567688, + "learning_rate": 0.0002, + "loss": 1.2238, + "step": 900 + }, + { + "epoch": 1.200527704485488, + "grad_norm": 0.40990331768989563, + "learning_rate": 0.0002, + "loss": 0.9434, + "step": 910 + }, + { + "epoch": 1.2137203166226913, + "grad_norm": 0.40377968549728394, + "learning_rate": 0.0002, + "loss": 1.0493, + "step": 920 + }, + { + "epoch": 1.2269129287598945, + "grad_norm": 0.4605846405029297, + "learning_rate": 0.0002, + "loss": 1.1496, + "step": 930 + }, + { + "epoch": 1.2401055408970976, + "grad_norm": 0.31564897298812866, + "learning_rate": 0.0002, + "loss": 1.1173, + "step": 940 + }, + { + "epoch": 1.2532981530343008, + "grad_norm": 0.39808550477027893, + "learning_rate": 0.0002, + "loss": 1.0547, + "step": 950 + }, + { + "epoch": 1.266490765171504, + "grad_norm": 0.3762115240097046, + "learning_rate": 0.0002, + "loss": 1.1367, + "step": 960 + }, + { + "epoch": 1.279683377308707, + "grad_norm": 0.4174984097480774, + "learning_rate": 0.0002, + "loss": 1.1596, + "step": 970 + }, + { + "epoch": 1.2928759894459103, + "grad_norm": 0.5263054966926575, + "learning_rate": 0.0002, + "loss": 1.1327, + "step": 980 + }, + { + "epoch": 1.3060686015831133, + "grad_norm": 0.41673699021339417, + "learning_rate": 0.0002, + "loss": 1.0339, + "step": 990 + }, + { + "epoch": 1.3192612137203166, + "grad_norm": 0.9613684415817261, + "learning_rate": 0.0002, + "loss": 1.1198, + "step": 1000 + }, + { + "epoch": 1.3324538258575198, + "grad_norm": 0.3690216839313507, + "learning_rate": 0.0002, + "loss": 1.0444, + "step": 1010 + }, + { + "epoch": 1.345646437994723, + "grad_norm": 0.521821141242981, + "learning_rate": 0.0002, + "loss": 1.0473, + "step": 1020 + }, + { + "epoch": 1.358839050131926, + "grad_norm": 0.3353094160556793, + "learning_rate": 0.0002, + "loss": 1.1065, + "step": 1030 + }, + { + "epoch": 1.3720316622691293, + "grad_norm": 0.3843843936920166, + "learning_rate": 0.0002, + "loss": 1.1286, + "step": 1040 + }, + { + "epoch": 1.3852242744063323, + "grad_norm": 0.372514545917511, + "learning_rate": 0.0002, + "loss": 1.1369, + "step": 1050 + }, + { + "epoch": 1.3984168865435356, + "grad_norm": 0.34537771344184875, + "learning_rate": 0.0002, + "loss": 1.0041, + "step": 1060 + }, + { + "epoch": 1.4116094986807388, + "grad_norm": 0.45349085330963135, + "learning_rate": 0.0002, + "loss": 1.0251, + "step": 1070 + }, + { + "epoch": 1.424802110817942, + "grad_norm": 0.5120177268981934, + "learning_rate": 0.0002, + "loss": 1.1873, + "step": 1080 + }, + { + "epoch": 1.437994722955145, + "grad_norm": 0.42800238728523254, + "learning_rate": 0.0002, + "loss": 1.0872, + "step": 1090 + }, + { + "epoch": 1.4511873350923483, + "grad_norm": 0.343832790851593, + "learning_rate": 0.0002, + "loss": 1.0734, + "step": 1100 + }, + { + "epoch": 1.4643799472295513, + "grad_norm": 0.3829841911792755, + "learning_rate": 0.0002, + "loss": 1.1286, + "step": 1110 + }, + { + "epoch": 1.4775725593667546, + "grad_norm": 0.4289931058883667, + "learning_rate": 0.0002, + "loss": 1.1268, + "step": 1120 + }, + { + "epoch": 1.4907651715039578, + "grad_norm": 0.42750850319862366, + "learning_rate": 0.0002, + "loss": 1.0676, + "step": 1130 + }, + { + "epoch": 1.503957783641161, + "grad_norm": 0.34328413009643555, + "learning_rate": 0.0002, + "loss": 1.072, + "step": 1140 + }, + { + "epoch": 1.517150395778364, + "grad_norm": 0.349096417427063, + "learning_rate": 0.0002, + "loss": 1.0863, + "step": 1150 + }, + { + "epoch": 1.5303430079155673, + "grad_norm": 0.7700717449188232, + "learning_rate": 0.0002, + "loss": 1.1462, + "step": 1160 + }, + { + "epoch": 1.5435356200527703, + "grad_norm": 0.39294949173927307, + "learning_rate": 0.0002, + "loss": 0.9528, + "step": 1170 + }, + { + "epoch": 1.5567282321899736, + "grad_norm": 0.36173608899116516, + "learning_rate": 0.0002, + "loss": 1.1603, + "step": 1180 + }, + { + "epoch": 1.5699208443271768, + "grad_norm": 0.6034277677536011, + "learning_rate": 0.0002, + "loss": 1.1508, + "step": 1190 + }, + { + "epoch": 1.58311345646438, + "grad_norm": 0.36694103479385376, + "learning_rate": 0.0002, + "loss": 1.0105, + "step": 1200 + }, + { + "epoch": 1.596306068601583, + "grad_norm": 0.4727209508419037, + "learning_rate": 0.0002, + "loss": 1.1479, + "step": 1210 + }, + { + "epoch": 1.6094986807387863, + "grad_norm": 0.6482883095741272, + "learning_rate": 0.0002, + "loss": 1.0689, + "step": 1220 + }, + { + "epoch": 1.6226912928759893, + "grad_norm": 0.5238035917282104, + "learning_rate": 0.0002, + "loss": 1.1405, + "step": 1230 + }, + { + "epoch": 1.6358839050131926, + "grad_norm": 0.4812222421169281, + "learning_rate": 0.0002, + "loss": 1.0596, + "step": 1240 + }, + { + "epoch": 1.6490765171503958, + "grad_norm": 0.7131702303886414, + "learning_rate": 0.0002, + "loss": 1.2729, + "step": 1250 + }, + { + "epoch": 1.662269129287599, + "grad_norm": 0.3803327977657318, + "learning_rate": 0.0002, + "loss": 0.9832, + "step": 1260 + }, + { + "epoch": 1.675461741424802, + "grad_norm": 0.3745088577270508, + "learning_rate": 0.0002, + "loss": 1.1433, + "step": 1270 + }, + { + "epoch": 1.6886543535620053, + "grad_norm": 0.4427378475666046, + "learning_rate": 0.0002, + "loss": 1.1018, + "step": 1280 + }, + { + "epoch": 1.7018469656992083, + "grad_norm": 0.797478973865509, + "learning_rate": 0.0002, + "loss": 1.0619, + "step": 1290 + }, + { + "epoch": 1.7150395778364116, + "grad_norm": 0.503620982170105, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 1300 + }, + { + "epoch": 1.7282321899736148, + "grad_norm": 0.4132426381111145, + "learning_rate": 0.0002, + "loss": 1.1324, + "step": 1310 + }, + { + "epoch": 1.741424802110818, + "grad_norm": 0.41811656951904297, + "learning_rate": 0.0002, + "loss": 1.172, + "step": 1320 + }, + { + "epoch": 1.754617414248021, + "grad_norm": 0.40647849440574646, + "learning_rate": 0.0002, + "loss": 1.0903, + "step": 1330 + }, + { + "epoch": 1.767810026385224, + "grad_norm": 0.42138347029685974, + "learning_rate": 0.0002, + "loss": 1.1316, + "step": 1340 + }, + { + "epoch": 1.7810026385224274, + "grad_norm": 0.46523579955101013, + "learning_rate": 0.0002, + "loss": 0.9955, + "step": 1350 + }, + { + "epoch": 1.7941952506596306, + "grad_norm": 0.39760419726371765, + "learning_rate": 0.0002, + "loss": 1.0307, + "step": 1360 + }, + { + "epoch": 1.8073878627968338, + "grad_norm": 0.37993717193603516, + "learning_rate": 0.0002, + "loss": 1.1218, + "step": 1370 + }, + { + "epoch": 1.820580474934037, + "grad_norm": 0.5404181480407715, + "learning_rate": 0.0002, + "loss": 1.0921, + "step": 1380 + }, + { + "epoch": 1.83377308707124, + "grad_norm": 0.4385245740413666, + "learning_rate": 0.0002, + "loss": 1.0903, + "step": 1390 + }, + { + "epoch": 1.8469656992084431, + "grad_norm": 0.529797375202179, + "learning_rate": 0.0002, + "loss": 1.1175, + "step": 1400 + }, + { + "epoch": 1.8601583113456464, + "grad_norm": 0.481567919254303, + "learning_rate": 0.0002, + "loss": 1.0577, + "step": 1410 + }, + { + "epoch": 1.8733509234828496, + "grad_norm": 0.34787362813949585, + "learning_rate": 0.0002, + "loss": 1.0581, + "step": 1420 + }, + { + "epoch": 1.8865435356200528, + "grad_norm": 0.6402362585067749, + "learning_rate": 0.0002, + "loss": 1.0536, + "step": 1430 + }, + { + "epoch": 1.899736147757256, + "grad_norm": 0.3461322784423828, + "learning_rate": 0.0002, + "loss": 1.0787, + "step": 1440 + }, + { + "epoch": 1.912928759894459, + "grad_norm": 0.44005653262138367, + "learning_rate": 0.0002, + "loss": 1.0925, + "step": 1450 + }, + { + "epoch": 1.9261213720316621, + "grad_norm": 0.4064280688762665, + "learning_rate": 0.0002, + "loss": 1.0414, + "step": 1460 + }, + { + "epoch": 1.9393139841688654, + "grad_norm": 0.5236523151397705, + "learning_rate": 0.0002, + "loss": 1.0608, + "step": 1470 + }, + { + "epoch": 1.9525065963060686, + "grad_norm": 0.41030219197273254, + "learning_rate": 0.0002, + "loss": 1.0572, + "step": 1480 + }, + { + "epoch": 1.9656992084432718, + "grad_norm": 0.39805835485458374, + "learning_rate": 0.0002, + "loss": 1.1204, + "step": 1490 + }, + { + "epoch": 1.978891820580475, + "grad_norm": 0.42974501848220825, + "learning_rate": 0.0002, + "loss": 1.0364, + "step": 1500 + }, + { + "epoch": 1.992084432717678, + "grad_norm": 0.4688243865966797, + "learning_rate": 0.0002, + "loss": 1.1169, + "step": 1510 + }, + { + "epoch": 2.0, + "eval_loss": 1.1874967813491821, + "eval_runtime": 71.9523, + "eval_samples_per_second": 5.99, + "eval_steps_per_second": 0.75, + "step": 1516 + }, + { + "epoch": 2.005277044854881, + "grad_norm": 0.4121631383895874, + "learning_rate": 0.0002, + "loss": 1.0401, + "step": 1520 + }, + { + "epoch": 2.0184696569920844, + "grad_norm": 0.4844197928905487, + "learning_rate": 0.0002, + "loss": 0.9384, + "step": 1530 + }, + { + "epoch": 2.0316622691292876, + "grad_norm": 0.45408546924591064, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 1540 + }, + { + "epoch": 2.044854881266491, + "grad_norm": 0.48662951588630676, + "learning_rate": 0.0002, + "loss": 1.0399, + "step": 1550 + }, + { + "epoch": 2.058047493403694, + "grad_norm": 0.7195899486541748, + "learning_rate": 0.0002, + "loss": 0.936, + "step": 1560 + }, + { + "epoch": 2.0712401055408973, + "grad_norm": 0.5071077346801758, + "learning_rate": 0.0002, + "loss": 0.9486, + "step": 1570 + }, + { + "epoch": 2.0844327176781, + "grad_norm": 0.7473958730697632, + "learning_rate": 0.0002, + "loss": 1.0055, + "step": 1580 + }, + { + "epoch": 2.0976253298153034, + "grad_norm": 0.5509232878684998, + "learning_rate": 0.0002, + "loss": 0.8309, + "step": 1590 + }, + { + "epoch": 2.1108179419525066, + "grad_norm": 0.5108042359352112, + "learning_rate": 0.0002, + "loss": 0.9181, + "step": 1600 + }, + { + "epoch": 2.12401055408971, + "grad_norm": 0.42331448197364807, + "learning_rate": 0.0002, + "loss": 0.9499, + "step": 1610 + }, + { + "epoch": 2.137203166226913, + "grad_norm": 0.46621623635292053, + "learning_rate": 0.0002, + "loss": 0.9359, + "step": 1620 + }, + { + "epoch": 2.150395778364116, + "grad_norm": 0.43802836537361145, + "learning_rate": 0.0002, + "loss": 0.9065, + "step": 1630 + }, + { + "epoch": 2.163588390501319, + "grad_norm": 0.49908021092414856, + "learning_rate": 0.0002, + "loss": 0.9375, + "step": 1640 + }, + { + "epoch": 2.1767810026385224, + "grad_norm": 0.4195636808872223, + "learning_rate": 0.0002, + "loss": 1.0389, + "step": 1650 + }, + { + "epoch": 2.1899736147757256, + "grad_norm": 0.49515822529792786, + "learning_rate": 0.0002, + "loss": 0.8501, + "step": 1660 + }, + { + "epoch": 2.203166226912929, + "grad_norm": 0.4607589542865753, + "learning_rate": 0.0002, + "loss": 1.0557, + "step": 1670 + }, + { + "epoch": 2.216358839050132, + "grad_norm": 0.4489196836948395, + "learning_rate": 0.0002, + "loss": 1.0206, + "step": 1680 + }, + { + "epoch": 2.229551451187335, + "grad_norm": 0.49300864338874817, + "learning_rate": 0.0002, + "loss": 0.9657, + "step": 1690 + }, + { + "epoch": 2.242744063324538, + "grad_norm": 0.6624954342842102, + "learning_rate": 0.0002, + "loss": 0.8371, + "step": 1700 + }, + { + "epoch": 2.2559366754617414, + "grad_norm": 0.8391500115394592, + "learning_rate": 0.0002, + "loss": 0.8555, + "step": 1710 + }, + { + "epoch": 2.2691292875989446, + "grad_norm": 0.5193073749542236, + "learning_rate": 0.0002, + "loss": 1.0113, + "step": 1720 + }, + { + "epoch": 2.282321899736148, + "grad_norm": 0.6180613040924072, + "learning_rate": 0.0002, + "loss": 0.9979, + "step": 1730 + }, + { + "epoch": 2.295514511873351, + "grad_norm": 0.591191291809082, + "learning_rate": 0.0002, + "loss": 0.9579, + "step": 1740 + }, + { + "epoch": 2.308707124010554, + "grad_norm": 0.546897828578949, + "learning_rate": 0.0002, + "loss": 0.8879, + "step": 1750 + }, + { + "epoch": 2.321899736147757, + "grad_norm": 0.5470401644706726, + "learning_rate": 0.0002, + "loss": 0.9321, + "step": 1760 + }, + { + "epoch": 2.3350923482849604, + "grad_norm": 0.4590282738208771, + "learning_rate": 0.0002, + "loss": 0.9104, + "step": 1770 + }, + { + "epoch": 2.3482849604221636, + "grad_norm": 0.622164785861969, + "learning_rate": 0.0002, + "loss": 1.0384, + "step": 1780 + }, + { + "epoch": 2.361477572559367, + "grad_norm": 0.5753812193870544, + "learning_rate": 0.0002, + "loss": 1.0297, + "step": 1790 + }, + { + "epoch": 2.37467018469657, + "grad_norm": 0.47958624362945557, + "learning_rate": 0.0002, + "loss": 0.9569, + "step": 1800 + }, + { + "epoch": 2.387862796833773, + "grad_norm": 0.48042672872543335, + "learning_rate": 0.0002, + "loss": 1.0519, + "step": 1810 + }, + { + "epoch": 2.401055408970976, + "grad_norm": 0.44586366415023804, + "learning_rate": 0.0002, + "loss": 0.9042, + "step": 1820 + }, + { + "epoch": 2.4142480211081794, + "grad_norm": 0.7239416837692261, + "learning_rate": 0.0002, + "loss": 0.9783, + "step": 1830 + }, + { + "epoch": 2.4274406332453826, + "grad_norm": 0.5515341758728027, + "learning_rate": 0.0002, + "loss": 0.8818, + "step": 1840 + }, + { + "epoch": 2.440633245382586, + "grad_norm": 0.6280064582824707, + "learning_rate": 0.0002, + "loss": 0.9503, + "step": 1850 + }, + { + "epoch": 2.453825857519789, + "grad_norm": 0.4832057058811188, + "learning_rate": 0.0002, + "loss": 0.8943, + "step": 1860 + }, + { + "epoch": 2.467018469656992, + "grad_norm": 0.5789321064949036, + "learning_rate": 0.0002, + "loss": 0.8744, + "step": 1870 + }, + { + "epoch": 2.480211081794195, + "grad_norm": 0.48491886258125305, + "learning_rate": 0.0002, + "loss": 0.9332, + "step": 1880 + }, + { + "epoch": 2.4934036939313984, + "grad_norm": 0.532365620136261, + "learning_rate": 0.0002, + "loss": 0.943, + "step": 1890 + }, + { + "epoch": 2.5065963060686016, + "grad_norm": 0.7087852954864502, + "learning_rate": 0.0002, + "loss": 0.9734, + "step": 1900 + }, + { + "epoch": 2.519788918205805, + "grad_norm": 0.48157402873039246, + "learning_rate": 0.0002, + "loss": 0.9767, + "step": 1910 + }, + { + "epoch": 2.532981530343008, + "grad_norm": 0.5886041522026062, + "learning_rate": 0.0002, + "loss": 0.9851, + "step": 1920 + }, + { + "epoch": 2.5461741424802113, + "grad_norm": 0.6332622766494751, + "learning_rate": 0.0002, + "loss": 1.0144, + "step": 1930 + }, + { + "epoch": 2.559366754617414, + "grad_norm": 0.5463117957115173, + "learning_rate": 0.0002, + "loss": 0.9516, + "step": 1940 + }, + { + "epoch": 2.5725593667546174, + "grad_norm": 0.5432228446006775, + "learning_rate": 0.0002, + "loss": 0.9373, + "step": 1950 + }, + { + "epoch": 2.5857519788918206, + "grad_norm": 0.5929186940193176, + "learning_rate": 0.0002, + "loss": 0.8974, + "step": 1960 + }, + { + "epoch": 2.598944591029024, + "grad_norm": 0.5120641589164734, + "learning_rate": 0.0002, + "loss": 1.0062, + "step": 1970 + }, + { + "epoch": 2.6121372031662267, + "grad_norm": 0.5372339487075806, + "learning_rate": 0.0002, + "loss": 0.9143, + "step": 1980 + }, + { + "epoch": 2.62532981530343, + "grad_norm": 0.5519838929176331, + "learning_rate": 0.0002, + "loss": 1.0786, + "step": 1990 + }, + { + "epoch": 2.638522427440633, + "grad_norm": 0.7304037809371948, + "learning_rate": 0.0002, + "loss": 1.021, + "step": 2000 + }, + { + "epoch": 2.6517150395778364, + "grad_norm": 0.6182340979576111, + "learning_rate": 0.0002, + "loss": 0.8708, + "step": 2010 + }, + { + "epoch": 2.6649076517150396, + "grad_norm": 0.4874444305896759, + "learning_rate": 0.0002, + "loss": 1.0501, + "step": 2020 + }, + { + "epoch": 2.678100263852243, + "grad_norm": 0.5850239396095276, + "learning_rate": 0.0002, + "loss": 0.9612, + "step": 2030 + }, + { + "epoch": 2.691292875989446, + "grad_norm": 0.6495311856269836, + "learning_rate": 0.0002, + "loss": 0.9808, + "step": 2040 + }, + { + "epoch": 2.7044854881266494, + "grad_norm": 1.002830147743225, + "learning_rate": 0.0002, + "loss": 1.0187, + "step": 2050 + }, + { + "epoch": 2.717678100263852, + "grad_norm": 0.49076753854751587, + "learning_rate": 0.0002, + "loss": 1.0386, + "step": 2060 + }, + { + "epoch": 2.7308707124010554, + "grad_norm": 0.4736326336860657, + "learning_rate": 0.0002, + "loss": 0.9416, + "step": 2070 + }, + { + "epoch": 2.7440633245382586, + "grad_norm": 0.5527601838111877, + "learning_rate": 0.0002, + "loss": 0.8424, + "step": 2080 + }, + { + "epoch": 2.757255936675462, + "grad_norm": 0.7295718193054199, + "learning_rate": 0.0002, + "loss": 0.9149, + "step": 2090 + }, + { + "epoch": 2.7704485488126647, + "grad_norm": 0.5437536835670471, + "learning_rate": 0.0002, + "loss": 0.9032, + "step": 2100 + }, + { + "epoch": 2.783641160949868, + "grad_norm": 0.5997128486633301, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 2110 + }, + { + "epoch": 2.796833773087071, + "grad_norm": 0.6498191356658936, + "learning_rate": 0.0002, + "loss": 0.976, + "step": 2120 + }, + { + "epoch": 2.8100263852242744, + "grad_norm": 0.5237268805503845, + "learning_rate": 0.0002, + "loss": 0.9543, + "step": 2130 + }, + { + "epoch": 2.8232189973614776, + "grad_norm": 0.6033027172088623, + "learning_rate": 0.0002, + "loss": 0.9302, + "step": 2140 + }, + { + "epoch": 2.836411609498681, + "grad_norm": 0.6077138781547546, + "learning_rate": 0.0002, + "loss": 0.9625, + "step": 2150 + }, + { + "epoch": 2.849604221635884, + "grad_norm": 0.4127797484397888, + "learning_rate": 0.0002, + "loss": 0.9347, + "step": 2160 + }, + { + "epoch": 2.862796833773087, + "grad_norm": 0.8448635339736938, + "learning_rate": 0.0002, + "loss": 1.0459, + "step": 2170 + }, + { + "epoch": 2.87598944591029, + "grad_norm": 0.5669729113578796, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 2180 + }, + { + "epoch": 2.8891820580474934, + "grad_norm": 0.510231077671051, + "learning_rate": 0.0002, + "loss": 0.8555, + "step": 2190 + }, + { + "epoch": 2.9023746701846966, + "grad_norm": 0.8072245121002197, + "learning_rate": 0.0002, + "loss": 0.9267, + "step": 2200 + }, + { + "epoch": 2.9155672823219, + "grad_norm": 0.6055923104286194, + "learning_rate": 0.0002, + "loss": 0.9685, + "step": 2210 + }, + { + "epoch": 2.9287598944591027, + "grad_norm": 0.7384416460990906, + "learning_rate": 0.0002, + "loss": 0.9157, + "step": 2220 + }, + { + "epoch": 2.941952506596306, + "grad_norm": 0.4922751784324646, + "learning_rate": 0.0002, + "loss": 0.9712, + "step": 2230 + }, + { + "epoch": 2.955145118733509, + "grad_norm": 0.6039906740188599, + "learning_rate": 0.0002, + "loss": 0.9487, + "step": 2240 + }, + { + "epoch": 2.9683377308707124, + "grad_norm": 0.4751701354980469, + "learning_rate": 0.0002, + "loss": 0.9776, + "step": 2250 + }, + { + "epoch": 2.9815303430079156, + "grad_norm": 0.5698353052139282, + "learning_rate": 0.0002, + "loss": 1.0619, + "step": 2260 + }, + { + "epoch": 2.994722955145119, + "grad_norm": 0.893563449382782, + "learning_rate": 0.0002, + "loss": 1.1184, + "step": 2270 + }, + { + "epoch": 3.0, + "eval_loss": 1.2046419382095337, + "eval_runtime": 71.5992, + "eval_samples_per_second": 6.02, + "eval_steps_per_second": 0.754, + "step": 2274 + }, + { + "epoch": 3.007915567282322, + "grad_norm": 0.41119325160980225, + "learning_rate": 0.0002, + "loss": 0.8269, + "step": 2280 + }, + { + "epoch": 3.021108179419525, + "grad_norm": 0.8169420957565308, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 2290 + }, + { + "epoch": 3.034300791556728, + "grad_norm": 0.6033818125724792, + "learning_rate": 0.0002, + "loss": 0.794, + "step": 2300 + }, + { + "epoch": 3.0474934036939314, + "grad_norm": 0.9600058197975159, + "learning_rate": 0.0002, + "loss": 0.7607, + "step": 2310 + }, + { + "epoch": 3.0606860158311346, + "grad_norm": 0.5859250426292419, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 2320 + }, + { + "epoch": 3.073878627968338, + "grad_norm": 0.6758618950843811, + "learning_rate": 0.0002, + "loss": 0.7598, + "step": 2330 + }, + { + "epoch": 3.0870712401055407, + "grad_norm": 0.8407140970230103, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 2340 + }, + { + "epoch": 3.100263852242744, + "grad_norm": 0.767779529094696, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 2350 + }, + { + "epoch": 3.113456464379947, + "grad_norm": 0.5572896599769592, + "learning_rate": 0.0002, + "loss": 0.7121, + "step": 2360 + }, + { + "epoch": 3.1266490765171504, + "grad_norm": 0.5908368825912476, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 2370 + }, + { + "epoch": 3.1398416886543536, + "grad_norm": 0.8047826290130615, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 2380 + }, + { + "epoch": 3.153034300791557, + "grad_norm": 0.8041718006134033, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 2390 + }, + { + "epoch": 3.16622691292876, + "grad_norm": 0.57078617811203, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 2400 + }, + { + "epoch": 3.179419525065963, + "grad_norm": 0.5125322937965393, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 2410 + }, + { + "epoch": 3.192612137203166, + "grad_norm": 0.6356934309005737, + "learning_rate": 0.0002, + "loss": 0.741, + "step": 2420 + }, + { + "epoch": 3.2058047493403694, + "grad_norm": 1.0129680633544922, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 2430 + }, + { + "epoch": 3.2189973614775726, + "grad_norm": 0.8104226589202881, + "learning_rate": 0.0002, + "loss": 0.8316, + "step": 2440 + }, + { + "epoch": 3.232189973614776, + "grad_norm": 0.7276079058647156, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 2450 + }, + { + "epoch": 3.2453825857519787, + "grad_norm": 0.9753884077072144, + "learning_rate": 0.0002, + "loss": 0.8183, + "step": 2460 + }, + { + "epoch": 3.258575197889182, + "grad_norm": 0.9753183722496033, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 2470 + }, + { + "epoch": 3.271767810026385, + "grad_norm": 0.6791225075721741, + "learning_rate": 0.0002, + "loss": 0.8815, + "step": 2480 + }, + { + "epoch": 3.2849604221635884, + "grad_norm": 0.6797150373458862, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 2490 + }, + { + "epoch": 3.2981530343007917, + "grad_norm": 0.8107194900512695, + "learning_rate": 0.0002, + "loss": 0.8395, + "step": 2500 + }, + { + "epoch": 3.311345646437995, + "grad_norm": 0.5878375172615051, + "learning_rate": 0.0002, + "loss": 0.7869, + "step": 2510 + }, + { + "epoch": 3.324538258575198, + "grad_norm": 0.5882975459098816, + "learning_rate": 0.0002, + "loss": 0.7992, + "step": 2520 + }, + { + "epoch": 3.337730870712401, + "grad_norm": 0.6180013418197632, + "learning_rate": 0.0002, + "loss": 0.7472, + "step": 2530 + }, + { + "epoch": 3.350923482849604, + "grad_norm": 1.0008151531219482, + "learning_rate": 0.0002, + "loss": 0.8033, + "step": 2540 + }, + { + "epoch": 3.3641160949868074, + "grad_norm": 0.6404656767845154, + "learning_rate": 0.0002, + "loss": 0.8464, + "step": 2550 + }, + { + "epoch": 3.3773087071240107, + "grad_norm": 0.8481354117393494, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 2560 + }, + { + "epoch": 3.390501319261214, + "grad_norm": 0.8068035244941711, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 2570 + }, + { + "epoch": 3.4036939313984167, + "grad_norm": 0.7477166056632996, + "learning_rate": 0.0002, + "loss": 0.8621, + "step": 2580 + }, + { + "epoch": 3.41688654353562, + "grad_norm": 0.6202635765075684, + "learning_rate": 0.0002, + "loss": 0.8352, + "step": 2590 + }, + { + "epoch": 3.430079155672823, + "grad_norm": 0.6981159448623657, + "learning_rate": 0.0002, + "loss": 0.7572, + "step": 2600 + }, + { + "epoch": 3.4432717678100264, + "grad_norm": 0.6611084342002869, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 2610 + }, + { + "epoch": 3.4564643799472297, + "grad_norm": 0.5727696418762207, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 2620 + }, + { + "epoch": 3.469656992084433, + "grad_norm": 1.2354545593261719, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 2630 + }, + { + "epoch": 3.4828496042216357, + "grad_norm": 0.6347638368606567, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 2640 + }, + { + "epoch": 3.496042216358839, + "grad_norm": 0.6975704431533813, + "learning_rate": 0.0002, + "loss": 0.8426, + "step": 2650 + }, + { + "epoch": 3.509234828496042, + "grad_norm": 0.6569573879241943, + "learning_rate": 0.0002, + "loss": 0.8773, + "step": 2660 + }, + { + "epoch": 3.5224274406332454, + "grad_norm": 0.6979609131813049, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 2670 + }, + { + "epoch": 3.5356200527704487, + "grad_norm": 0.6287988424301147, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 2680 + }, + { + "epoch": 3.5488126649076515, + "grad_norm": 0.8682637214660645, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 2690 + }, + { + "epoch": 3.5620052770448547, + "grad_norm": 0.7062831521034241, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 2700 + }, + { + "epoch": 3.575197889182058, + "grad_norm": 1.0061452388763428, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 2710 + }, + { + "epoch": 3.588390501319261, + "grad_norm": 0.719097375869751, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 2720 + }, + { + "epoch": 3.6015831134564644, + "grad_norm": 0.7583496570587158, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 2730 + }, + { + "epoch": 3.6147757255936677, + "grad_norm": 0.7543531060218811, + "learning_rate": 0.0002, + "loss": 0.91, + "step": 2740 + }, + { + "epoch": 3.627968337730871, + "grad_norm": 0.8873646855354309, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 2750 + }, + { + "epoch": 3.641160949868074, + "grad_norm": 1.0657562017440796, + "learning_rate": 0.0002, + "loss": 0.7116, + "step": 2760 + }, + { + "epoch": 3.654353562005277, + "grad_norm": 0.8641113638877869, + "learning_rate": 0.0002, + "loss": 0.8291, + "step": 2770 + }, + { + "epoch": 3.66754617414248, + "grad_norm": 0.6620645523071289, + "learning_rate": 0.0002, + "loss": 0.8302, + "step": 2780 + }, + { + "epoch": 3.6807387862796834, + "grad_norm": 0.6919541954994202, + "learning_rate": 0.0002, + "loss": 0.8261, + "step": 2790 + }, + { + "epoch": 3.6939313984168867, + "grad_norm": 0.7305743098258972, + "learning_rate": 0.0002, + "loss": 0.8388, + "step": 2800 + }, + { + "epoch": 3.7071240105540895, + "grad_norm": 0.7464777827262878, + "learning_rate": 0.0002, + "loss": 0.8053, + "step": 2810 + }, + { + "epoch": 3.7203166226912927, + "grad_norm": 0.8067063093185425, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 2820 + }, + { + "epoch": 3.733509234828496, + "grad_norm": 0.7789416313171387, + "learning_rate": 0.0002, + "loss": 0.8259, + "step": 2830 + }, + { + "epoch": 3.746701846965699, + "grad_norm": 0.507529079914093, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 2840 + }, + { + "epoch": 3.7598944591029024, + "grad_norm": 0.6509260535240173, + "learning_rate": 0.0002, + "loss": 0.832, + "step": 2850 + }, + { + "epoch": 3.7730870712401057, + "grad_norm": 0.9141367673873901, + "learning_rate": 0.0002, + "loss": 0.8257, + "step": 2860 + }, + { + "epoch": 3.786279683377309, + "grad_norm": 0.7852635979652405, + "learning_rate": 0.0002, + "loss": 0.9436, + "step": 2870 + }, + { + "epoch": 3.7994722955145117, + "grad_norm": 0.5340318083763123, + "learning_rate": 0.0002, + "loss": 0.8842, + "step": 2880 + }, + { + "epoch": 3.812664907651715, + "grad_norm": 0.6246042847633362, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 2890 + }, + { + "epoch": 3.825857519788918, + "grad_norm": 0.7064066529273987, + "learning_rate": 0.0002, + "loss": 0.8184, + "step": 2900 + }, + { + "epoch": 3.8390501319261214, + "grad_norm": 0.6144065856933594, + "learning_rate": 0.0002, + "loss": 0.8515, + "step": 2910 + }, + { + "epoch": 3.8522427440633247, + "grad_norm": 0.5268424153327942, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 2920 + }, + { + "epoch": 3.8654353562005275, + "grad_norm": 0.9508116841316223, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 2930 + }, + { + "epoch": 3.8786279683377307, + "grad_norm": 0.9133715629577637, + "learning_rate": 0.0002, + "loss": 0.8437, + "step": 2940 + }, + { + "epoch": 3.891820580474934, + "grad_norm": 1.0144646167755127, + "learning_rate": 0.0002, + "loss": 0.8611, + "step": 2950 + }, + { + "epoch": 3.905013192612137, + "grad_norm": 0.6397877931594849, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 2960 + }, + { + "epoch": 3.9182058047493404, + "grad_norm": 0.734835147857666, + "learning_rate": 0.0002, + "loss": 0.8285, + "step": 2970 + }, + { + "epoch": 3.9313984168865437, + "grad_norm": 0.784853994846344, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 2980 + }, + { + "epoch": 3.944591029023747, + "grad_norm": 0.805831789970398, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 2990 + }, + { + "epoch": 3.9577836411609497, + "grad_norm": 0.6299595236778259, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 3000 + }, + { + "epoch": 3.970976253298153, + "grad_norm": 0.6264058351516724, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 3010 + }, + { + "epoch": 3.984168865435356, + "grad_norm": 0.6419739723205566, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 3020 + }, + { + "epoch": 3.9973614775725594, + "grad_norm": 0.7737036943435669, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 3030 + }, + { + "epoch": 4.0, + "eval_loss": 1.2454297542572021, + "eval_runtime": 71.8558, + "eval_samples_per_second": 5.998, + "eval_steps_per_second": 0.752, + "step": 3032 + } + ], + "logging_steps": 10, + "max_steps": 6064, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.4031417107703398e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..0c0289389bcb2236bf804b0a82c50e67c11a1379 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3032/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e04722e246164eebe6b4f70dd50fe946c936a2bd25edd75bcedcfd4c7ff9474e +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/README.md b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b6dc93e614d45dfac3bb3bae829367d921dac43c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0abb8ce220189985e1d92cd920ffac58338081eb5a9b06dd4e2a9f59804fb63 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f41d7d6a310d6583549e78c20544ae0e547476a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:853504d70ffcd4907d6951e8dfe1974e9fb551fb777f7c82dbe48509cc78e7bf +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cc1c1945f7947a8a7bd1fdb1fa3fbb4243d80f78 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28b7e36910ca9df80c1bd60717cb32b049b54870c08d539535c378fece8e4068 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ccb27644ac0090ebecaad28777010001d127c22 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89147f06daea8413d82a4b7b928b4c476779123cd562b0a5ce13276f70c01eaa +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..85b28475bcafdcd0ca78912148ba0047fe54e927 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/trainer_state.json @@ -0,0 +1,2726 @@ +{ + "best_metric": 1.1874967813491821, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516", + "epoch": 5.0, + "eval_steps": 10, + "global_step": 3790, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013192612137203167, + "grad_norm": 0.7545632123947144, + "learning_rate": 0.0002, + "loss": 1.9466, + "step": 10 + }, + { + "epoch": 0.026385224274406333, + "grad_norm": 0.5787661075592041, + "learning_rate": 0.0002, + "loss": 1.4909, + "step": 20 + }, + { + "epoch": 0.0395778364116095, + "grad_norm": 0.8616093993186951, + "learning_rate": 0.0002, + "loss": 1.3906, + "step": 30 + }, + { + "epoch": 0.052770448548812667, + "grad_norm": 0.42088547348976135, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 40 + }, + { + "epoch": 0.06596306068601583, + "grad_norm": 0.47704678773880005, + "learning_rate": 0.0002, + "loss": 1.3446, + "step": 50 + }, + { + "epoch": 0.079155672823219, + "grad_norm": 0.5763994455337524, + "learning_rate": 0.0002, + "loss": 1.2476, + "step": 60 + }, + { + "epoch": 0.09234828496042216, + "grad_norm": 0.4579846262931824, + "learning_rate": 0.0002, + "loss": 1.2268, + "step": 70 + }, + { + "epoch": 0.10554089709762533, + "grad_norm": 0.46623846888542175, + "learning_rate": 0.0002, + "loss": 1.415, + "step": 80 + }, + { + "epoch": 0.11873350923482849, + "grad_norm": 0.4206956624984741, + "learning_rate": 0.0002, + "loss": 1.2849, + "step": 90 + }, + { + "epoch": 0.13192612137203166, + "grad_norm": 0.41896629333496094, + "learning_rate": 0.0002, + "loss": 1.1608, + "step": 100 + }, + { + "epoch": 0.14511873350923482, + "grad_norm": 0.3459089398384094, + "learning_rate": 0.0002, + "loss": 1.2617, + "step": 110 + }, + { + "epoch": 0.158311345646438, + "grad_norm": 0.4587327837944031, + "learning_rate": 0.0002, + "loss": 1.2858, + "step": 120 + }, + { + "epoch": 0.17150395778364116, + "grad_norm": 0.433525413274765, + "learning_rate": 0.0002, + "loss": 1.2241, + "step": 130 + }, + { + "epoch": 0.18469656992084432, + "grad_norm": 0.39253175258636475, + "learning_rate": 0.0002, + "loss": 1.2269, + "step": 140 + }, + { + "epoch": 0.19788918205804748, + "grad_norm": 0.3602290749549866, + "learning_rate": 0.0002, + "loss": 1.2224, + "step": 150 + }, + { + "epoch": 0.21108179419525067, + "grad_norm": 0.41160839796066284, + "learning_rate": 0.0002, + "loss": 1.2834, + "step": 160 + }, + { + "epoch": 0.22427440633245382, + "grad_norm": 0.7213630080223083, + "learning_rate": 0.0002, + "loss": 1.1986, + "step": 170 + }, + { + "epoch": 0.23746701846965698, + "grad_norm": 0.39086055755615234, + "learning_rate": 0.0002, + "loss": 1.2215, + "step": 180 + }, + { + "epoch": 0.25065963060686014, + "grad_norm": 0.4465520977973938, + "learning_rate": 0.0002, + "loss": 1.3315, + "step": 190 + }, + { + "epoch": 0.2638522427440633, + "grad_norm": 1.814679741859436, + "learning_rate": 0.0002, + "loss": 1.3798, + "step": 200 + }, + { + "epoch": 0.2770448548812665, + "grad_norm": 0.5026423931121826, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 210 + }, + { + "epoch": 0.29023746701846964, + "grad_norm": 0.4156292974948883, + "learning_rate": 0.0002, + "loss": 1.3306, + "step": 220 + }, + { + "epoch": 0.3034300791556728, + "grad_norm": 0.40813493728637695, + "learning_rate": 0.0002, + "loss": 1.266, + "step": 230 + }, + { + "epoch": 0.316622691292876, + "grad_norm": 0.3304787874221802, + "learning_rate": 0.0002, + "loss": 1.1533, + "step": 240 + }, + { + "epoch": 0.32981530343007914, + "grad_norm": 0.46139976382255554, + "learning_rate": 0.0002, + "loss": 1.3154, + "step": 250 + }, + { + "epoch": 0.34300791556728233, + "grad_norm": 0.37518271803855896, + "learning_rate": 0.0002, + "loss": 1.1365, + "step": 260 + }, + { + "epoch": 0.3562005277044855, + "grad_norm": 0.35586467385292053, + "learning_rate": 0.0002, + "loss": 1.2004, + "step": 270 + }, + { + "epoch": 0.36939313984168864, + "grad_norm": 0.32441186904907227, + "learning_rate": 0.0002, + "loss": 1.2609, + "step": 280 + }, + { + "epoch": 0.38258575197889183, + "grad_norm": 0.3198683261871338, + "learning_rate": 0.0002, + "loss": 1.212, + "step": 290 + }, + { + "epoch": 0.39577836411609496, + "grad_norm": 0.33663108944892883, + "learning_rate": 0.0002, + "loss": 1.2465, + "step": 300 + }, + { + "epoch": 0.40897097625329815, + "grad_norm": 0.3711244761943817, + "learning_rate": 0.0002, + "loss": 1.1702, + "step": 310 + }, + { + "epoch": 0.42216358839050133, + "grad_norm": 0.3209651708602905, + "learning_rate": 0.0002, + "loss": 1.0871, + "step": 320 + }, + { + "epoch": 0.43535620052770446, + "grad_norm": 0.5152716040611267, + "learning_rate": 0.0002, + "loss": 1.2728, + "step": 330 + }, + { + "epoch": 0.44854881266490765, + "grad_norm": 0.5431376695632935, + "learning_rate": 0.0002, + "loss": 1.1833, + "step": 340 + }, + { + "epoch": 0.46174142480211083, + "grad_norm": 0.3069997727870941, + "learning_rate": 0.0002, + "loss": 1.1344, + "step": 350 + }, + { + "epoch": 0.47493403693931396, + "grad_norm": 0.34260064363479614, + "learning_rate": 0.0002, + "loss": 1.2742, + "step": 360 + }, + { + "epoch": 0.48812664907651715, + "grad_norm": 0.345653235912323, + "learning_rate": 0.0002, + "loss": 1.3161, + "step": 370 + }, + { + "epoch": 0.5013192612137203, + "grad_norm": 0.46222734451293945, + "learning_rate": 0.0002, + "loss": 1.1552, + "step": 380 + }, + { + "epoch": 0.5145118733509235, + "grad_norm": 0.27301734685897827, + "learning_rate": 0.0002, + "loss": 1.0616, + "step": 390 + }, + { + "epoch": 0.5277044854881267, + "grad_norm": 0.29048439860343933, + "learning_rate": 0.0002, + "loss": 1.1944, + "step": 400 + }, + { + "epoch": 0.5408970976253298, + "grad_norm": 0.32927802205085754, + "learning_rate": 0.0002, + "loss": 1.0956, + "step": 410 + }, + { + "epoch": 0.554089709762533, + "grad_norm": 0.3336397409439087, + "learning_rate": 0.0002, + "loss": 1.2362, + "step": 420 + }, + { + "epoch": 0.5672823218997362, + "grad_norm": 0.4007597267627716, + "learning_rate": 0.0002, + "loss": 1.1445, + "step": 430 + }, + { + "epoch": 0.5804749340369393, + "grad_norm": 0.36144956946372986, + "learning_rate": 0.0002, + "loss": 1.2731, + "step": 440 + }, + { + "epoch": 0.5936675461741425, + "grad_norm": 0.6331009864807129, + "learning_rate": 0.0002, + "loss": 1.1604, + "step": 450 + }, + { + "epoch": 0.6068601583113457, + "grad_norm": 0.41469088196754456, + "learning_rate": 0.0002, + "loss": 1.1692, + "step": 460 + }, + { + "epoch": 0.6200527704485488, + "grad_norm": 0.4388185143470764, + "learning_rate": 0.0002, + "loss": 1.2372, + "step": 470 + }, + { + "epoch": 0.633245382585752, + "grad_norm": 0.3738141655921936, + "learning_rate": 0.0002, + "loss": 1.2541, + "step": 480 + }, + { + "epoch": 0.6464379947229552, + "grad_norm": 0.7212023138999939, + "learning_rate": 0.0002, + "loss": 1.1265, + "step": 490 + }, + { + "epoch": 0.6596306068601583, + "grad_norm": 0.2972351014614105, + "learning_rate": 0.0002, + "loss": 1.2786, + "step": 500 + }, + { + "epoch": 0.6728232189973615, + "grad_norm": 0.45293179154396057, + "learning_rate": 0.0002, + "loss": 1.1739, + "step": 510 + }, + { + "epoch": 0.6860158311345647, + "grad_norm": 0.4319860637187958, + "learning_rate": 0.0002, + "loss": 1.255, + "step": 520 + }, + { + "epoch": 0.6992084432717678, + "grad_norm": 0.3050215542316437, + "learning_rate": 0.0002, + "loss": 1.0959, + "step": 530 + }, + { + "epoch": 0.712401055408971, + "grad_norm": 0.3552611172199249, + "learning_rate": 0.0002, + "loss": 1.1608, + "step": 540 + }, + { + "epoch": 0.7255936675461742, + "grad_norm": 0.3631151020526886, + "learning_rate": 0.0002, + "loss": 1.2546, + "step": 550 + }, + { + "epoch": 0.7387862796833773, + "grad_norm": 0.28177931904792786, + "learning_rate": 0.0002, + "loss": 1.1371, + "step": 560 + }, + { + "epoch": 0.7519788918205804, + "grad_norm": 0.359764039516449, + "learning_rate": 0.0002, + "loss": 1.2081, + "step": 570 + }, + { + "epoch": 0.7651715039577837, + "grad_norm": 0.3970327377319336, + "learning_rate": 0.0002, + "loss": 1.1356, + "step": 580 + }, + { + "epoch": 0.7783641160949868, + "grad_norm": 0.3541001081466675, + "learning_rate": 0.0002, + "loss": 1.309, + "step": 590 + }, + { + "epoch": 0.7915567282321899, + "grad_norm": 0.3478573262691498, + "learning_rate": 0.0002, + "loss": 1.2061, + "step": 600 + }, + { + "epoch": 0.8047493403693932, + "grad_norm": 0.3900321424007416, + "learning_rate": 0.0002, + "loss": 1.1864, + "step": 610 + }, + { + "epoch": 0.8179419525065963, + "grad_norm": 0.3443238437175751, + "learning_rate": 0.0002, + "loss": 1.1358, + "step": 620 + }, + { + "epoch": 0.8311345646437994, + "grad_norm": 0.44238781929016113, + "learning_rate": 0.0002, + "loss": 1.3232, + "step": 630 + }, + { + "epoch": 0.8443271767810027, + "grad_norm": 0.36339467763900757, + "learning_rate": 0.0002, + "loss": 1.1247, + "step": 640 + }, + { + "epoch": 0.8575197889182058, + "grad_norm": 0.6243070363998413, + "learning_rate": 0.0002, + "loss": 1.1049, + "step": 650 + }, + { + "epoch": 0.8707124010554089, + "grad_norm": 0.3209173381328583, + "learning_rate": 0.0002, + "loss": 1.1943, + "step": 660 + }, + { + "epoch": 0.8839050131926122, + "grad_norm": 0.35017991065979004, + "learning_rate": 0.0002, + "loss": 1.1927, + "step": 670 + }, + { + "epoch": 0.8970976253298153, + "grad_norm": 0.3247159421443939, + "learning_rate": 0.0002, + "loss": 1.1603, + "step": 680 + }, + { + "epoch": 0.9102902374670184, + "grad_norm": 0.4091894030570984, + "learning_rate": 0.0002, + "loss": 1.2298, + "step": 690 + }, + { + "epoch": 0.9234828496042217, + "grad_norm": 0.3975585997104645, + "learning_rate": 0.0002, + "loss": 1.2756, + "step": 700 + }, + { + "epoch": 0.9366754617414248, + "grad_norm": 0.3666245937347412, + "learning_rate": 0.0002, + "loss": 1.281, + "step": 710 + }, + { + "epoch": 0.9498680738786279, + "grad_norm": 0.45216917991638184, + "learning_rate": 0.0002, + "loss": 1.2855, + "step": 720 + }, + { + "epoch": 0.9630606860158312, + "grad_norm": 0.36108118295669556, + "learning_rate": 0.0002, + "loss": 1.2217, + "step": 730 + }, + { + "epoch": 0.9762532981530343, + "grad_norm": 0.44550251960754395, + "learning_rate": 0.0002, + "loss": 1.17, + "step": 740 + }, + { + "epoch": 0.9894459102902374, + "grad_norm": 0.29801255464553833, + "learning_rate": 0.0002, + "loss": 1.0672, + "step": 750 + }, + { + "epoch": 1.0, + "eval_loss": 1.203244686126709, + "eval_runtime": 76.0457, + "eval_samples_per_second": 5.668, + "eval_steps_per_second": 0.71, + "step": 758 + }, + { + "epoch": 1.0026385224274406, + "grad_norm": 0.4096551239490509, + "learning_rate": 0.0002, + "loss": 1.0748, + "step": 760 + }, + { + "epoch": 1.0158311345646438, + "grad_norm": 0.2649582326412201, + "learning_rate": 0.0002, + "loss": 1.1537, + "step": 770 + }, + { + "epoch": 1.029023746701847, + "grad_norm": 0.3100722134113312, + "learning_rate": 0.0002, + "loss": 1.1676, + "step": 780 + }, + { + "epoch": 1.04221635883905, + "grad_norm": 0.3911755383014679, + "learning_rate": 0.0002, + "loss": 1.1736, + "step": 790 + }, + { + "epoch": 1.0554089709762533, + "grad_norm": 0.4600953757762909, + "learning_rate": 0.0002, + "loss": 1.0354, + "step": 800 + }, + { + "epoch": 1.0686015831134565, + "grad_norm": 0.28671619296073914, + "learning_rate": 0.0002, + "loss": 1.2869, + "step": 810 + }, + { + "epoch": 1.0817941952506596, + "grad_norm": 0.47282642126083374, + "learning_rate": 0.0002, + "loss": 1.1441, + "step": 820 + }, + { + "epoch": 1.0949868073878628, + "grad_norm": 0.690073549747467, + "learning_rate": 0.0002, + "loss": 1.1368, + "step": 830 + }, + { + "epoch": 1.108179419525066, + "grad_norm": 0.7317902445793152, + "learning_rate": 0.0002, + "loss": 0.9944, + "step": 840 + }, + { + "epoch": 1.121372031662269, + "grad_norm": 0.44215938448905945, + "learning_rate": 0.0002, + "loss": 1.108, + "step": 850 + }, + { + "epoch": 1.1345646437994723, + "grad_norm": 0.33875149488449097, + "learning_rate": 0.0002, + "loss": 1.0558, + "step": 860 + }, + { + "epoch": 1.1477572559366755, + "grad_norm": 0.3700002431869507, + "learning_rate": 0.0002, + "loss": 1.0471, + "step": 870 + }, + { + "epoch": 1.1609498680738786, + "grad_norm": 0.41173291206359863, + "learning_rate": 0.0002, + "loss": 1.0121, + "step": 880 + }, + { + "epoch": 1.1741424802110818, + "grad_norm": 0.5253589749336243, + "learning_rate": 0.0002, + "loss": 1.0312, + "step": 890 + }, + { + "epoch": 1.187335092348285, + "grad_norm": 0.3912237286567688, + "learning_rate": 0.0002, + "loss": 1.2238, + "step": 900 + }, + { + "epoch": 1.200527704485488, + "grad_norm": 0.40990331768989563, + "learning_rate": 0.0002, + "loss": 0.9434, + "step": 910 + }, + { + "epoch": 1.2137203166226913, + "grad_norm": 0.40377968549728394, + "learning_rate": 0.0002, + "loss": 1.0493, + "step": 920 + }, + { + "epoch": 1.2269129287598945, + "grad_norm": 0.4605846405029297, + "learning_rate": 0.0002, + "loss": 1.1496, + "step": 930 + }, + { + "epoch": 1.2401055408970976, + "grad_norm": 0.31564897298812866, + "learning_rate": 0.0002, + "loss": 1.1173, + "step": 940 + }, + { + "epoch": 1.2532981530343008, + "grad_norm": 0.39808550477027893, + "learning_rate": 0.0002, + "loss": 1.0547, + "step": 950 + }, + { + "epoch": 1.266490765171504, + "grad_norm": 0.3762115240097046, + "learning_rate": 0.0002, + "loss": 1.1367, + "step": 960 + }, + { + "epoch": 1.279683377308707, + "grad_norm": 0.4174984097480774, + "learning_rate": 0.0002, + "loss": 1.1596, + "step": 970 + }, + { + "epoch": 1.2928759894459103, + "grad_norm": 0.5263054966926575, + "learning_rate": 0.0002, + "loss": 1.1327, + "step": 980 + }, + { + "epoch": 1.3060686015831133, + "grad_norm": 0.41673699021339417, + "learning_rate": 0.0002, + "loss": 1.0339, + "step": 990 + }, + { + "epoch": 1.3192612137203166, + "grad_norm": 0.9613684415817261, + "learning_rate": 0.0002, + "loss": 1.1198, + "step": 1000 + }, + { + "epoch": 1.3324538258575198, + "grad_norm": 0.3690216839313507, + "learning_rate": 0.0002, + "loss": 1.0444, + "step": 1010 + }, + { + "epoch": 1.345646437994723, + "grad_norm": 0.521821141242981, + "learning_rate": 0.0002, + "loss": 1.0473, + "step": 1020 + }, + { + "epoch": 1.358839050131926, + "grad_norm": 0.3353094160556793, + "learning_rate": 0.0002, + "loss": 1.1065, + "step": 1030 + }, + { + "epoch": 1.3720316622691293, + "grad_norm": 0.3843843936920166, + "learning_rate": 0.0002, + "loss": 1.1286, + "step": 1040 + }, + { + "epoch": 1.3852242744063323, + "grad_norm": 0.372514545917511, + "learning_rate": 0.0002, + "loss": 1.1369, + "step": 1050 + }, + { + "epoch": 1.3984168865435356, + "grad_norm": 0.34537771344184875, + "learning_rate": 0.0002, + "loss": 1.0041, + "step": 1060 + }, + { + "epoch": 1.4116094986807388, + "grad_norm": 0.45349085330963135, + "learning_rate": 0.0002, + "loss": 1.0251, + "step": 1070 + }, + { + "epoch": 1.424802110817942, + "grad_norm": 0.5120177268981934, + "learning_rate": 0.0002, + "loss": 1.1873, + "step": 1080 + }, + { + "epoch": 1.437994722955145, + "grad_norm": 0.42800238728523254, + "learning_rate": 0.0002, + "loss": 1.0872, + "step": 1090 + }, + { + "epoch": 1.4511873350923483, + "grad_norm": 0.343832790851593, + "learning_rate": 0.0002, + "loss": 1.0734, + "step": 1100 + }, + { + "epoch": 1.4643799472295513, + "grad_norm": 0.3829841911792755, + "learning_rate": 0.0002, + "loss": 1.1286, + "step": 1110 + }, + { + "epoch": 1.4775725593667546, + "grad_norm": 0.4289931058883667, + "learning_rate": 0.0002, + "loss": 1.1268, + "step": 1120 + }, + { + "epoch": 1.4907651715039578, + "grad_norm": 0.42750850319862366, + "learning_rate": 0.0002, + "loss": 1.0676, + "step": 1130 + }, + { + "epoch": 1.503957783641161, + "grad_norm": 0.34328413009643555, + "learning_rate": 0.0002, + "loss": 1.072, + "step": 1140 + }, + { + "epoch": 1.517150395778364, + "grad_norm": 0.349096417427063, + "learning_rate": 0.0002, + "loss": 1.0863, + "step": 1150 + }, + { + "epoch": 1.5303430079155673, + "grad_norm": 0.7700717449188232, + "learning_rate": 0.0002, + "loss": 1.1462, + "step": 1160 + }, + { + "epoch": 1.5435356200527703, + "grad_norm": 0.39294949173927307, + "learning_rate": 0.0002, + "loss": 0.9528, + "step": 1170 + }, + { + "epoch": 1.5567282321899736, + "grad_norm": 0.36173608899116516, + "learning_rate": 0.0002, + "loss": 1.1603, + "step": 1180 + }, + { + "epoch": 1.5699208443271768, + "grad_norm": 0.6034277677536011, + "learning_rate": 0.0002, + "loss": 1.1508, + "step": 1190 + }, + { + "epoch": 1.58311345646438, + "grad_norm": 0.36694103479385376, + "learning_rate": 0.0002, + "loss": 1.0105, + "step": 1200 + }, + { + "epoch": 1.596306068601583, + "grad_norm": 0.4727209508419037, + "learning_rate": 0.0002, + "loss": 1.1479, + "step": 1210 + }, + { + "epoch": 1.6094986807387863, + "grad_norm": 0.6482883095741272, + "learning_rate": 0.0002, + "loss": 1.0689, + "step": 1220 + }, + { + "epoch": 1.6226912928759893, + "grad_norm": 0.5238035917282104, + "learning_rate": 0.0002, + "loss": 1.1405, + "step": 1230 + }, + { + "epoch": 1.6358839050131926, + "grad_norm": 0.4812222421169281, + "learning_rate": 0.0002, + "loss": 1.0596, + "step": 1240 + }, + { + "epoch": 1.6490765171503958, + "grad_norm": 0.7131702303886414, + "learning_rate": 0.0002, + "loss": 1.2729, + "step": 1250 + }, + { + "epoch": 1.662269129287599, + "grad_norm": 0.3803327977657318, + "learning_rate": 0.0002, + "loss": 0.9832, + "step": 1260 + }, + { + "epoch": 1.675461741424802, + "grad_norm": 0.3745088577270508, + "learning_rate": 0.0002, + "loss": 1.1433, + "step": 1270 + }, + { + "epoch": 1.6886543535620053, + "grad_norm": 0.4427378475666046, + "learning_rate": 0.0002, + "loss": 1.1018, + "step": 1280 + }, + { + "epoch": 1.7018469656992083, + "grad_norm": 0.797478973865509, + "learning_rate": 0.0002, + "loss": 1.0619, + "step": 1290 + }, + { + "epoch": 1.7150395778364116, + "grad_norm": 0.503620982170105, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 1300 + }, + { + "epoch": 1.7282321899736148, + "grad_norm": 0.4132426381111145, + "learning_rate": 0.0002, + "loss": 1.1324, + "step": 1310 + }, + { + "epoch": 1.741424802110818, + "grad_norm": 0.41811656951904297, + "learning_rate": 0.0002, + "loss": 1.172, + "step": 1320 + }, + { + "epoch": 1.754617414248021, + "grad_norm": 0.40647849440574646, + "learning_rate": 0.0002, + "loss": 1.0903, + "step": 1330 + }, + { + "epoch": 1.767810026385224, + "grad_norm": 0.42138347029685974, + "learning_rate": 0.0002, + "loss": 1.1316, + "step": 1340 + }, + { + "epoch": 1.7810026385224274, + "grad_norm": 0.46523579955101013, + "learning_rate": 0.0002, + "loss": 0.9955, + "step": 1350 + }, + { + "epoch": 1.7941952506596306, + "grad_norm": 0.39760419726371765, + "learning_rate": 0.0002, + "loss": 1.0307, + "step": 1360 + }, + { + "epoch": 1.8073878627968338, + "grad_norm": 0.37993717193603516, + "learning_rate": 0.0002, + "loss": 1.1218, + "step": 1370 + }, + { + "epoch": 1.820580474934037, + "grad_norm": 0.5404181480407715, + "learning_rate": 0.0002, + "loss": 1.0921, + "step": 1380 + }, + { + "epoch": 1.83377308707124, + "grad_norm": 0.4385245740413666, + "learning_rate": 0.0002, + "loss": 1.0903, + "step": 1390 + }, + { + "epoch": 1.8469656992084431, + "grad_norm": 0.529797375202179, + "learning_rate": 0.0002, + "loss": 1.1175, + "step": 1400 + }, + { + "epoch": 1.8601583113456464, + "grad_norm": 0.481567919254303, + "learning_rate": 0.0002, + "loss": 1.0577, + "step": 1410 + }, + { + "epoch": 1.8733509234828496, + "grad_norm": 0.34787362813949585, + "learning_rate": 0.0002, + "loss": 1.0581, + "step": 1420 + }, + { + "epoch": 1.8865435356200528, + "grad_norm": 0.6402362585067749, + "learning_rate": 0.0002, + "loss": 1.0536, + "step": 1430 + }, + { + "epoch": 1.899736147757256, + "grad_norm": 0.3461322784423828, + "learning_rate": 0.0002, + "loss": 1.0787, + "step": 1440 + }, + { + "epoch": 1.912928759894459, + "grad_norm": 0.44005653262138367, + "learning_rate": 0.0002, + "loss": 1.0925, + "step": 1450 + }, + { + "epoch": 1.9261213720316621, + "grad_norm": 0.4064280688762665, + "learning_rate": 0.0002, + "loss": 1.0414, + "step": 1460 + }, + { + "epoch": 1.9393139841688654, + "grad_norm": 0.5236523151397705, + "learning_rate": 0.0002, + "loss": 1.0608, + "step": 1470 + }, + { + "epoch": 1.9525065963060686, + "grad_norm": 0.41030219197273254, + "learning_rate": 0.0002, + "loss": 1.0572, + "step": 1480 + }, + { + "epoch": 1.9656992084432718, + "grad_norm": 0.39805835485458374, + "learning_rate": 0.0002, + "loss": 1.1204, + "step": 1490 + }, + { + "epoch": 1.978891820580475, + "grad_norm": 0.42974501848220825, + "learning_rate": 0.0002, + "loss": 1.0364, + "step": 1500 + }, + { + "epoch": 1.992084432717678, + "grad_norm": 0.4688243865966797, + "learning_rate": 0.0002, + "loss": 1.1169, + "step": 1510 + }, + { + "epoch": 2.0, + "eval_loss": 1.1874967813491821, + "eval_runtime": 71.9523, + "eval_samples_per_second": 5.99, + "eval_steps_per_second": 0.75, + "step": 1516 + }, + { + "epoch": 2.005277044854881, + "grad_norm": 0.4121631383895874, + "learning_rate": 0.0002, + "loss": 1.0401, + "step": 1520 + }, + { + "epoch": 2.0184696569920844, + "grad_norm": 0.4844197928905487, + "learning_rate": 0.0002, + "loss": 0.9384, + "step": 1530 + }, + { + "epoch": 2.0316622691292876, + "grad_norm": 0.45408546924591064, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 1540 + }, + { + "epoch": 2.044854881266491, + "grad_norm": 0.48662951588630676, + "learning_rate": 0.0002, + "loss": 1.0399, + "step": 1550 + }, + { + "epoch": 2.058047493403694, + "grad_norm": 0.7195899486541748, + "learning_rate": 0.0002, + "loss": 0.936, + "step": 1560 + }, + { + "epoch": 2.0712401055408973, + "grad_norm": 0.5071077346801758, + "learning_rate": 0.0002, + "loss": 0.9486, + "step": 1570 + }, + { + "epoch": 2.0844327176781, + "grad_norm": 0.7473958730697632, + "learning_rate": 0.0002, + "loss": 1.0055, + "step": 1580 + }, + { + "epoch": 2.0976253298153034, + "grad_norm": 0.5509232878684998, + "learning_rate": 0.0002, + "loss": 0.8309, + "step": 1590 + }, + { + "epoch": 2.1108179419525066, + "grad_norm": 0.5108042359352112, + "learning_rate": 0.0002, + "loss": 0.9181, + "step": 1600 + }, + { + "epoch": 2.12401055408971, + "grad_norm": 0.42331448197364807, + "learning_rate": 0.0002, + "loss": 0.9499, + "step": 1610 + }, + { + "epoch": 2.137203166226913, + "grad_norm": 0.46621623635292053, + "learning_rate": 0.0002, + "loss": 0.9359, + "step": 1620 + }, + { + "epoch": 2.150395778364116, + "grad_norm": 0.43802836537361145, + "learning_rate": 0.0002, + "loss": 0.9065, + "step": 1630 + }, + { + "epoch": 2.163588390501319, + "grad_norm": 0.49908021092414856, + "learning_rate": 0.0002, + "loss": 0.9375, + "step": 1640 + }, + { + "epoch": 2.1767810026385224, + "grad_norm": 0.4195636808872223, + "learning_rate": 0.0002, + "loss": 1.0389, + "step": 1650 + }, + { + "epoch": 2.1899736147757256, + "grad_norm": 0.49515822529792786, + "learning_rate": 0.0002, + "loss": 0.8501, + "step": 1660 + }, + { + "epoch": 2.203166226912929, + "grad_norm": 0.4607589542865753, + "learning_rate": 0.0002, + "loss": 1.0557, + "step": 1670 + }, + { + "epoch": 2.216358839050132, + "grad_norm": 0.4489196836948395, + "learning_rate": 0.0002, + "loss": 1.0206, + "step": 1680 + }, + { + "epoch": 2.229551451187335, + "grad_norm": 0.49300864338874817, + "learning_rate": 0.0002, + "loss": 0.9657, + "step": 1690 + }, + { + "epoch": 2.242744063324538, + "grad_norm": 0.6624954342842102, + "learning_rate": 0.0002, + "loss": 0.8371, + "step": 1700 + }, + { + "epoch": 2.2559366754617414, + "grad_norm": 0.8391500115394592, + "learning_rate": 0.0002, + "loss": 0.8555, + "step": 1710 + }, + { + "epoch": 2.2691292875989446, + "grad_norm": 0.5193073749542236, + "learning_rate": 0.0002, + "loss": 1.0113, + "step": 1720 + }, + { + "epoch": 2.282321899736148, + "grad_norm": 0.6180613040924072, + "learning_rate": 0.0002, + "loss": 0.9979, + "step": 1730 + }, + { + "epoch": 2.295514511873351, + "grad_norm": 0.591191291809082, + "learning_rate": 0.0002, + "loss": 0.9579, + "step": 1740 + }, + { + "epoch": 2.308707124010554, + "grad_norm": 0.546897828578949, + "learning_rate": 0.0002, + "loss": 0.8879, + "step": 1750 + }, + { + "epoch": 2.321899736147757, + "grad_norm": 0.5470401644706726, + "learning_rate": 0.0002, + "loss": 0.9321, + "step": 1760 + }, + { + "epoch": 2.3350923482849604, + "grad_norm": 0.4590282738208771, + "learning_rate": 0.0002, + "loss": 0.9104, + "step": 1770 + }, + { + "epoch": 2.3482849604221636, + "grad_norm": 0.622164785861969, + "learning_rate": 0.0002, + "loss": 1.0384, + "step": 1780 + }, + { + "epoch": 2.361477572559367, + "grad_norm": 0.5753812193870544, + "learning_rate": 0.0002, + "loss": 1.0297, + "step": 1790 + }, + { + "epoch": 2.37467018469657, + "grad_norm": 0.47958624362945557, + "learning_rate": 0.0002, + "loss": 0.9569, + "step": 1800 + }, + { + "epoch": 2.387862796833773, + "grad_norm": 0.48042672872543335, + "learning_rate": 0.0002, + "loss": 1.0519, + "step": 1810 + }, + { + "epoch": 2.401055408970976, + "grad_norm": 0.44586366415023804, + "learning_rate": 0.0002, + "loss": 0.9042, + "step": 1820 + }, + { + "epoch": 2.4142480211081794, + "grad_norm": 0.7239416837692261, + "learning_rate": 0.0002, + "loss": 0.9783, + "step": 1830 + }, + { + "epoch": 2.4274406332453826, + "grad_norm": 0.5515341758728027, + "learning_rate": 0.0002, + "loss": 0.8818, + "step": 1840 + }, + { + "epoch": 2.440633245382586, + "grad_norm": 0.6280064582824707, + "learning_rate": 0.0002, + "loss": 0.9503, + "step": 1850 + }, + { + "epoch": 2.453825857519789, + "grad_norm": 0.4832057058811188, + "learning_rate": 0.0002, + "loss": 0.8943, + "step": 1860 + }, + { + "epoch": 2.467018469656992, + "grad_norm": 0.5789321064949036, + "learning_rate": 0.0002, + "loss": 0.8744, + "step": 1870 + }, + { + "epoch": 2.480211081794195, + "grad_norm": 0.48491886258125305, + "learning_rate": 0.0002, + "loss": 0.9332, + "step": 1880 + }, + { + "epoch": 2.4934036939313984, + "grad_norm": 0.532365620136261, + "learning_rate": 0.0002, + "loss": 0.943, + "step": 1890 + }, + { + "epoch": 2.5065963060686016, + "grad_norm": 0.7087852954864502, + "learning_rate": 0.0002, + "loss": 0.9734, + "step": 1900 + }, + { + "epoch": 2.519788918205805, + "grad_norm": 0.48157402873039246, + "learning_rate": 0.0002, + "loss": 0.9767, + "step": 1910 + }, + { + "epoch": 2.532981530343008, + "grad_norm": 0.5886041522026062, + "learning_rate": 0.0002, + "loss": 0.9851, + "step": 1920 + }, + { + "epoch": 2.5461741424802113, + "grad_norm": 0.6332622766494751, + "learning_rate": 0.0002, + "loss": 1.0144, + "step": 1930 + }, + { + "epoch": 2.559366754617414, + "grad_norm": 0.5463117957115173, + "learning_rate": 0.0002, + "loss": 0.9516, + "step": 1940 + }, + { + "epoch": 2.5725593667546174, + "grad_norm": 0.5432228446006775, + "learning_rate": 0.0002, + "loss": 0.9373, + "step": 1950 + }, + { + "epoch": 2.5857519788918206, + "grad_norm": 0.5929186940193176, + "learning_rate": 0.0002, + "loss": 0.8974, + "step": 1960 + }, + { + "epoch": 2.598944591029024, + "grad_norm": 0.5120641589164734, + "learning_rate": 0.0002, + "loss": 1.0062, + "step": 1970 + }, + { + "epoch": 2.6121372031662267, + "grad_norm": 0.5372339487075806, + "learning_rate": 0.0002, + "loss": 0.9143, + "step": 1980 + }, + { + "epoch": 2.62532981530343, + "grad_norm": 0.5519838929176331, + "learning_rate": 0.0002, + "loss": 1.0786, + "step": 1990 + }, + { + "epoch": 2.638522427440633, + "grad_norm": 0.7304037809371948, + "learning_rate": 0.0002, + "loss": 1.021, + "step": 2000 + }, + { + "epoch": 2.6517150395778364, + "grad_norm": 0.6182340979576111, + "learning_rate": 0.0002, + "loss": 0.8708, + "step": 2010 + }, + { + "epoch": 2.6649076517150396, + "grad_norm": 0.4874444305896759, + "learning_rate": 0.0002, + "loss": 1.0501, + "step": 2020 + }, + { + "epoch": 2.678100263852243, + "grad_norm": 0.5850239396095276, + "learning_rate": 0.0002, + "loss": 0.9612, + "step": 2030 + }, + { + "epoch": 2.691292875989446, + "grad_norm": 0.6495311856269836, + "learning_rate": 0.0002, + "loss": 0.9808, + "step": 2040 + }, + { + "epoch": 2.7044854881266494, + "grad_norm": 1.002830147743225, + "learning_rate": 0.0002, + "loss": 1.0187, + "step": 2050 + }, + { + "epoch": 2.717678100263852, + "grad_norm": 0.49076753854751587, + "learning_rate": 0.0002, + "loss": 1.0386, + "step": 2060 + }, + { + "epoch": 2.7308707124010554, + "grad_norm": 0.4736326336860657, + "learning_rate": 0.0002, + "loss": 0.9416, + "step": 2070 + }, + { + "epoch": 2.7440633245382586, + "grad_norm": 0.5527601838111877, + "learning_rate": 0.0002, + "loss": 0.8424, + "step": 2080 + }, + { + "epoch": 2.757255936675462, + "grad_norm": 0.7295718193054199, + "learning_rate": 0.0002, + "loss": 0.9149, + "step": 2090 + }, + { + "epoch": 2.7704485488126647, + "grad_norm": 0.5437536835670471, + "learning_rate": 0.0002, + "loss": 0.9032, + "step": 2100 + }, + { + "epoch": 2.783641160949868, + "grad_norm": 0.5997128486633301, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 2110 + }, + { + "epoch": 2.796833773087071, + "grad_norm": 0.6498191356658936, + "learning_rate": 0.0002, + "loss": 0.976, + "step": 2120 + }, + { + "epoch": 2.8100263852242744, + "grad_norm": 0.5237268805503845, + "learning_rate": 0.0002, + "loss": 0.9543, + "step": 2130 + }, + { + "epoch": 2.8232189973614776, + "grad_norm": 0.6033027172088623, + "learning_rate": 0.0002, + "loss": 0.9302, + "step": 2140 + }, + { + "epoch": 2.836411609498681, + "grad_norm": 0.6077138781547546, + "learning_rate": 0.0002, + "loss": 0.9625, + "step": 2150 + }, + { + "epoch": 2.849604221635884, + "grad_norm": 0.4127797484397888, + "learning_rate": 0.0002, + "loss": 0.9347, + "step": 2160 + }, + { + "epoch": 2.862796833773087, + "grad_norm": 0.8448635339736938, + "learning_rate": 0.0002, + "loss": 1.0459, + "step": 2170 + }, + { + "epoch": 2.87598944591029, + "grad_norm": 0.5669729113578796, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 2180 + }, + { + "epoch": 2.8891820580474934, + "grad_norm": 0.510231077671051, + "learning_rate": 0.0002, + "loss": 0.8555, + "step": 2190 + }, + { + "epoch": 2.9023746701846966, + "grad_norm": 0.8072245121002197, + "learning_rate": 0.0002, + "loss": 0.9267, + "step": 2200 + }, + { + "epoch": 2.9155672823219, + "grad_norm": 0.6055923104286194, + "learning_rate": 0.0002, + "loss": 0.9685, + "step": 2210 + }, + { + "epoch": 2.9287598944591027, + "grad_norm": 0.7384416460990906, + "learning_rate": 0.0002, + "loss": 0.9157, + "step": 2220 + }, + { + "epoch": 2.941952506596306, + "grad_norm": 0.4922751784324646, + "learning_rate": 0.0002, + "loss": 0.9712, + "step": 2230 + }, + { + "epoch": 2.955145118733509, + "grad_norm": 0.6039906740188599, + "learning_rate": 0.0002, + "loss": 0.9487, + "step": 2240 + }, + { + "epoch": 2.9683377308707124, + "grad_norm": 0.4751701354980469, + "learning_rate": 0.0002, + "loss": 0.9776, + "step": 2250 + }, + { + "epoch": 2.9815303430079156, + "grad_norm": 0.5698353052139282, + "learning_rate": 0.0002, + "loss": 1.0619, + "step": 2260 + }, + { + "epoch": 2.994722955145119, + "grad_norm": 0.893563449382782, + "learning_rate": 0.0002, + "loss": 1.1184, + "step": 2270 + }, + { + "epoch": 3.0, + "eval_loss": 1.2046419382095337, + "eval_runtime": 71.5992, + "eval_samples_per_second": 6.02, + "eval_steps_per_second": 0.754, + "step": 2274 + }, + { + "epoch": 3.007915567282322, + "grad_norm": 0.41119325160980225, + "learning_rate": 0.0002, + "loss": 0.8269, + "step": 2280 + }, + { + "epoch": 3.021108179419525, + "grad_norm": 0.8169420957565308, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 2290 + }, + { + "epoch": 3.034300791556728, + "grad_norm": 0.6033818125724792, + "learning_rate": 0.0002, + "loss": 0.794, + "step": 2300 + }, + { + "epoch": 3.0474934036939314, + "grad_norm": 0.9600058197975159, + "learning_rate": 0.0002, + "loss": 0.7607, + "step": 2310 + }, + { + "epoch": 3.0606860158311346, + "grad_norm": 0.5859250426292419, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 2320 + }, + { + "epoch": 3.073878627968338, + "grad_norm": 0.6758618950843811, + "learning_rate": 0.0002, + "loss": 0.7598, + "step": 2330 + }, + { + "epoch": 3.0870712401055407, + "grad_norm": 0.8407140970230103, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 2340 + }, + { + "epoch": 3.100263852242744, + "grad_norm": 0.767779529094696, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 2350 + }, + { + "epoch": 3.113456464379947, + "grad_norm": 0.5572896599769592, + "learning_rate": 0.0002, + "loss": 0.7121, + "step": 2360 + }, + { + "epoch": 3.1266490765171504, + "grad_norm": 0.5908368825912476, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 2370 + }, + { + "epoch": 3.1398416886543536, + "grad_norm": 0.8047826290130615, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 2380 + }, + { + "epoch": 3.153034300791557, + "grad_norm": 0.8041718006134033, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 2390 + }, + { + "epoch": 3.16622691292876, + "grad_norm": 0.57078617811203, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 2400 + }, + { + "epoch": 3.179419525065963, + "grad_norm": 0.5125322937965393, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 2410 + }, + { + "epoch": 3.192612137203166, + "grad_norm": 0.6356934309005737, + "learning_rate": 0.0002, + "loss": 0.741, + "step": 2420 + }, + { + "epoch": 3.2058047493403694, + "grad_norm": 1.0129680633544922, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 2430 + }, + { + "epoch": 3.2189973614775726, + "grad_norm": 0.8104226589202881, + "learning_rate": 0.0002, + "loss": 0.8316, + "step": 2440 + }, + { + "epoch": 3.232189973614776, + "grad_norm": 0.7276079058647156, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 2450 + }, + { + "epoch": 3.2453825857519787, + "grad_norm": 0.9753884077072144, + "learning_rate": 0.0002, + "loss": 0.8183, + "step": 2460 + }, + { + "epoch": 3.258575197889182, + "grad_norm": 0.9753183722496033, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 2470 + }, + { + "epoch": 3.271767810026385, + "grad_norm": 0.6791225075721741, + "learning_rate": 0.0002, + "loss": 0.8815, + "step": 2480 + }, + { + "epoch": 3.2849604221635884, + "grad_norm": 0.6797150373458862, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 2490 + }, + { + "epoch": 3.2981530343007917, + "grad_norm": 0.8107194900512695, + "learning_rate": 0.0002, + "loss": 0.8395, + "step": 2500 + }, + { + "epoch": 3.311345646437995, + "grad_norm": 0.5878375172615051, + "learning_rate": 0.0002, + "loss": 0.7869, + "step": 2510 + }, + { + "epoch": 3.324538258575198, + "grad_norm": 0.5882975459098816, + "learning_rate": 0.0002, + "loss": 0.7992, + "step": 2520 + }, + { + "epoch": 3.337730870712401, + "grad_norm": 0.6180013418197632, + "learning_rate": 0.0002, + "loss": 0.7472, + "step": 2530 + }, + { + "epoch": 3.350923482849604, + "grad_norm": 1.0008151531219482, + "learning_rate": 0.0002, + "loss": 0.8033, + "step": 2540 + }, + { + "epoch": 3.3641160949868074, + "grad_norm": 0.6404656767845154, + "learning_rate": 0.0002, + "loss": 0.8464, + "step": 2550 + }, + { + "epoch": 3.3773087071240107, + "grad_norm": 0.8481354117393494, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 2560 + }, + { + "epoch": 3.390501319261214, + "grad_norm": 0.8068035244941711, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 2570 + }, + { + "epoch": 3.4036939313984167, + "grad_norm": 0.7477166056632996, + "learning_rate": 0.0002, + "loss": 0.8621, + "step": 2580 + }, + { + "epoch": 3.41688654353562, + "grad_norm": 0.6202635765075684, + "learning_rate": 0.0002, + "loss": 0.8352, + "step": 2590 + }, + { + "epoch": 3.430079155672823, + "grad_norm": 0.6981159448623657, + "learning_rate": 0.0002, + "loss": 0.7572, + "step": 2600 + }, + { + "epoch": 3.4432717678100264, + "grad_norm": 0.6611084342002869, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 2610 + }, + { + "epoch": 3.4564643799472297, + "grad_norm": 0.5727696418762207, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 2620 + }, + { + "epoch": 3.469656992084433, + "grad_norm": 1.2354545593261719, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 2630 + }, + { + "epoch": 3.4828496042216357, + "grad_norm": 0.6347638368606567, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 2640 + }, + { + "epoch": 3.496042216358839, + "grad_norm": 0.6975704431533813, + "learning_rate": 0.0002, + "loss": 0.8426, + "step": 2650 + }, + { + "epoch": 3.509234828496042, + "grad_norm": 0.6569573879241943, + "learning_rate": 0.0002, + "loss": 0.8773, + "step": 2660 + }, + { + "epoch": 3.5224274406332454, + "grad_norm": 0.6979609131813049, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 2670 + }, + { + "epoch": 3.5356200527704487, + "grad_norm": 0.6287988424301147, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 2680 + }, + { + "epoch": 3.5488126649076515, + "grad_norm": 0.8682637214660645, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 2690 + }, + { + "epoch": 3.5620052770448547, + "grad_norm": 0.7062831521034241, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 2700 + }, + { + "epoch": 3.575197889182058, + "grad_norm": 1.0061452388763428, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 2710 + }, + { + "epoch": 3.588390501319261, + "grad_norm": 0.719097375869751, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 2720 + }, + { + "epoch": 3.6015831134564644, + "grad_norm": 0.7583496570587158, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 2730 + }, + { + "epoch": 3.6147757255936677, + "grad_norm": 0.7543531060218811, + "learning_rate": 0.0002, + "loss": 0.91, + "step": 2740 + }, + { + "epoch": 3.627968337730871, + "grad_norm": 0.8873646855354309, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 2750 + }, + { + "epoch": 3.641160949868074, + "grad_norm": 1.0657562017440796, + "learning_rate": 0.0002, + "loss": 0.7116, + "step": 2760 + }, + { + "epoch": 3.654353562005277, + "grad_norm": 0.8641113638877869, + "learning_rate": 0.0002, + "loss": 0.8291, + "step": 2770 + }, + { + "epoch": 3.66754617414248, + "grad_norm": 0.6620645523071289, + "learning_rate": 0.0002, + "loss": 0.8302, + "step": 2780 + }, + { + "epoch": 3.6807387862796834, + "grad_norm": 0.6919541954994202, + "learning_rate": 0.0002, + "loss": 0.8261, + "step": 2790 + }, + { + "epoch": 3.6939313984168867, + "grad_norm": 0.7305743098258972, + "learning_rate": 0.0002, + "loss": 0.8388, + "step": 2800 + }, + { + "epoch": 3.7071240105540895, + "grad_norm": 0.7464777827262878, + "learning_rate": 0.0002, + "loss": 0.8053, + "step": 2810 + }, + { + "epoch": 3.7203166226912927, + "grad_norm": 0.8067063093185425, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 2820 + }, + { + "epoch": 3.733509234828496, + "grad_norm": 0.7789416313171387, + "learning_rate": 0.0002, + "loss": 0.8259, + "step": 2830 + }, + { + "epoch": 3.746701846965699, + "grad_norm": 0.507529079914093, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 2840 + }, + { + "epoch": 3.7598944591029024, + "grad_norm": 0.6509260535240173, + "learning_rate": 0.0002, + "loss": 0.832, + "step": 2850 + }, + { + "epoch": 3.7730870712401057, + "grad_norm": 0.9141367673873901, + "learning_rate": 0.0002, + "loss": 0.8257, + "step": 2860 + }, + { + "epoch": 3.786279683377309, + "grad_norm": 0.7852635979652405, + "learning_rate": 0.0002, + "loss": 0.9436, + "step": 2870 + }, + { + "epoch": 3.7994722955145117, + "grad_norm": 0.5340318083763123, + "learning_rate": 0.0002, + "loss": 0.8842, + "step": 2880 + }, + { + "epoch": 3.812664907651715, + "grad_norm": 0.6246042847633362, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 2890 + }, + { + "epoch": 3.825857519788918, + "grad_norm": 0.7064066529273987, + "learning_rate": 0.0002, + "loss": 0.8184, + "step": 2900 + }, + { + "epoch": 3.8390501319261214, + "grad_norm": 0.6144065856933594, + "learning_rate": 0.0002, + "loss": 0.8515, + "step": 2910 + }, + { + "epoch": 3.8522427440633247, + "grad_norm": 0.5268424153327942, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 2920 + }, + { + "epoch": 3.8654353562005275, + "grad_norm": 0.9508116841316223, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 2930 + }, + { + "epoch": 3.8786279683377307, + "grad_norm": 0.9133715629577637, + "learning_rate": 0.0002, + "loss": 0.8437, + "step": 2940 + }, + { + "epoch": 3.891820580474934, + "grad_norm": 1.0144646167755127, + "learning_rate": 0.0002, + "loss": 0.8611, + "step": 2950 + }, + { + "epoch": 3.905013192612137, + "grad_norm": 0.6397877931594849, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 2960 + }, + { + "epoch": 3.9182058047493404, + "grad_norm": 0.734835147857666, + "learning_rate": 0.0002, + "loss": 0.8285, + "step": 2970 + }, + { + "epoch": 3.9313984168865437, + "grad_norm": 0.784853994846344, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 2980 + }, + { + "epoch": 3.944591029023747, + "grad_norm": 0.805831789970398, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 2990 + }, + { + "epoch": 3.9577836411609497, + "grad_norm": 0.6299595236778259, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 3000 + }, + { + "epoch": 3.970976253298153, + "grad_norm": 0.6264058351516724, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 3010 + }, + { + "epoch": 3.984168865435356, + "grad_norm": 0.6419739723205566, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 3020 + }, + { + "epoch": 3.9973614775725594, + "grad_norm": 0.7737036943435669, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 3030 + }, + { + "epoch": 4.0, + "eval_loss": 1.2454297542572021, + "eval_runtime": 71.8558, + "eval_samples_per_second": 5.998, + "eval_steps_per_second": 0.752, + "step": 3032 + }, + { + "epoch": 4.010554089709762, + "grad_norm": 1.092727541923523, + "learning_rate": 0.0002, + "loss": 0.6716, + "step": 3040 + }, + { + "epoch": 4.0237467018469655, + "grad_norm": 0.8087759613990784, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 3050 + }, + { + "epoch": 4.036939313984169, + "grad_norm": 0.8106053471565247, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 3060 + }, + { + "epoch": 4.050131926121372, + "grad_norm": 0.8675326704978943, + "learning_rate": 0.0002, + "loss": 0.6846, + "step": 3070 + }, + { + "epoch": 4.063324538258575, + "grad_norm": 0.9620490074157715, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 3080 + }, + { + "epoch": 4.076517150395778, + "grad_norm": 0.8996296525001526, + "learning_rate": 0.0002, + "loss": 0.6047, + "step": 3090 + }, + { + "epoch": 4.089709762532982, + "grad_norm": 0.8648998737335205, + "learning_rate": 0.0002, + "loss": 0.6111, + "step": 3100 + }, + { + "epoch": 4.102902374670185, + "grad_norm": 1.0321335792541504, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 3110 + }, + { + "epoch": 4.116094986807388, + "grad_norm": 0.7949225306510925, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 3120 + }, + { + "epoch": 4.129287598944591, + "grad_norm": 0.9684646129608154, + "learning_rate": 0.0002, + "loss": 0.6354, + "step": 3130 + }, + { + "epoch": 4.142480211081795, + "grad_norm": 0.8698066473007202, + "learning_rate": 0.0002, + "loss": 0.6198, + "step": 3140 + }, + { + "epoch": 4.155672823218997, + "grad_norm": 0.7688450813293457, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 3150 + }, + { + "epoch": 4.1688654353562, + "grad_norm": 0.9682092070579529, + "learning_rate": 0.0002, + "loss": 0.6053, + "step": 3160 + }, + { + "epoch": 4.1820580474934035, + "grad_norm": 0.961561918258667, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 3170 + }, + { + "epoch": 4.195250659630607, + "grad_norm": 1.3962990045547485, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 3180 + }, + { + "epoch": 4.20844327176781, + "grad_norm": 0.9485045075416565, + "learning_rate": 0.0002, + "loss": 0.6319, + "step": 3190 + }, + { + "epoch": 4.221635883905013, + "grad_norm": 0.7768281698226929, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 3200 + }, + { + "epoch": 4.2348284960422165, + "grad_norm": 1.2685691118240356, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 3210 + }, + { + "epoch": 4.24802110817942, + "grad_norm": 0.6876471638679504, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 3220 + }, + { + "epoch": 4.261213720316623, + "grad_norm": 1.0074554681777954, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 3230 + }, + { + "epoch": 4.274406332453826, + "grad_norm": 0.8094777464866638, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 3240 + }, + { + "epoch": 4.287598944591029, + "grad_norm": 0.7906569242477417, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 3250 + }, + { + "epoch": 4.300791556728232, + "grad_norm": 0.840238630771637, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 3260 + }, + { + "epoch": 4.313984168865435, + "grad_norm": 1.0119295120239258, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 3270 + }, + { + "epoch": 4.327176781002638, + "grad_norm": 0.7943191528320312, + "learning_rate": 0.0002, + "loss": 0.5912, + "step": 3280 + }, + { + "epoch": 4.3403693931398415, + "grad_norm": 0.7691723704338074, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 3290 + }, + { + "epoch": 4.353562005277045, + "grad_norm": 0.7227770686149597, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 3300 + }, + { + "epoch": 4.366754617414248, + "grad_norm": 0.8512253165245056, + "learning_rate": 0.0002, + "loss": 0.6047, + "step": 3310 + }, + { + "epoch": 4.379947229551451, + "grad_norm": 0.7852529287338257, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 3320 + }, + { + "epoch": 4.3931398416886545, + "grad_norm": 0.8888797163963318, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 3330 + }, + { + "epoch": 4.406332453825858, + "grad_norm": 0.9522430896759033, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 3340 + }, + { + "epoch": 4.419525065963061, + "grad_norm": 0.900276780128479, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 3350 + }, + { + "epoch": 4.432717678100264, + "grad_norm": 1.181547999382019, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 3360 + }, + { + "epoch": 4.445910290237467, + "grad_norm": 0.903142511844635, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 3370 + }, + { + "epoch": 4.45910290237467, + "grad_norm": 0.8747565150260925, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 3380 + }, + { + "epoch": 4.472295514511873, + "grad_norm": 0.7838051319122314, + "learning_rate": 0.0002, + "loss": 0.7071, + "step": 3390 + }, + { + "epoch": 4.485488126649076, + "grad_norm": 0.8691313862800598, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 3400 + }, + { + "epoch": 4.4986807387862795, + "grad_norm": 0.8493868708610535, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 3410 + }, + { + "epoch": 4.511873350923483, + "grad_norm": 1.0104830265045166, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 3420 + }, + { + "epoch": 4.525065963060686, + "grad_norm": 1.1716967821121216, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 3430 + }, + { + "epoch": 4.538258575197889, + "grad_norm": 0.9122593998908997, + "learning_rate": 0.0002, + "loss": 0.6411, + "step": 3440 + }, + { + "epoch": 4.5514511873350925, + "grad_norm": 0.829090416431427, + "learning_rate": 0.0002, + "loss": 0.7047, + "step": 3450 + }, + { + "epoch": 4.564643799472296, + "grad_norm": 1.141662836074829, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 3460 + }, + { + "epoch": 4.577836411609499, + "grad_norm": 0.8423182368278503, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 3470 + }, + { + "epoch": 4.591029023746702, + "grad_norm": 0.8024184703826904, + "learning_rate": 0.0002, + "loss": 0.6797, + "step": 3480 + }, + { + "epoch": 4.6042216358839045, + "grad_norm": 0.7703381776809692, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 3490 + }, + { + "epoch": 4.617414248021108, + "grad_norm": 0.9883959293365479, + "learning_rate": 0.0002, + "loss": 0.7001, + "step": 3500 + }, + { + "epoch": 4.630606860158311, + "grad_norm": 0.9554709196090698, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 3510 + }, + { + "epoch": 4.643799472295514, + "grad_norm": 1.9949709177017212, + "learning_rate": 0.0002, + "loss": 0.7378, + "step": 3520 + }, + { + "epoch": 4.6569920844327175, + "grad_norm": 0.7762255072593689, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 3530 + }, + { + "epoch": 4.670184696569921, + "grad_norm": 0.9538425803184509, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 3540 + }, + { + "epoch": 4.683377308707124, + "grad_norm": 1.0279661417007446, + "learning_rate": 0.0002, + "loss": 0.6352, + "step": 3550 + }, + { + "epoch": 4.696569920844327, + "grad_norm": 0.7545472979545593, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 3560 + }, + { + "epoch": 4.7097625329815305, + "grad_norm": 0.8919376730918884, + "learning_rate": 0.0002, + "loss": 0.6887, + "step": 3570 + }, + { + "epoch": 4.722955145118734, + "grad_norm": 0.7621569633483887, + "learning_rate": 0.0002, + "loss": 0.6395, + "step": 3580 + }, + { + "epoch": 4.736147757255937, + "grad_norm": 1.205320119857788, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 3590 + }, + { + "epoch": 4.74934036939314, + "grad_norm": 1.0642725229263306, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 3600 + }, + { + "epoch": 4.762532981530343, + "grad_norm": 0.9402666687965393, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 3610 + }, + { + "epoch": 4.775725593667546, + "grad_norm": 1.254127025604248, + "learning_rate": 0.0002, + "loss": 0.6395, + "step": 3620 + }, + { + "epoch": 4.788918205804749, + "grad_norm": 0.7609598636627197, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 3630 + }, + { + "epoch": 4.802110817941952, + "grad_norm": 0.8240329623222351, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 3640 + }, + { + "epoch": 4.8153034300791555, + "grad_norm": 0.8356260657310486, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 3650 + }, + { + "epoch": 4.828496042216359, + "grad_norm": 0.9130708575248718, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 3660 + }, + { + "epoch": 4.841688654353562, + "grad_norm": 0.9384765028953552, + "learning_rate": 0.0002, + "loss": 0.7269, + "step": 3670 + }, + { + "epoch": 4.854881266490765, + "grad_norm": 0.9829966425895691, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 3680 + }, + { + "epoch": 4.8680738786279685, + "grad_norm": 1.0488632917404175, + "learning_rate": 0.0002, + "loss": 0.6311, + "step": 3690 + }, + { + "epoch": 4.881266490765172, + "grad_norm": 1.2278969287872314, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 3700 + }, + { + "epoch": 4.894459102902375, + "grad_norm": 0.8078970313072205, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 3710 + }, + { + "epoch": 4.907651715039578, + "grad_norm": 0.8081700205802917, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 3720 + }, + { + "epoch": 4.9208443271767806, + "grad_norm": 0.9204511046409607, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 3730 + }, + { + "epoch": 4.934036939313984, + "grad_norm": 0.9326391220092773, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 3740 + }, + { + "epoch": 4.947229551451187, + "grad_norm": 1.0089969635009766, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 3750 + }, + { + "epoch": 4.96042216358839, + "grad_norm": 0.7063466906547546, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 3760 + }, + { + "epoch": 4.9736147757255935, + "grad_norm": 1.2603905200958252, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 3770 + }, + { + "epoch": 4.986807387862797, + "grad_norm": 0.8418653607368469, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 3780 + }, + { + "epoch": 5.0, + "grad_norm": 0.9537181854248047, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 3790 + }, + { + "epoch": 5.0, + "eval_loss": 1.3319307565689087, + "eval_runtime": 71.7836, + "eval_samples_per_second": 6.004, + "eval_steps_per_second": 0.752, + "step": 3790 + } + ], + "logging_steps": 10, + "max_steps": 6064, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.7539271384629248e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..0c0289389bcb2236bf804b0a82c50e67c11a1379 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-3790/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e04722e246164eebe6b4f70dd50fe946c936a2bd25edd75bcedcfd4c7ff9474e +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/README.md b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d58f858ad088d38b21f8be0ed7f58b325b4aba04 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef9f73d5fd3184bc293992ab3c38faced23ba0367cd0768bb566be34c6861705 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6500666be44c7f66e77bb4689343910dd521af2a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d59f1e59b630c190dbe5cd9de87fa7e21cf3cb6803da15bd5e5e5b072a6ff279 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a23ced4bf83760c170d58a0931cff3be8e47dbb4 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:156c1fb26960b62490dc858678bd0413d97eace874f59f020eed53bf882d6f03 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f7faa42c189a4a90022e06e0227738a2582d9113 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f90e6ac64ccfa347b33ab88ae2d4018708fde11ebd0c0fec3bb30eceecfffb7 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fcd18c5fb86be6a181d92fe0c769bcd9e64bbcbf --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/trainer_state.json @@ -0,0 +1,3259 @@ +{ + "best_metric": 1.1874967813491821, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 4548, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013192612137203167, + "grad_norm": 0.7545632123947144, + "learning_rate": 0.0002, + "loss": 1.9466, + "step": 10 + }, + { + "epoch": 0.026385224274406333, + "grad_norm": 0.5787661075592041, + "learning_rate": 0.0002, + "loss": 1.4909, + "step": 20 + }, + { + "epoch": 0.0395778364116095, + "grad_norm": 0.8616093993186951, + "learning_rate": 0.0002, + "loss": 1.3906, + "step": 30 + }, + { + "epoch": 0.052770448548812667, + "grad_norm": 0.42088547348976135, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 40 + }, + { + "epoch": 0.06596306068601583, + "grad_norm": 0.47704678773880005, + "learning_rate": 0.0002, + "loss": 1.3446, + "step": 50 + }, + { + "epoch": 0.079155672823219, + "grad_norm": 0.5763994455337524, + "learning_rate": 0.0002, + "loss": 1.2476, + "step": 60 + }, + { + "epoch": 0.09234828496042216, + "grad_norm": 0.4579846262931824, + "learning_rate": 0.0002, + "loss": 1.2268, + "step": 70 + }, + { + "epoch": 0.10554089709762533, + "grad_norm": 0.46623846888542175, + "learning_rate": 0.0002, + "loss": 1.415, + "step": 80 + }, + { + "epoch": 0.11873350923482849, + "grad_norm": 0.4206956624984741, + "learning_rate": 0.0002, + "loss": 1.2849, + "step": 90 + }, + { + "epoch": 0.13192612137203166, + "grad_norm": 0.41896629333496094, + "learning_rate": 0.0002, + "loss": 1.1608, + "step": 100 + }, + { + "epoch": 0.14511873350923482, + "grad_norm": 0.3459089398384094, + "learning_rate": 0.0002, + "loss": 1.2617, + "step": 110 + }, + { + "epoch": 0.158311345646438, + "grad_norm": 0.4587327837944031, + "learning_rate": 0.0002, + "loss": 1.2858, + "step": 120 + }, + { + "epoch": 0.17150395778364116, + "grad_norm": 0.433525413274765, + "learning_rate": 0.0002, + "loss": 1.2241, + "step": 130 + }, + { + "epoch": 0.18469656992084432, + "grad_norm": 0.39253175258636475, + "learning_rate": 0.0002, + "loss": 1.2269, + "step": 140 + }, + { + "epoch": 0.19788918205804748, + "grad_norm": 0.3602290749549866, + "learning_rate": 0.0002, + "loss": 1.2224, + "step": 150 + }, + { + "epoch": 0.21108179419525067, + "grad_norm": 0.41160839796066284, + "learning_rate": 0.0002, + "loss": 1.2834, + "step": 160 + }, + { + "epoch": 0.22427440633245382, + "grad_norm": 0.7213630080223083, + "learning_rate": 0.0002, + "loss": 1.1986, + "step": 170 + }, + { + "epoch": 0.23746701846965698, + "grad_norm": 0.39086055755615234, + "learning_rate": 0.0002, + "loss": 1.2215, + "step": 180 + }, + { + "epoch": 0.25065963060686014, + "grad_norm": 0.4465520977973938, + "learning_rate": 0.0002, + "loss": 1.3315, + "step": 190 + }, + { + "epoch": 0.2638522427440633, + "grad_norm": 1.814679741859436, + "learning_rate": 0.0002, + "loss": 1.3798, + "step": 200 + }, + { + "epoch": 0.2770448548812665, + "grad_norm": 0.5026423931121826, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 210 + }, + { + "epoch": 0.29023746701846964, + "grad_norm": 0.4156292974948883, + "learning_rate": 0.0002, + "loss": 1.3306, + "step": 220 + }, + { + "epoch": 0.3034300791556728, + "grad_norm": 0.40813493728637695, + "learning_rate": 0.0002, + "loss": 1.266, + "step": 230 + }, + { + "epoch": 0.316622691292876, + "grad_norm": 0.3304787874221802, + "learning_rate": 0.0002, + "loss": 1.1533, + "step": 240 + }, + { + "epoch": 0.32981530343007914, + "grad_norm": 0.46139976382255554, + "learning_rate": 0.0002, + "loss": 1.3154, + "step": 250 + }, + { + "epoch": 0.34300791556728233, + "grad_norm": 0.37518271803855896, + "learning_rate": 0.0002, + "loss": 1.1365, + "step": 260 + }, + { + "epoch": 0.3562005277044855, + "grad_norm": 0.35586467385292053, + "learning_rate": 0.0002, + "loss": 1.2004, + "step": 270 + }, + { + "epoch": 0.36939313984168864, + "grad_norm": 0.32441186904907227, + "learning_rate": 0.0002, + "loss": 1.2609, + "step": 280 + }, + { + "epoch": 0.38258575197889183, + "grad_norm": 0.3198683261871338, + "learning_rate": 0.0002, + "loss": 1.212, + "step": 290 + }, + { + "epoch": 0.39577836411609496, + "grad_norm": 0.33663108944892883, + "learning_rate": 0.0002, + "loss": 1.2465, + "step": 300 + }, + { + "epoch": 0.40897097625329815, + "grad_norm": 0.3711244761943817, + "learning_rate": 0.0002, + "loss": 1.1702, + "step": 310 + }, + { + "epoch": 0.42216358839050133, + "grad_norm": 0.3209651708602905, + "learning_rate": 0.0002, + "loss": 1.0871, + "step": 320 + }, + { + "epoch": 0.43535620052770446, + "grad_norm": 0.5152716040611267, + "learning_rate": 0.0002, + "loss": 1.2728, + "step": 330 + }, + { + "epoch": 0.44854881266490765, + "grad_norm": 0.5431376695632935, + "learning_rate": 0.0002, + "loss": 1.1833, + "step": 340 + }, + { + "epoch": 0.46174142480211083, + "grad_norm": 0.3069997727870941, + "learning_rate": 0.0002, + "loss": 1.1344, + "step": 350 + }, + { + "epoch": 0.47493403693931396, + "grad_norm": 0.34260064363479614, + "learning_rate": 0.0002, + "loss": 1.2742, + "step": 360 + }, + { + "epoch": 0.48812664907651715, + "grad_norm": 0.345653235912323, + "learning_rate": 0.0002, + "loss": 1.3161, + "step": 370 + }, + { + "epoch": 0.5013192612137203, + "grad_norm": 0.46222734451293945, + "learning_rate": 0.0002, + "loss": 1.1552, + "step": 380 + }, + { + "epoch": 0.5145118733509235, + "grad_norm": 0.27301734685897827, + "learning_rate": 0.0002, + "loss": 1.0616, + "step": 390 + }, + { + "epoch": 0.5277044854881267, + "grad_norm": 0.29048439860343933, + "learning_rate": 0.0002, + "loss": 1.1944, + "step": 400 + }, + { + "epoch": 0.5408970976253298, + "grad_norm": 0.32927802205085754, + "learning_rate": 0.0002, + "loss": 1.0956, + "step": 410 + }, + { + "epoch": 0.554089709762533, + "grad_norm": 0.3336397409439087, + "learning_rate": 0.0002, + "loss": 1.2362, + "step": 420 + }, + { + "epoch": 0.5672823218997362, + "grad_norm": 0.4007597267627716, + "learning_rate": 0.0002, + "loss": 1.1445, + "step": 430 + }, + { + "epoch": 0.5804749340369393, + "grad_norm": 0.36144956946372986, + "learning_rate": 0.0002, + "loss": 1.2731, + "step": 440 + }, + { + "epoch": 0.5936675461741425, + "grad_norm": 0.6331009864807129, + "learning_rate": 0.0002, + "loss": 1.1604, + "step": 450 + }, + { + "epoch": 0.6068601583113457, + "grad_norm": 0.41469088196754456, + "learning_rate": 0.0002, + "loss": 1.1692, + "step": 460 + }, + { + "epoch": 0.6200527704485488, + "grad_norm": 0.4388185143470764, + "learning_rate": 0.0002, + "loss": 1.2372, + "step": 470 + }, + { + "epoch": 0.633245382585752, + "grad_norm": 0.3738141655921936, + "learning_rate": 0.0002, + "loss": 1.2541, + "step": 480 + }, + { + "epoch": 0.6464379947229552, + "grad_norm": 0.7212023138999939, + "learning_rate": 0.0002, + "loss": 1.1265, + "step": 490 + }, + { + "epoch": 0.6596306068601583, + "grad_norm": 0.2972351014614105, + "learning_rate": 0.0002, + "loss": 1.2786, + "step": 500 + }, + { + "epoch": 0.6728232189973615, + "grad_norm": 0.45293179154396057, + "learning_rate": 0.0002, + "loss": 1.1739, + "step": 510 + }, + { + "epoch": 0.6860158311345647, + "grad_norm": 0.4319860637187958, + "learning_rate": 0.0002, + "loss": 1.255, + "step": 520 + }, + { + "epoch": 0.6992084432717678, + "grad_norm": 0.3050215542316437, + "learning_rate": 0.0002, + "loss": 1.0959, + "step": 530 + }, + { + "epoch": 0.712401055408971, + "grad_norm": 0.3552611172199249, + "learning_rate": 0.0002, + "loss": 1.1608, + "step": 540 + }, + { + "epoch": 0.7255936675461742, + "grad_norm": 0.3631151020526886, + "learning_rate": 0.0002, + "loss": 1.2546, + "step": 550 + }, + { + "epoch": 0.7387862796833773, + "grad_norm": 0.28177931904792786, + "learning_rate": 0.0002, + "loss": 1.1371, + "step": 560 + }, + { + "epoch": 0.7519788918205804, + "grad_norm": 0.359764039516449, + "learning_rate": 0.0002, + "loss": 1.2081, + "step": 570 + }, + { + "epoch": 0.7651715039577837, + "grad_norm": 0.3970327377319336, + "learning_rate": 0.0002, + "loss": 1.1356, + "step": 580 + }, + { + "epoch": 0.7783641160949868, + "grad_norm": 0.3541001081466675, + "learning_rate": 0.0002, + "loss": 1.309, + "step": 590 + }, + { + "epoch": 0.7915567282321899, + "grad_norm": 0.3478573262691498, + "learning_rate": 0.0002, + "loss": 1.2061, + "step": 600 + }, + { + "epoch": 0.8047493403693932, + "grad_norm": 0.3900321424007416, + "learning_rate": 0.0002, + "loss": 1.1864, + "step": 610 + }, + { + "epoch": 0.8179419525065963, + "grad_norm": 0.3443238437175751, + "learning_rate": 0.0002, + "loss": 1.1358, + "step": 620 + }, + { + "epoch": 0.8311345646437994, + "grad_norm": 0.44238781929016113, + "learning_rate": 0.0002, + "loss": 1.3232, + "step": 630 + }, + { + "epoch": 0.8443271767810027, + "grad_norm": 0.36339467763900757, + "learning_rate": 0.0002, + "loss": 1.1247, + "step": 640 + }, + { + "epoch": 0.8575197889182058, + "grad_norm": 0.6243070363998413, + "learning_rate": 0.0002, + "loss": 1.1049, + "step": 650 + }, + { + "epoch": 0.8707124010554089, + "grad_norm": 0.3209173381328583, + "learning_rate": 0.0002, + "loss": 1.1943, + "step": 660 + }, + { + "epoch": 0.8839050131926122, + "grad_norm": 0.35017991065979004, + "learning_rate": 0.0002, + "loss": 1.1927, + "step": 670 + }, + { + "epoch": 0.8970976253298153, + "grad_norm": 0.3247159421443939, + "learning_rate": 0.0002, + "loss": 1.1603, + "step": 680 + }, + { + "epoch": 0.9102902374670184, + "grad_norm": 0.4091894030570984, + "learning_rate": 0.0002, + "loss": 1.2298, + "step": 690 + }, + { + "epoch": 0.9234828496042217, + "grad_norm": 0.3975585997104645, + "learning_rate": 0.0002, + "loss": 1.2756, + "step": 700 + }, + { + "epoch": 0.9366754617414248, + "grad_norm": 0.3666245937347412, + "learning_rate": 0.0002, + "loss": 1.281, + "step": 710 + }, + { + "epoch": 0.9498680738786279, + "grad_norm": 0.45216917991638184, + "learning_rate": 0.0002, + "loss": 1.2855, + "step": 720 + }, + { + "epoch": 0.9630606860158312, + "grad_norm": 0.36108118295669556, + "learning_rate": 0.0002, + "loss": 1.2217, + "step": 730 + }, + { + "epoch": 0.9762532981530343, + "grad_norm": 0.44550251960754395, + "learning_rate": 0.0002, + "loss": 1.17, + "step": 740 + }, + { + "epoch": 0.9894459102902374, + "grad_norm": 0.29801255464553833, + "learning_rate": 0.0002, + "loss": 1.0672, + "step": 750 + }, + { + "epoch": 1.0, + "eval_loss": 1.203244686126709, + "eval_runtime": 76.0457, + "eval_samples_per_second": 5.668, + "eval_steps_per_second": 0.71, + "step": 758 + }, + { + "epoch": 1.0026385224274406, + "grad_norm": 0.4096551239490509, + "learning_rate": 0.0002, + "loss": 1.0748, + "step": 760 + }, + { + "epoch": 1.0158311345646438, + "grad_norm": 0.2649582326412201, + "learning_rate": 0.0002, + "loss": 1.1537, + "step": 770 + }, + { + "epoch": 1.029023746701847, + "grad_norm": 0.3100722134113312, + "learning_rate": 0.0002, + "loss": 1.1676, + "step": 780 + }, + { + "epoch": 1.04221635883905, + "grad_norm": 0.3911755383014679, + "learning_rate": 0.0002, + "loss": 1.1736, + "step": 790 + }, + { + "epoch": 1.0554089709762533, + "grad_norm": 0.4600953757762909, + "learning_rate": 0.0002, + "loss": 1.0354, + "step": 800 + }, + { + "epoch": 1.0686015831134565, + "grad_norm": 0.28671619296073914, + "learning_rate": 0.0002, + "loss": 1.2869, + "step": 810 + }, + { + "epoch": 1.0817941952506596, + "grad_norm": 0.47282642126083374, + "learning_rate": 0.0002, + "loss": 1.1441, + "step": 820 + }, + { + "epoch": 1.0949868073878628, + "grad_norm": 0.690073549747467, + "learning_rate": 0.0002, + "loss": 1.1368, + "step": 830 + }, + { + "epoch": 1.108179419525066, + "grad_norm": 0.7317902445793152, + "learning_rate": 0.0002, + "loss": 0.9944, + "step": 840 + }, + { + "epoch": 1.121372031662269, + "grad_norm": 0.44215938448905945, + "learning_rate": 0.0002, + "loss": 1.108, + "step": 850 + }, + { + "epoch": 1.1345646437994723, + "grad_norm": 0.33875149488449097, + "learning_rate": 0.0002, + "loss": 1.0558, + "step": 860 + }, + { + "epoch": 1.1477572559366755, + "grad_norm": 0.3700002431869507, + "learning_rate": 0.0002, + "loss": 1.0471, + "step": 870 + }, + { + "epoch": 1.1609498680738786, + "grad_norm": 0.41173291206359863, + "learning_rate": 0.0002, + "loss": 1.0121, + "step": 880 + }, + { + "epoch": 1.1741424802110818, + "grad_norm": 0.5253589749336243, + "learning_rate": 0.0002, + "loss": 1.0312, + "step": 890 + }, + { + "epoch": 1.187335092348285, + "grad_norm": 0.3912237286567688, + "learning_rate": 0.0002, + "loss": 1.2238, + "step": 900 + }, + { + "epoch": 1.200527704485488, + "grad_norm": 0.40990331768989563, + "learning_rate": 0.0002, + "loss": 0.9434, + "step": 910 + }, + { + "epoch": 1.2137203166226913, + "grad_norm": 0.40377968549728394, + "learning_rate": 0.0002, + "loss": 1.0493, + "step": 920 + }, + { + "epoch": 1.2269129287598945, + "grad_norm": 0.4605846405029297, + "learning_rate": 0.0002, + "loss": 1.1496, + "step": 930 + }, + { + "epoch": 1.2401055408970976, + "grad_norm": 0.31564897298812866, + "learning_rate": 0.0002, + "loss": 1.1173, + "step": 940 + }, + { + "epoch": 1.2532981530343008, + "grad_norm": 0.39808550477027893, + "learning_rate": 0.0002, + "loss": 1.0547, + "step": 950 + }, + { + "epoch": 1.266490765171504, + "grad_norm": 0.3762115240097046, + "learning_rate": 0.0002, + "loss": 1.1367, + "step": 960 + }, + { + "epoch": 1.279683377308707, + "grad_norm": 0.4174984097480774, + "learning_rate": 0.0002, + "loss": 1.1596, + "step": 970 + }, + { + "epoch": 1.2928759894459103, + "grad_norm": 0.5263054966926575, + "learning_rate": 0.0002, + "loss": 1.1327, + "step": 980 + }, + { + "epoch": 1.3060686015831133, + "grad_norm": 0.41673699021339417, + "learning_rate": 0.0002, + "loss": 1.0339, + "step": 990 + }, + { + "epoch": 1.3192612137203166, + "grad_norm": 0.9613684415817261, + "learning_rate": 0.0002, + "loss": 1.1198, + "step": 1000 + }, + { + "epoch": 1.3324538258575198, + "grad_norm": 0.3690216839313507, + "learning_rate": 0.0002, + "loss": 1.0444, + "step": 1010 + }, + { + "epoch": 1.345646437994723, + "grad_norm": 0.521821141242981, + "learning_rate": 0.0002, + "loss": 1.0473, + "step": 1020 + }, + { + "epoch": 1.358839050131926, + "grad_norm": 0.3353094160556793, + "learning_rate": 0.0002, + "loss": 1.1065, + "step": 1030 + }, + { + "epoch": 1.3720316622691293, + "grad_norm": 0.3843843936920166, + "learning_rate": 0.0002, + "loss": 1.1286, + "step": 1040 + }, + { + "epoch": 1.3852242744063323, + "grad_norm": 0.372514545917511, + "learning_rate": 0.0002, + "loss": 1.1369, + "step": 1050 + }, + { + "epoch": 1.3984168865435356, + "grad_norm": 0.34537771344184875, + "learning_rate": 0.0002, + "loss": 1.0041, + "step": 1060 + }, + { + "epoch": 1.4116094986807388, + "grad_norm": 0.45349085330963135, + "learning_rate": 0.0002, + "loss": 1.0251, + "step": 1070 + }, + { + "epoch": 1.424802110817942, + "grad_norm": 0.5120177268981934, + "learning_rate": 0.0002, + "loss": 1.1873, + "step": 1080 + }, + { + "epoch": 1.437994722955145, + "grad_norm": 0.42800238728523254, + "learning_rate": 0.0002, + "loss": 1.0872, + "step": 1090 + }, + { + "epoch": 1.4511873350923483, + "grad_norm": 0.343832790851593, + "learning_rate": 0.0002, + "loss": 1.0734, + "step": 1100 + }, + { + "epoch": 1.4643799472295513, + "grad_norm": 0.3829841911792755, + "learning_rate": 0.0002, + "loss": 1.1286, + "step": 1110 + }, + { + "epoch": 1.4775725593667546, + "grad_norm": 0.4289931058883667, + "learning_rate": 0.0002, + "loss": 1.1268, + "step": 1120 + }, + { + "epoch": 1.4907651715039578, + "grad_norm": 0.42750850319862366, + "learning_rate": 0.0002, + "loss": 1.0676, + "step": 1130 + }, + { + "epoch": 1.503957783641161, + "grad_norm": 0.34328413009643555, + "learning_rate": 0.0002, + "loss": 1.072, + "step": 1140 + }, + { + "epoch": 1.517150395778364, + "grad_norm": 0.349096417427063, + "learning_rate": 0.0002, + "loss": 1.0863, + "step": 1150 + }, + { + "epoch": 1.5303430079155673, + "grad_norm": 0.7700717449188232, + "learning_rate": 0.0002, + "loss": 1.1462, + "step": 1160 + }, + { + "epoch": 1.5435356200527703, + "grad_norm": 0.39294949173927307, + "learning_rate": 0.0002, + "loss": 0.9528, + "step": 1170 + }, + { + "epoch": 1.5567282321899736, + "grad_norm": 0.36173608899116516, + "learning_rate": 0.0002, + "loss": 1.1603, + "step": 1180 + }, + { + "epoch": 1.5699208443271768, + "grad_norm": 0.6034277677536011, + "learning_rate": 0.0002, + "loss": 1.1508, + "step": 1190 + }, + { + "epoch": 1.58311345646438, + "grad_norm": 0.36694103479385376, + "learning_rate": 0.0002, + "loss": 1.0105, + "step": 1200 + }, + { + "epoch": 1.596306068601583, + "grad_norm": 0.4727209508419037, + "learning_rate": 0.0002, + "loss": 1.1479, + "step": 1210 + }, + { + "epoch": 1.6094986807387863, + "grad_norm": 0.6482883095741272, + "learning_rate": 0.0002, + "loss": 1.0689, + "step": 1220 + }, + { + "epoch": 1.6226912928759893, + "grad_norm": 0.5238035917282104, + "learning_rate": 0.0002, + "loss": 1.1405, + "step": 1230 + }, + { + "epoch": 1.6358839050131926, + "grad_norm": 0.4812222421169281, + "learning_rate": 0.0002, + "loss": 1.0596, + "step": 1240 + }, + { + "epoch": 1.6490765171503958, + "grad_norm": 0.7131702303886414, + "learning_rate": 0.0002, + "loss": 1.2729, + "step": 1250 + }, + { + "epoch": 1.662269129287599, + "grad_norm": 0.3803327977657318, + "learning_rate": 0.0002, + "loss": 0.9832, + "step": 1260 + }, + { + "epoch": 1.675461741424802, + "grad_norm": 0.3745088577270508, + "learning_rate": 0.0002, + "loss": 1.1433, + "step": 1270 + }, + { + "epoch": 1.6886543535620053, + "grad_norm": 0.4427378475666046, + "learning_rate": 0.0002, + "loss": 1.1018, + "step": 1280 + }, + { + "epoch": 1.7018469656992083, + "grad_norm": 0.797478973865509, + "learning_rate": 0.0002, + "loss": 1.0619, + "step": 1290 + }, + { + "epoch": 1.7150395778364116, + "grad_norm": 0.503620982170105, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 1300 + }, + { + "epoch": 1.7282321899736148, + "grad_norm": 0.4132426381111145, + "learning_rate": 0.0002, + "loss": 1.1324, + "step": 1310 + }, + { + "epoch": 1.741424802110818, + "grad_norm": 0.41811656951904297, + "learning_rate": 0.0002, + "loss": 1.172, + "step": 1320 + }, + { + "epoch": 1.754617414248021, + "grad_norm": 0.40647849440574646, + "learning_rate": 0.0002, + "loss": 1.0903, + "step": 1330 + }, + { + "epoch": 1.767810026385224, + "grad_norm": 0.42138347029685974, + "learning_rate": 0.0002, + "loss": 1.1316, + "step": 1340 + }, + { + "epoch": 1.7810026385224274, + "grad_norm": 0.46523579955101013, + "learning_rate": 0.0002, + "loss": 0.9955, + "step": 1350 + }, + { + "epoch": 1.7941952506596306, + "grad_norm": 0.39760419726371765, + "learning_rate": 0.0002, + "loss": 1.0307, + "step": 1360 + }, + { + "epoch": 1.8073878627968338, + "grad_norm": 0.37993717193603516, + "learning_rate": 0.0002, + "loss": 1.1218, + "step": 1370 + }, + { + "epoch": 1.820580474934037, + "grad_norm": 0.5404181480407715, + "learning_rate": 0.0002, + "loss": 1.0921, + "step": 1380 + }, + { + "epoch": 1.83377308707124, + "grad_norm": 0.4385245740413666, + "learning_rate": 0.0002, + "loss": 1.0903, + "step": 1390 + }, + { + "epoch": 1.8469656992084431, + "grad_norm": 0.529797375202179, + "learning_rate": 0.0002, + "loss": 1.1175, + "step": 1400 + }, + { + "epoch": 1.8601583113456464, + "grad_norm": 0.481567919254303, + "learning_rate": 0.0002, + "loss": 1.0577, + "step": 1410 + }, + { + "epoch": 1.8733509234828496, + "grad_norm": 0.34787362813949585, + "learning_rate": 0.0002, + "loss": 1.0581, + "step": 1420 + }, + { + "epoch": 1.8865435356200528, + "grad_norm": 0.6402362585067749, + "learning_rate": 0.0002, + "loss": 1.0536, + "step": 1430 + }, + { + "epoch": 1.899736147757256, + "grad_norm": 0.3461322784423828, + "learning_rate": 0.0002, + "loss": 1.0787, + "step": 1440 + }, + { + "epoch": 1.912928759894459, + "grad_norm": 0.44005653262138367, + "learning_rate": 0.0002, + "loss": 1.0925, + "step": 1450 + }, + { + "epoch": 1.9261213720316621, + "grad_norm": 0.4064280688762665, + "learning_rate": 0.0002, + "loss": 1.0414, + "step": 1460 + }, + { + "epoch": 1.9393139841688654, + "grad_norm": 0.5236523151397705, + "learning_rate": 0.0002, + "loss": 1.0608, + "step": 1470 + }, + { + "epoch": 1.9525065963060686, + "grad_norm": 0.41030219197273254, + "learning_rate": 0.0002, + "loss": 1.0572, + "step": 1480 + }, + { + "epoch": 1.9656992084432718, + "grad_norm": 0.39805835485458374, + "learning_rate": 0.0002, + "loss": 1.1204, + "step": 1490 + }, + { + "epoch": 1.978891820580475, + "grad_norm": 0.42974501848220825, + "learning_rate": 0.0002, + "loss": 1.0364, + "step": 1500 + }, + { + "epoch": 1.992084432717678, + "grad_norm": 0.4688243865966797, + "learning_rate": 0.0002, + "loss": 1.1169, + "step": 1510 + }, + { + "epoch": 2.0, + "eval_loss": 1.1874967813491821, + "eval_runtime": 71.9523, + "eval_samples_per_second": 5.99, + "eval_steps_per_second": 0.75, + "step": 1516 + }, + { + "epoch": 2.005277044854881, + "grad_norm": 0.4121631383895874, + "learning_rate": 0.0002, + "loss": 1.0401, + "step": 1520 + }, + { + "epoch": 2.0184696569920844, + "grad_norm": 0.4844197928905487, + "learning_rate": 0.0002, + "loss": 0.9384, + "step": 1530 + }, + { + "epoch": 2.0316622691292876, + "grad_norm": 0.45408546924591064, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 1540 + }, + { + "epoch": 2.044854881266491, + "grad_norm": 0.48662951588630676, + "learning_rate": 0.0002, + "loss": 1.0399, + "step": 1550 + }, + { + "epoch": 2.058047493403694, + "grad_norm": 0.7195899486541748, + "learning_rate": 0.0002, + "loss": 0.936, + "step": 1560 + }, + { + "epoch": 2.0712401055408973, + "grad_norm": 0.5071077346801758, + "learning_rate": 0.0002, + "loss": 0.9486, + "step": 1570 + }, + { + "epoch": 2.0844327176781, + "grad_norm": 0.7473958730697632, + "learning_rate": 0.0002, + "loss": 1.0055, + "step": 1580 + }, + { + "epoch": 2.0976253298153034, + "grad_norm": 0.5509232878684998, + "learning_rate": 0.0002, + "loss": 0.8309, + "step": 1590 + }, + { + "epoch": 2.1108179419525066, + "grad_norm": 0.5108042359352112, + "learning_rate": 0.0002, + "loss": 0.9181, + "step": 1600 + }, + { + "epoch": 2.12401055408971, + "grad_norm": 0.42331448197364807, + "learning_rate": 0.0002, + "loss": 0.9499, + "step": 1610 + }, + { + "epoch": 2.137203166226913, + "grad_norm": 0.46621623635292053, + "learning_rate": 0.0002, + "loss": 0.9359, + "step": 1620 + }, + { + "epoch": 2.150395778364116, + "grad_norm": 0.43802836537361145, + "learning_rate": 0.0002, + "loss": 0.9065, + "step": 1630 + }, + { + "epoch": 2.163588390501319, + "grad_norm": 0.49908021092414856, + "learning_rate": 0.0002, + "loss": 0.9375, + "step": 1640 + }, + { + "epoch": 2.1767810026385224, + "grad_norm": 0.4195636808872223, + "learning_rate": 0.0002, + "loss": 1.0389, + "step": 1650 + }, + { + "epoch": 2.1899736147757256, + "grad_norm": 0.49515822529792786, + "learning_rate": 0.0002, + "loss": 0.8501, + "step": 1660 + }, + { + "epoch": 2.203166226912929, + "grad_norm": 0.4607589542865753, + "learning_rate": 0.0002, + "loss": 1.0557, + "step": 1670 + }, + { + "epoch": 2.216358839050132, + "grad_norm": 0.4489196836948395, + "learning_rate": 0.0002, + "loss": 1.0206, + "step": 1680 + }, + { + "epoch": 2.229551451187335, + "grad_norm": 0.49300864338874817, + "learning_rate": 0.0002, + "loss": 0.9657, + "step": 1690 + }, + { + "epoch": 2.242744063324538, + "grad_norm": 0.6624954342842102, + "learning_rate": 0.0002, + "loss": 0.8371, + "step": 1700 + }, + { + "epoch": 2.2559366754617414, + "grad_norm": 0.8391500115394592, + "learning_rate": 0.0002, + "loss": 0.8555, + "step": 1710 + }, + { + "epoch": 2.2691292875989446, + "grad_norm": 0.5193073749542236, + "learning_rate": 0.0002, + "loss": 1.0113, + "step": 1720 + }, + { + "epoch": 2.282321899736148, + "grad_norm": 0.6180613040924072, + "learning_rate": 0.0002, + "loss": 0.9979, + "step": 1730 + }, + { + "epoch": 2.295514511873351, + "grad_norm": 0.591191291809082, + "learning_rate": 0.0002, + "loss": 0.9579, + "step": 1740 + }, + { + "epoch": 2.308707124010554, + "grad_norm": 0.546897828578949, + "learning_rate": 0.0002, + "loss": 0.8879, + "step": 1750 + }, + { + "epoch": 2.321899736147757, + "grad_norm": 0.5470401644706726, + "learning_rate": 0.0002, + "loss": 0.9321, + "step": 1760 + }, + { + "epoch": 2.3350923482849604, + "grad_norm": 0.4590282738208771, + "learning_rate": 0.0002, + "loss": 0.9104, + "step": 1770 + }, + { + "epoch": 2.3482849604221636, + "grad_norm": 0.622164785861969, + "learning_rate": 0.0002, + "loss": 1.0384, + "step": 1780 + }, + { + "epoch": 2.361477572559367, + "grad_norm": 0.5753812193870544, + "learning_rate": 0.0002, + "loss": 1.0297, + "step": 1790 + }, + { + "epoch": 2.37467018469657, + "grad_norm": 0.47958624362945557, + "learning_rate": 0.0002, + "loss": 0.9569, + "step": 1800 + }, + { + "epoch": 2.387862796833773, + "grad_norm": 0.48042672872543335, + "learning_rate": 0.0002, + "loss": 1.0519, + "step": 1810 + }, + { + "epoch": 2.401055408970976, + "grad_norm": 0.44586366415023804, + "learning_rate": 0.0002, + "loss": 0.9042, + "step": 1820 + }, + { + "epoch": 2.4142480211081794, + "grad_norm": 0.7239416837692261, + "learning_rate": 0.0002, + "loss": 0.9783, + "step": 1830 + }, + { + "epoch": 2.4274406332453826, + "grad_norm": 0.5515341758728027, + "learning_rate": 0.0002, + "loss": 0.8818, + "step": 1840 + }, + { + "epoch": 2.440633245382586, + "grad_norm": 0.6280064582824707, + "learning_rate": 0.0002, + "loss": 0.9503, + "step": 1850 + }, + { + "epoch": 2.453825857519789, + "grad_norm": 0.4832057058811188, + "learning_rate": 0.0002, + "loss": 0.8943, + "step": 1860 + }, + { + "epoch": 2.467018469656992, + "grad_norm": 0.5789321064949036, + "learning_rate": 0.0002, + "loss": 0.8744, + "step": 1870 + }, + { + "epoch": 2.480211081794195, + "grad_norm": 0.48491886258125305, + "learning_rate": 0.0002, + "loss": 0.9332, + "step": 1880 + }, + { + "epoch": 2.4934036939313984, + "grad_norm": 0.532365620136261, + "learning_rate": 0.0002, + "loss": 0.943, + "step": 1890 + }, + { + "epoch": 2.5065963060686016, + "grad_norm": 0.7087852954864502, + "learning_rate": 0.0002, + "loss": 0.9734, + "step": 1900 + }, + { + "epoch": 2.519788918205805, + "grad_norm": 0.48157402873039246, + "learning_rate": 0.0002, + "loss": 0.9767, + "step": 1910 + }, + { + "epoch": 2.532981530343008, + "grad_norm": 0.5886041522026062, + "learning_rate": 0.0002, + "loss": 0.9851, + "step": 1920 + }, + { + "epoch": 2.5461741424802113, + "grad_norm": 0.6332622766494751, + "learning_rate": 0.0002, + "loss": 1.0144, + "step": 1930 + }, + { + "epoch": 2.559366754617414, + "grad_norm": 0.5463117957115173, + "learning_rate": 0.0002, + "loss": 0.9516, + "step": 1940 + }, + { + "epoch": 2.5725593667546174, + "grad_norm": 0.5432228446006775, + "learning_rate": 0.0002, + "loss": 0.9373, + "step": 1950 + }, + { + "epoch": 2.5857519788918206, + "grad_norm": 0.5929186940193176, + "learning_rate": 0.0002, + "loss": 0.8974, + "step": 1960 + }, + { + "epoch": 2.598944591029024, + "grad_norm": 0.5120641589164734, + "learning_rate": 0.0002, + "loss": 1.0062, + "step": 1970 + }, + { + "epoch": 2.6121372031662267, + "grad_norm": 0.5372339487075806, + "learning_rate": 0.0002, + "loss": 0.9143, + "step": 1980 + }, + { + "epoch": 2.62532981530343, + "grad_norm": 0.5519838929176331, + "learning_rate": 0.0002, + "loss": 1.0786, + "step": 1990 + }, + { + "epoch": 2.638522427440633, + "grad_norm": 0.7304037809371948, + "learning_rate": 0.0002, + "loss": 1.021, + "step": 2000 + }, + { + "epoch": 2.6517150395778364, + "grad_norm": 0.6182340979576111, + "learning_rate": 0.0002, + "loss": 0.8708, + "step": 2010 + }, + { + "epoch": 2.6649076517150396, + "grad_norm": 0.4874444305896759, + "learning_rate": 0.0002, + "loss": 1.0501, + "step": 2020 + }, + { + "epoch": 2.678100263852243, + "grad_norm": 0.5850239396095276, + "learning_rate": 0.0002, + "loss": 0.9612, + "step": 2030 + }, + { + "epoch": 2.691292875989446, + "grad_norm": 0.6495311856269836, + "learning_rate": 0.0002, + "loss": 0.9808, + "step": 2040 + }, + { + "epoch": 2.7044854881266494, + "grad_norm": 1.002830147743225, + "learning_rate": 0.0002, + "loss": 1.0187, + "step": 2050 + }, + { + "epoch": 2.717678100263852, + "grad_norm": 0.49076753854751587, + "learning_rate": 0.0002, + "loss": 1.0386, + "step": 2060 + }, + { + "epoch": 2.7308707124010554, + "grad_norm": 0.4736326336860657, + "learning_rate": 0.0002, + "loss": 0.9416, + "step": 2070 + }, + { + "epoch": 2.7440633245382586, + "grad_norm": 0.5527601838111877, + "learning_rate": 0.0002, + "loss": 0.8424, + "step": 2080 + }, + { + "epoch": 2.757255936675462, + "grad_norm": 0.7295718193054199, + "learning_rate": 0.0002, + "loss": 0.9149, + "step": 2090 + }, + { + "epoch": 2.7704485488126647, + "grad_norm": 0.5437536835670471, + "learning_rate": 0.0002, + "loss": 0.9032, + "step": 2100 + }, + { + "epoch": 2.783641160949868, + "grad_norm": 0.5997128486633301, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 2110 + }, + { + "epoch": 2.796833773087071, + "grad_norm": 0.6498191356658936, + "learning_rate": 0.0002, + "loss": 0.976, + "step": 2120 + }, + { + "epoch": 2.8100263852242744, + "grad_norm": 0.5237268805503845, + "learning_rate": 0.0002, + "loss": 0.9543, + "step": 2130 + }, + { + "epoch": 2.8232189973614776, + "grad_norm": 0.6033027172088623, + "learning_rate": 0.0002, + "loss": 0.9302, + "step": 2140 + }, + { + "epoch": 2.836411609498681, + "grad_norm": 0.6077138781547546, + "learning_rate": 0.0002, + "loss": 0.9625, + "step": 2150 + }, + { + "epoch": 2.849604221635884, + "grad_norm": 0.4127797484397888, + "learning_rate": 0.0002, + "loss": 0.9347, + "step": 2160 + }, + { + "epoch": 2.862796833773087, + "grad_norm": 0.8448635339736938, + "learning_rate": 0.0002, + "loss": 1.0459, + "step": 2170 + }, + { + "epoch": 2.87598944591029, + "grad_norm": 0.5669729113578796, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 2180 + }, + { + "epoch": 2.8891820580474934, + "grad_norm": 0.510231077671051, + "learning_rate": 0.0002, + "loss": 0.8555, + "step": 2190 + }, + { + "epoch": 2.9023746701846966, + "grad_norm": 0.8072245121002197, + "learning_rate": 0.0002, + "loss": 0.9267, + "step": 2200 + }, + { + "epoch": 2.9155672823219, + "grad_norm": 0.6055923104286194, + "learning_rate": 0.0002, + "loss": 0.9685, + "step": 2210 + }, + { + "epoch": 2.9287598944591027, + "grad_norm": 0.7384416460990906, + "learning_rate": 0.0002, + "loss": 0.9157, + "step": 2220 + }, + { + "epoch": 2.941952506596306, + "grad_norm": 0.4922751784324646, + "learning_rate": 0.0002, + "loss": 0.9712, + "step": 2230 + }, + { + "epoch": 2.955145118733509, + "grad_norm": 0.6039906740188599, + "learning_rate": 0.0002, + "loss": 0.9487, + "step": 2240 + }, + { + "epoch": 2.9683377308707124, + "grad_norm": 0.4751701354980469, + "learning_rate": 0.0002, + "loss": 0.9776, + "step": 2250 + }, + { + "epoch": 2.9815303430079156, + "grad_norm": 0.5698353052139282, + "learning_rate": 0.0002, + "loss": 1.0619, + "step": 2260 + }, + { + "epoch": 2.994722955145119, + "grad_norm": 0.893563449382782, + "learning_rate": 0.0002, + "loss": 1.1184, + "step": 2270 + }, + { + "epoch": 3.0, + "eval_loss": 1.2046419382095337, + "eval_runtime": 71.5992, + "eval_samples_per_second": 6.02, + "eval_steps_per_second": 0.754, + "step": 2274 + }, + { + "epoch": 3.007915567282322, + "grad_norm": 0.41119325160980225, + "learning_rate": 0.0002, + "loss": 0.8269, + "step": 2280 + }, + { + "epoch": 3.021108179419525, + "grad_norm": 0.8169420957565308, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 2290 + }, + { + "epoch": 3.034300791556728, + "grad_norm": 0.6033818125724792, + "learning_rate": 0.0002, + "loss": 0.794, + "step": 2300 + }, + { + "epoch": 3.0474934036939314, + "grad_norm": 0.9600058197975159, + "learning_rate": 0.0002, + "loss": 0.7607, + "step": 2310 + }, + { + "epoch": 3.0606860158311346, + "grad_norm": 0.5859250426292419, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 2320 + }, + { + "epoch": 3.073878627968338, + "grad_norm": 0.6758618950843811, + "learning_rate": 0.0002, + "loss": 0.7598, + "step": 2330 + }, + { + "epoch": 3.0870712401055407, + "grad_norm": 0.8407140970230103, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 2340 + }, + { + "epoch": 3.100263852242744, + "grad_norm": 0.767779529094696, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 2350 + }, + { + "epoch": 3.113456464379947, + "grad_norm": 0.5572896599769592, + "learning_rate": 0.0002, + "loss": 0.7121, + "step": 2360 + }, + { + "epoch": 3.1266490765171504, + "grad_norm": 0.5908368825912476, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 2370 + }, + { + "epoch": 3.1398416886543536, + "grad_norm": 0.8047826290130615, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 2380 + }, + { + "epoch": 3.153034300791557, + "grad_norm": 0.8041718006134033, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 2390 + }, + { + "epoch": 3.16622691292876, + "grad_norm": 0.57078617811203, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 2400 + }, + { + "epoch": 3.179419525065963, + "grad_norm": 0.5125322937965393, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 2410 + }, + { + "epoch": 3.192612137203166, + "grad_norm": 0.6356934309005737, + "learning_rate": 0.0002, + "loss": 0.741, + "step": 2420 + }, + { + "epoch": 3.2058047493403694, + "grad_norm": 1.0129680633544922, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 2430 + }, + { + "epoch": 3.2189973614775726, + "grad_norm": 0.8104226589202881, + "learning_rate": 0.0002, + "loss": 0.8316, + "step": 2440 + }, + { + "epoch": 3.232189973614776, + "grad_norm": 0.7276079058647156, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 2450 + }, + { + "epoch": 3.2453825857519787, + "grad_norm": 0.9753884077072144, + "learning_rate": 0.0002, + "loss": 0.8183, + "step": 2460 + }, + { + "epoch": 3.258575197889182, + "grad_norm": 0.9753183722496033, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 2470 + }, + { + "epoch": 3.271767810026385, + "grad_norm": 0.6791225075721741, + "learning_rate": 0.0002, + "loss": 0.8815, + "step": 2480 + }, + { + "epoch": 3.2849604221635884, + "grad_norm": 0.6797150373458862, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 2490 + }, + { + "epoch": 3.2981530343007917, + "grad_norm": 0.8107194900512695, + "learning_rate": 0.0002, + "loss": 0.8395, + "step": 2500 + }, + { + "epoch": 3.311345646437995, + "grad_norm": 0.5878375172615051, + "learning_rate": 0.0002, + "loss": 0.7869, + "step": 2510 + }, + { + "epoch": 3.324538258575198, + "grad_norm": 0.5882975459098816, + "learning_rate": 0.0002, + "loss": 0.7992, + "step": 2520 + }, + { + "epoch": 3.337730870712401, + "grad_norm": 0.6180013418197632, + "learning_rate": 0.0002, + "loss": 0.7472, + "step": 2530 + }, + { + "epoch": 3.350923482849604, + "grad_norm": 1.0008151531219482, + "learning_rate": 0.0002, + "loss": 0.8033, + "step": 2540 + }, + { + "epoch": 3.3641160949868074, + "grad_norm": 0.6404656767845154, + "learning_rate": 0.0002, + "loss": 0.8464, + "step": 2550 + }, + { + "epoch": 3.3773087071240107, + "grad_norm": 0.8481354117393494, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 2560 + }, + { + "epoch": 3.390501319261214, + "grad_norm": 0.8068035244941711, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 2570 + }, + { + "epoch": 3.4036939313984167, + "grad_norm": 0.7477166056632996, + "learning_rate": 0.0002, + "loss": 0.8621, + "step": 2580 + }, + { + "epoch": 3.41688654353562, + "grad_norm": 0.6202635765075684, + "learning_rate": 0.0002, + "loss": 0.8352, + "step": 2590 + }, + { + "epoch": 3.430079155672823, + "grad_norm": 0.6981159448623657, + "learning_rate": 0.0002, + "loss": 0.7572, + "step": 2600 + }, + { + "epoch": 3.4432717678100264, + "grad_norm": 0.6611084342002869, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 2610 + }, + { + "epoch": 3.4564643799472297, + "grad_norm": 0.5727696418762207, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 2620 + }, + { + "epoch": 3.469656992084433, + "grad_norm": 1.2354545593261719, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 2630 + }, + { + "epoch": 3.4828496042216357, + "grad_norm": 0.6347638368606567, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 2640 + }, + { + "epoch": 3.496042216358839, + "grad_norm": 0.6975704431533813, + "learning_rate": 0.0002, + "loss": 0.8426, + "step": 2650 + }, + { + "epoch": 3.509234828496042, + "grad_norm": 0.6569573879241943, + "learning_rate": 0.0002, + "loss": 0.8773, + "step": 2660 + }, + { + "epoch": 3.5224274406332454, + "grad_norm": 0.6979609131813049, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 2670 + }, + { + "epoch": 3.5356200527704487, + "grad_norm": 0.6287988424301147, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 2680 + }, + { + "epoch": 3.5488126649076515, + "grad_norm": 0.8682637214660645, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 2690 + }, + { + "epoch": 3.5620052770448547, + "grad_norm": 0.7062831521034241, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 2700 + }, + { + "epoch": 3.575197889182058, + "grad_norm": 1.0061452388763428, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 2710 + }, + { + "epoch": 3.588390501319261, + "grad_norm": 0.719097375869751, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 2720 + }, + { + "epoch": 3.6015831134564644, + "grad_norm": 0.7583496570587158, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 2730 + }, + { + "epoch": 3.6147757255936677, + "grad_norm": 0.7543531060218811, + "learning_rate": 0.0002, + "loss": 0.91, + "step": 2740 + }, + { + "epoch": 3.627968337730871, + "grad_norm": 0.8873646855354309, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 2750 + }, + { + "epoch": 3.641160949868074, + "grad_norm": 1.0657562017440796, + "learning_rate": 0.0002, + "loss": 0.7116, + "step": 2760 + }, + { + "epoch": 3.654353562005277, + "grad_norm": 0.8641113638877869, + "learning_rate": 0.0002, + "loss": 0.8291, + "step": 2770 + }, + { + "epoch": 3.66754617414248, + "grad_norm": 0.6620645523071289, + "learning_rate": 0.0002, + "loss": 0.8302, + "step": 2780 + }, + { + "epoch": 3.6807387862796834, + "grad_norm": 0.6919541954994202, + "learning_rate": 0.0002, + "loss": 0.8261, + "step": 2790 + }, + { + "epoch": 3.6939313984168867, + "grad_norm": 0.7305743098258972, + "learning_rate": 0.0002, + "loss": 0.8388, + "step": 2800 + }, + { + "epoch": 3.7071240105540895, + "grad_norm": 0.7464777827262878, + "learning_rate": 0.0002, + "loss": 0.8053, + "step": 2810 + }, + { + "epoch": 3.7203166226912927, + "grad_norm": 0.8067063093185425, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 2820 + }, + { + "epoch": 3.733509234828496, + "grad_norm": 0.7789416313171387, + "learning_rate": 0.0002, + "loss": 0.8259, + "step": 2830 + }, + { + "epoch": 3.746701846965699, + "grad_norm": 0.507529079914093, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 2840 + }, + { + "epoch": 3.7598944591029024, + "grad_norm": 0.6509260535240173, + "learning_rate": 0.0002, + "loss": 0.832, + "step": 2850 + }, + { + "epoch": 3.7730870712401057, + "grad_norm": 0.9141367673873901, + "learning_rate": 0.0002, + "loss": 0.8257, + "step": 2860 + }, + { + "epoch": 3.786279683377309, + "grad_norm": 0.7852635979652405, + "learning_rate": 0.0002, + "loss": 0.9436, + "step": 2870 + }, + { + "epoch": 3.7994722955145117, + "grad_norm": 0.5340318083763123, + "learning_rate": 0.0002, + "loss": 0.8842, + "step": 2880 + }, + { + "epoch": 3.812664907651715, + "grad_norm": 0.6246042847633362, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 2890 + }, + { + "epoch": 3.825857519788918, + "grad_norm": 0.7064066529273987, + "learning_rate": 0.0002, + "loss": 0.8184, + "step": 2900 + }, + { + "epoch": 3.8390501319261214, + "grad_norm": 0.6144065856933594, + "learning_rate": 0.0002, + "loss": 0.8515, + "step": 2910 + }, + { + "epoch": 3.8522427440633247, + "grad_norm": 0.5268424153327942, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 2920 + }, + { + "epoch": 3.8654353562005275, + "grad_norm": 0.9508116841316223, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 2930 + }, + { + "epoch": 3.8786279683377307, + "grad_norm": 0.9133715629577637, + "learning_rate": 0.0002, + "loss": 0.8437, + "step": 2940 + }, + { + "epoch": 3.891820580474934, + "grad_norm": 1.0144646167755127, + "learning_rate": 0.0002, + "loss": 0.8611, + "step": 2950 + }, + { + "epoch": 3.905013192612137, + "grad_norm": 0.6397877931594849, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 2960 + }, + { + "epoch": 3.9182058047493404, + "grad_norm": 0.734835147857666, + "learning_rate": 0.0002, + "loss": 0.8285, + "step": 2970 + }, + { + "epoch": 3.9313984168865437, + "grad_norm": 0.784853994846344, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 2980 + }, + { + "epoch": 3.944591029023747, + "grad_norm": 0.805831789970398, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 2990 + }, + { + "epoch": 3.9577836411609497, + "grad_norm": 0.6299595236778259, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 3000 + }, + { + "epoch": 3.970976253298153, + "grad_norm": 0.6264058351516724, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 3010 + }, + { + "epoch": 3.984168865435356, + "grad_norm": 0.6419739723205566, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 3020 + }, + { + "epoch": 3.9973614775725594, + "grad_norm": 0.7737036943435669, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 3030 + }, + { + "epoch": 4.0, + "eval_loss": 1.2454297542572021, + "eval_runtime": 71.8558, + "eval_samples_per_second": 5.998, + "eval_steps_per_second": 0.752, + "step": 3032 + }, + { + "epoch": 4.010554089709762, + "grad_norm": 1.092727541923523, + "learning_rate": 0.0002, + "loss": 0.6716, + "step": 3040 + }, + { + "epoch": 4.0237467018469655, + "grad_norm": 0.8087759613990784, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 3050 + }, + { + "epoch": 4.036939313984169, + "grad_norm": 0.8106053471565247, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 3060 + }, + { + "epoch": 4.050131926121372, + "grad_norm": 0.8675326704978943, + "learning_rate": 0.0002, + "loss": 0.6846, + "step": 3070 + }, + { + "epoch": 4.063324538258575, + "grad_norm": 0.9620490074157715, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 3080 + }, + { + "epoch": 4.076517150395778, + "grad_norm": 0.8996296525001526, + "learning_rate": 0.0002, + "loss": 0.6047, + "step": 3090 + }, + { + "epoch": 4.089709762532982, + "grad_norm": 0.8648998737335205, + "learning_rate": 0.0002, + "loss": 0.6111, + "step": 3100 + }, + { + "epoch": 4.102902374670185, + "grad_norm": 1.0321335792541504, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 3110 + }, + { + "epoch": 4.116094986807388, + "grad_norm": 0.7949225306510925, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 3120 + }, + { + "epoch": 4.129287598944591, + "grad_norm": 0.9684646129608154, + "learning_rate": 0.0002, + "loss": 0.6354, + "step": 3130 + }, + { + "epoch": 4.142480211081795, + "grad_norm": 0.8698066473007202, + "learning_rate": 0.0002, + "loss": 0.6198, + "step": 3140 + }, + { + "epoch": 4.155672823218997, + "grad_norm": 0.7688450813293457, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 3150 + }, + { + "epoch": 4.1688654353562, + "grad_norm": 0.9682092070579529, + "learning_rate": 0.0002, + "loss": 0.6053, + "step": 3160 + }, + { + "epoch": 4.1820580474934035, + "grad_norm": 0.961561918258667, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 3170 + }, + { + "epoch": 4.195250659630607, + "grad_norm": 1.3962990045547485, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 3180 + }, + { + "epoch": 4.20844327176781, + "grad_norm": 0.9485045075416565, + "learning_rate": 0.0002, + "loss": 0.6319, + "step": 3190 + }, + { + "epoch": 4.221635883905013, + "grad_norm": 0.7768281698226929, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 3200 + }, + { + "epoch": 4.2348284960422165, + "grad_norm": 1.2685691118240356, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 3210 + }, + { + "epoch": 4.24802110817942, + "grad_norm": 0.6876471638679504, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 3220 + }, + { + "epoch": 4.261213720316623, + "grad_norm": 1.0074554681777954, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 3230 + }, + { + "epoch": 4.274406332453826, + "grad_norm": 0.8094777464866638, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 3240 + }, + { + "epoch": 4.287598944591029, + "grad_norm": 0.7906569242477417, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 3250 + }, + { + "epoch": 4.300791556728232, + "grad_norm": 0.840238630771637, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 3260 + }, + { + "epoch": 4.313984168865435, + "grad_norm": 1.0119295120239258, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 3270 + }, + { + "epoch": 4.327176781002638, + "grad_norm": 0.7943191528320312, + "learning_rate": 0.0002, + "loss": 0.5912, + "step": 3280 + }, + { + "epoch": 4.3403693931398415, + "grad_norm": 0.7691723704338074, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 3290 + }, + { + "epoch": 4.353562005277045, + "grad_norm": 0.7227770686149597, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 3300 + }, + { + "epoch": 4.366754617414248, + "grad_norm": 0.8512253165245056, + "learning_rate": 0.0002, + "loss": 0.6047, + "step": 3310 + }, + { + "epoch": 4.379947229551451, + "grad_norm": 0.7852529287338257, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 3320 + }, + { + "epoch": 4.3931398416886545, + "grad_norm": 0.8888797163963318, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 3330 + }, + { + "epoch": 4.406332453825858, + "grad_norm": 0.9522430896759033, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 3340 + }, + { + "epoch": 4.419525065963061, + "grad_norm": 0.900276780128479, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 3350 + }, + { + "epoch": 4.432717678100264, + "grad_norm": 1.181547999382019, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 3360 + }, + { + "epoch": 4.445910290237467, + "grad_norm": 0.903142511844635, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 3370 + }, + { + "epoch": 4.45910290237467, + "grad_norm": 0.8747565150260925, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 3380 + }, + { + "epoch": 4.472295514511873, + "grad_norm": 0.7838051319122314, + "learning_rate": 0.0002, + "loss": 0.7071, + "step": 3390 + }, + { + "epoch": 4.485488126649076, + "grad_norm": 0.8691313862800598, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 3400 + }, + { + "epoch": 4.4986807387862795, + "grad_norm": 0.8493868708610535, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 3410 + }, + { + "epoch": 4.511873350923483, + "grad_norm": 1.0104830265045166, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 3420 + }, + { + "epoch": 4.525065963060686, + "grad_norm": 1.1716967821121216, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 3430 + }, + { + "epoch": 4.538258575197889, + "grad_norm": 0.9122593998908997, + "learning_rate": 0.0002, + "loss": 0.6411, + "step": 3440 + }, + { + "epoch": 4.5514511873350925, + "grad_norm": 0.829090416431427, + "learning_rate": 0.0002, + "loss": 0.7047, + "step": 3450 + }, + { + "epoch": 4.564643799472296, + "grad_norm": 1.141662836074829, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 3460 + }, + { + "epoch": 4.577836411609499, + "grad_norm": 0.8423182368278503, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 3470 + }, + { + "epoch": 4.591029023746702, + "grad_norm": 0.8024184703826904, + "learning_rate": 0.0002, + "loss": 0.6797, + "step": 3480 + }, + { + "epoch": 4.6042216358839045, + "grad_norm": 0.7703381776809692, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 3490 + }, + { + "epoch": 4.617414248021108, + "grad_norm": 0.9883959293365479, + "learning_rate": 0.0002, + "loss": 0.7001, + "step": 3500 + }, + { + "epoch": 4.630606860158311, + "grad_norm": 0.9554709196090698, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 3510 + }, + { + "epoch": 4.643799472295514, + "grad_norm": 1.9949709177017212, + "learning_rate": 0.0002, + "loss": 0.7378, + "step": 3520 + }, + { + "epoch": 4.6569920844327175, + "grad_norm": 0.7762255072593689, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 3530 + }, + { + "epoch": 4.670184696569921, + "grad_norm": 0.9538425803184509, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 3540 + }, + { + "epoch": 4.683377308707124, + "grad_norm": 1.0279661417007446, + "learning_rate": 0.0002, + "loss": 0.6352, + "step": 3550 + }, + { + "epoch": 4.696569920844327, + "grad_norm": 0.7545472979545593, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 3560 + }, + { + "epoch": 4.7097625329815305, + "grad_norm": 0.8919376730918884, + "learning_rate": 0.0002, + "loss": 0.6887, + "step": 3570 + }, + { + "epoch": 4.722955145118734, + "grad_norm": 0.7621569633483887, + "learning_rate": 0.0002, + "loss": 0.6395, + "step": 3580 + }, + { + "epoch": 4.736147757255937, + "grad_norm": 1.205320119857788, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 3590 + }, + { + "epoch": 4.74934036939314, + "grad_norm": 1.0642725229263306, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 3600 + }, + { + "epoch": 4.762532981530343, + "grad_norm": 0.9402666687965393, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 3610 + }, + { + "epoch": 4.775725593667546, + "grad_norm": 1.254127025604248, + "learning_rate": 0.0002, + "loss": 0.6395, + "step": 3620 + }, + { + "epoch": 4.788918205804749, + "grad_norm": 0.7609598636627197, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 3630 + }, + { + "epoch": 4.802110817941952, + "grad_norm": 0.8240329623222351, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 3640 + }, + { + "epoch": 4.8153034300791555, + "grad_norm": 0.8356260657310486, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 3650 + }, + { + "epoch": 4.828496042216359, + "grad_norm": 0.9130708575248718, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 3660 + }, + { + "epoch": 4.841688654353562, + "grad_norm": 0.9384765028953552, + "learning_rate": 0.0002, + "loss": 0.7269, + "step": 3670 + }, + { + "epoch": 4.854881266490765, + "grad_norm": 0.9829966425895691, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 3680 + }, + { + "epoch": 4.8680738786279685, + "grad_norm": 1.0488632917404175, + "learning_rate": 0.0002, + "loss": 0.6311, + "step": 3690 + }, + { + "epoch": 4.881266490765172, + "grad_norm": 1.2278969287872314, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 3700 + }, + { + "epoch": 4.894459102902375, + "grad_norm": 0.8078970313072205, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 3710 + }, + { + "epoch": 4.907651715039578, + "grad_norm": 0.8081700205802917, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 3720 + }, + { + "epoch": 4.9208443271767806, + "grad_norm": 0.9204511046409607, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 3730 + }, + { + "epoch": 4.934036939313984, + "grad_norm": 0.9326391220092773, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 3740 + }, + { + "epoch": 4.947229551451187, + "grad_norm": 1.0089969635009766, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 3750 + }, + { + "epoch": 4.96042216358839, + "grad_norm": 0.7063466906547546, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 3760 + }, + { + "epoch": 4.9736147757255935, + "grad_norm": 1.2603905200958252, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 3770 + }, + { + "epoch": 4.986807387862797, + "grad_norm": 0.8418653607368469, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 3780 + }, + { + "epoch": 5.0, + "grad_norm": 0.9537181854248047, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 3790 + }, + { + "epoch": 5.0, + "eval_loss": 1.3319307565689087, + "eval_runtime": 71.7836, + "eval_samples_per_second": 6.004, + "eval_steps_per_second": 0.752, + "step": 3790 + }, + { + "epoch": 5.013192612137203, + "grad_norm": 0.8595899343490601, + "learning_rate": 0.0002, + "loss": 0.489, + "step": 3800 + }, + { + "epoch": 5.0263852242744065, + "grad_norm": 1.0023565292358398, + "learning_rate": 0.0002, + "loss": 0.5155, + "step": 3810 + }, + { + "epoch": 5.03957783641161, + "grad_norm": 1.2770460844039917, + "learning_rate": 0.0002, + "loss": 0.5321, + "step": 3820 + }, + { + "epoch": 5.052770448548813, + "grad_norm": 1.1701956987380981, + "learning_rate": 0.0002, + "loss": 0.5127, + "step": 3830 + }, + { + "epoch": 5.065963060686016, + "grad_norm": 0.812269926071167, + "learning_rate": 0.0002, + "loss": 0.5057, + "step": 3840 + }, + { + "epoch": 5.0791556728232194, + "grad_norm": 0.8186697363853455, + "learning_rate": 0.0002, + "loss": 0.4292, + "step": 3850 + }, + { + "epoch": 5.092348284960422, + "grad_norm": 1.052565097808838, + "learning_rate": 0.0002, + "loss": 0.4865, + "step": 3860 + }, + { + "epoch": 5.105540897097625, + "grad_norm": 0.9764705300331116, + "learning_rate": 0.0002, + "loss": 0.4947, + "step": 3870 + }, + { + "epoch": 5.118733509234828, + "grad_norm": 0.6973426938056946, + "learning_rate": 0.0002, + "loss": 0.471, + "step": 3880 + }, + { + "epoch": 5.1319261213720315, + "grad_norm": 1.2127928733825684, + "learning_rate": 0.0002, + "loss": 0.5565, + "step": 3890 + }, + { + "epoch": 5.145118733509235, + "grad_norm": 0.682807981967926, + "learning_rate": 0.0002, + "loss": 0.4122, + "step": 3900 + }, + { + "epoch": 5.158311345646438, + "grad_norm": 1.3575998544692993, + "learning_rate": 0.0002, + "loss": 0.6378, + "step": 3910 + }, + { + "epoch": 5.171503957783641, + "grad_norm": 1.2581931352615356, + "learning_rate": 0.0002, + "loss": 0.4624, + "step": 3920 + }, + { + "epoch": 5.1846965699208445, + "grad_norm": 1.0493637323379517, + "learning_rate": 0.0002, + "loss": 0.5092, + "step": 3930 + }, + { + "epoch": 5.197889182058048, + "grad_norm": 1.3519670963287354, + "learning_rate": 0.0002, + "loss": 0.4563, + "step": 3940 + }, + { + "epoch": 5.211081794195251, + "grad_norm": 1.0690566301345825, + "learning_rate": 0.0002, + "loss": 0.5414, + "step": 3950 + }, + { + "epoch": 5.224274406332454, + "grad_norm": 1.1171330213546753, + "learning_rate": 0.0002, + "loss": 0.5038, + "step": 3960 + }, + { + "epoch": 5.237467018469657, + "grad_norm": 1.055851697921753, + "learning_rate": 0.0002, + "loss": 0.4397, + "step": 3970 + }, + { + "epoch": 5.25065963060686, + "grad_norm": 0.8870180249214172, + "learning_rate": 0.0002, + "loss": 0.4964, + "step": 3980 + }, + { + "epoch": 5.263852242744063, + "grad_norm": 0.9688402414321899, + "learning_rate": 0.0002, + "loss": 0.5353, + "step": 3990 + }, + { + "epoch": 5.277044854881266, + "grad_norm": 0.8458422422409058, + "learning_rate": 0.0002, + "loss": 0.5192, + "step": 4000 + }, + { + "epoch": 5.2902374670184695, + "grad_norm": 0.908256471157074, + "learning_rate": 0.0002, + "loss": 0.5458, + "step": 4010 + }, + { + "epoch": 5.303430079155673, + "grad_norm": 1.0058149099349976, + "learning_rate": 0.0002, + "loss": 0.5102, + "step": 4020 + }, + { + "epoch": 5.316622691292876, + "grad_norm": 1.20364511013031, + "learning_rate": 0.0002, + "loss": 0.5322, + "step": 4030 + }, + { + "epoch": 5.329815303430079, + "grad_norm": 1.0135732889175415, + "learning_rate": 0.0002, + "loss": 0.5715, + "step": 4040 + }, + { + "epoch": 5.3430079155672825, + "grad_norm": 1.1094907522201538, + "learning_rate": 0.0002, + "loss": 0.4736, + "step": 4050 + }, + { + "epoch": 5.356200527704486, + "grad_norm": 1.0373083353042603, + "learning_rate": 0.0002, + "loss": 0.4912, + "step": 4060 + }, + { + "epoch": 5.369393139841689, + "grad_norm": 1.0952966213226318, + "learning_rate": 0.0002, + "loss": 0.5258, + "step": 4070 + }, + { + "epoch": 5.382585751978892, + "grad_norm": 1.1734952926635742, + "learning_rate": 0.0002, + "loss": 0.4892, + "step": 4080 + }, + { + "epoch": 5.395778364116095, + "grad_norm": 0.8217245936393738, + "learning_rate": 0.0002, + "loss": 0.4463, + "step": 4090 + }, + { + "epoch": 5.408970976253298, + "grad_norm": 1.0936307907104492, + "learning_rate": 0.0002, + "loss": 0.5271, + "step": 4100 + }, + { + "epoch": 5.422163588390501, + "grad_norm": 1.0198720693588257, + "learning_rate": 0.0002, + "loss": 0.509, + "step": 4110 + }, + { + "epoch": 5.435356200527704, + "grad_norm": 1.1105809211730957, + "learning_rate": 0.0002, + "loss": 0.5265, + "step": 4120 + }, + { + "epoch": 5.4485488126649075, + "grad_norm": 1.1817213296890259, + "learning_rate": 0.0002, + "loss": 0.4871, + "step": 4130 + }, + { + "epoch": 5.461741424802111, + "grad_norm": 1.126339077949524, + "learning_rate": 0.0002, + "loss": 0.4987, + "step": 4140 + }, + { + "epoch": 5.474934036939314, + "grad_norm": 0.9467914700508118, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 4150 + }, + { + "epoch": 5.488126649076517, + "grad_norm": 1.0335774421691895, + "learning_rate": 0.0002, + "loss": 0.5386, + "step": 4160 + }, + { + "epoch": 5.5013192612137205, + "grad_norm": 0.866211473941803, + "learning_rate": 0.0002, + "loss": 0.5122, + "step": 4170 + }, + { + "epoch": 5.514511873350924, + "grad_norm": 0.7422948479652405, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 4180 + }, + { + "epoch": 5.527704485488127, + "grad_norm": 1.2211135625839233, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 4190 + }, + { + "epoch": 5.540897097625329, + "grad_norm": 1.0371766090393066, + "learning_rate": 0.0002, + "loss": 0.5476, + "step": 4200 + }, + { + "epoch": 5.554089709762533, + "grad_norm": 0.9460630416870117, + "learning_rate": 0.0002, + "loss": 0.5941, + "step": 4210 + }, + { + "epoch": 5.567282321899736, + "grad_norm": 0.7972197532653809, + "learning_rate": 0.0002, + "loss": 0.4645, + "step": 4220 + }, + { + "epoch": 5.580474934036939, + "grad_norm": 1.0654675960540771, + "learning_rate": 0.0002, + "loss": 0.5087, + "step": 4230 + }, + { + "epoch": 5.593667546174142, + "grad_norm": 1.0776735544204712, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 4240 + }, + { + "epoch": 5.6068601583113455, + "grad_norm": 1.498723030090332, + "learning_rate": 0.0002, + "loss": 0.53, + "step": 4250 + }, + { + "epoch": 5.620052770448549, + "grad_norm": 1.006768822669983, + "learning_rate": 0.0002, + "loss": 0.4788, + "step": 4260 + }, + { + "epoch": 5.633245382585752, + "grad_norm": 0.9194242358207703, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 4270 + }, + { + "epoch": 5.646437994722955, + "grad_norm": 1.1028380393981934, + "learning_rate": 0.0002, + "loss": 0.5722, + "step": 4280 + }, + { + "epoch": 5.6596306068601585, + "grad_norm": 0.9972755312919617, + "learning_rate": 0.0002, + "loss": 0.5319, + "step": 4290 + }, + { + "epoch": 5.672823218997362, + "grad_norm": 1.0509438514709473, + "learning_rate": 0.0002, + "loss": 0.53, + "step": 4300 + }, + { + "epoch": 5.686015831134565, + "grad_norm": 1.064039945602417, + "learning_rate": 0.0002, + "loss": 0.4738, + "step": 4310 + }, + { + "epoch": 5.699208443271768, + "grad_norm": 0.9572229981422424, + "learning_rate": 0.0002, + "loss": 0.5401, + "step": 4320 + }, + { + "epoch": 5.7124010554089715, + "grad_norm": 0.9956564903259277, + "learning_rate": 0.0002, + "loss": 0.5173, + "step": 4330 + }, + { + "epoch": 5.725593667546174, + "grad_norm": 1.01974618434906, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 4340 + }, + { + "epoch": 5.738786279683377, + "grad_norm": 1.101328730583191, + "learning_rate": 0.0002, + "loss": 0.5111, + "step": 4350 + }, + { + "epoch": 5.75197889182058, + "grad_norm": 0.9971756935119629, + "learning_rate": 0.0002, + "loss": 0.5921, + "step": 4360 + }, + { + "epoch": 5.7651715039577835, + "grad_norm": 0.8579474687576294, + "learning_rate": 0.0002, + "loss": 0.5262, + "step": 4370 + }, + { + "epoch": 5.778364116094987, + "grad_norm": 0.9927367568016052, + "learning_rate": 0.0002, + "loss": 0.5106, + "step": 4380 + }, + { + "epoch": 5.79155672823219, + "grad_norm": 1.1183884143829346, + "learning_rate": 0.0002, + "loss": 0.5354, + "step": 4390 + }, + { + "epoch": 5.804749340369393, + "grad_norm": 0.7695905566215515, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 4400 + }, + { + "epoch": 5.8179419525065965, + "grad_norm": 1.1102122068405151, + "learning_rate": 0.0002, + "loss": 0.5137, + "step": 4410 + }, + { + "epoch": 5.8311345646438, + "grad_norm": 1.3201336860656738, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 4420 + }, + { + "epoch": 5.844327176781003, + "grad_norm": 1.1934558153152466, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 4430 + }, + { + "epoch": 5.857519788918205, + "grad_norm": 1.390870451927185, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 4440 + }, + { + "epoch": 5.870712401055409, + "grad_norm": 1.056314468383789, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 4450 + }, + { + "epoch": 5.883905013192612, + "grad_norm": 0.9797437191009521, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 4460 + }, + { + "epoch": 5.897097625329815, + "grad_norm": 1.2368146181106567, + "learning_rate": 0.0002, + "loss": 0.5479, + "step": 4470 + }, + { + "epoch": 5.910290237467018, + "grad_norm": 0.9062654376029968, + "learning_rate": 0.0002, + "loss": 0.5453, + "step": 4480 + }, + { + "epoch": 5.923482849604222, + "grad_norm": 1.8643536567687988, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 4490 + }, + { + "epoch": 5.936675461741425, + "grad_norm": 1.2977997064590454, + "learning_rate": 0.0002, + "loss": 0.5858, + "step": 4500 + }, + { + "epoch": 5.949868073878628, + "grad_norm": 0.8366201519966125, + "learning_rate": 0.0002, + "loss": 0.4815, + "step": 4510 + }, + { + "epoch": 5.963060686015831, + "grad_norm": 1.0210131406784058, + "learning_rate": 0.0002, + "loss": 0.5126, + "step": 4520 + }, + { + "epoch": 5.9762532981530345, + "grad_norm": 1.1287827491760254, + "learning_rate": 0.0002, + "loss": 0.5577, + "step": 4530 + }, + { + "epoch": 5.989445910290238, + "grad_norm": 1.0480493307113647, + "learning_rate": 0.0002, + "loss": 0.5053, + "step": 4540 + }, + { + "epoch": 6.0, + "eval_loss": 1.450880765914917, + "eval_runtime": 71.8135, + "eval_samples_per_second": 6.002, + "eval_steps_per_second": 0.752, + "step": 4548 + } + ], + "logging_steps": 10, + "max_steps": 6064, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.1047125661555098e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..0c0289389bcb2236bf804b0a82c50e67c11a1379 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-4548/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e04722e246164eebe6b4f70dd50fe946c936a2bd25edd75bcedcfd4c7ff9474e +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/README.md b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c8d439b8c074798183ca453cf105a0837763e2a7 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4819614035f07fe9cb7e807747604f03e22f0307487f5b50d692667a14e36c83 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ecf3f5a90a2987206e0ce7be497718fb64a41bfc --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dece4997d320bcae9cd149513e3789163bc77c0c531c0ae26d71e1a22fd1dac +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d239d09fba84fa8b275199ecfa9e23422ab04a6b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af518a007809b768a453426e72cb24ac9c30e553b567aa221bd1f2e0707175f6 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2aff7039cc13b932a352ab94424dfbe66430f0a1 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:767dcfb7854b7cc6f100471fb3d42b062a1326364963fa8473a8c6c1d770659f +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..66c5f05d405455bbfde9c093c281189d551ad78d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/trainer_state.json @@ -0,0 +1,3799 @@ +{ + "best_metric": 1.1874967813491821, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516", + "epoch": 7.0, + "eval_steps": 10, + "global_step": 5306, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013192612137203167, + "grad_norm": 0.7545632123947144, + "learning_rate": 0.0002, + "loss": 1.9466, + "step": 10 + }, + { + "epoch": 0.026385224274406333, + "grad_norm": 0.5787661075592041, + "learning_rate": 0.0002, + "loss": 1.4909, + "step": 20 + }, + { + "epoch": 0.0395778364116095, + "grad_norm": 0.8616093993186951, + "learning_rate": 0.0002, + "loss": 1.3906, + "step": 30 + }, + { + "epoch": 0.052770448548812667, + "grad_norm": 0.42088547348976135, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 40 + }, + { + "epoch": 0.06596306068601583, + "grad_norm": 0.47704678773880005, + "learning_rate": 0.0002, + "loss": 1.3446, + "step": 50 + }, + { + "epoch": 0.079155672823219, + "grad_norm": 0.5763994455337524, + "learning_rate": 0.0002, + "loss": 1.2476, + "step": 60 + }, + { + "epoch": 0.09234828496042216, + "grad_norm": 0.4579846262931824, + "learning_rate": 0.0002, + "loss": 1.2268, + "step": 70 + }, + { + "epoch": 0.10554089709762533, + "grad_norm": 0.46623846888542175, + "learning_rate": 0.0002, + "loss": 1.415, + "step": 80 + }, + { + "epoch": 0.11873350923482849, + "grad_norm": 0.4206956624984741, + "learning_rate": 0.0002, + "loss": 1.2849, + "step": 90 + }, + { + "epoch": 0.13192612137203166, + "grad_norm": 0.41896629333496094, + "learning_rate": 0.0002, + "loss": 1.1608, + "step": 100 + }, + { + "epoch": 0.14511873350923482, + "grad_norm": 0.3459089398384094, + "learning_rate": 0.0002, + "loss": 1.2617, + "step": 110 + }, + { + "epoch": 0.158311345646438, + "grad_norm": 0.4587327837944031, + "learning_rate": 0.0002, + "loss": 1.2858, + "step": 120 + }, + { + "epoch": 0.17150395778364116, + "grad_norm": 0.433525413274765, + "learning_rate": 0.0002, + "loss": 1.2241, + "step": 130 + }, + { + "epoch": 0.18469656992084432, + "grad_norm": 0.39253175258636475, + "learning_rate": 0.0002, + "loss": 1.2269, + "step": 140 + }, + { + "epoch": 0.19788918205804748, + "grad_norm": 0.3602290749549866, + "learning_rate": 0.0002, + "loss": 1.2224, + "step": 150 + }, + { + "epoch": 0.21108179419525067, + "grad_norm": 0.41160839796066284, + "learning_rate": 0.0002, + "loss": 1.2834, + "step": 160 + }, + { + "epoch": 0.22427440633245382, + "grad_norm": 0.7213630080223083, + "learning_rate": 0.0002, + "loss": 1.1986, + "step": 170 + }, + { + "epoch": 0.23746701846965698, + "grad_norm": 0.39086055755615234, + "learning_rate": 0.0002, + "loss": 1.2215, + "step": 180 + }, + { + "epoch": 0.25065963060686014, + "grad_norm": 0.4465520977973938, + "learning_rate": 0.0002, + "loss": 1.3315, + "step": 190 + }, + { + "epoch": 0.2638522427440633, + "grad_norm": 1.814679741859436, + "learning_rate": 0.0002, + "loss": 1.3798, + "step": 200 + }, + { + "epoch": 0.2770448548812665, + "grad_norm": 0.5026423931121826, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 210 + }, + { + "epoch": 0.29023746701846964, + "grad_norm": 0.4156292974948883, + "learning_rate": 0.0002, + "loss": 1.3306, + "step": 220 + }, + { + "epoch": 0.3034300791556728, + "grad_norm": 0.40813493728637695, + "learning_rate": 0.0002, + "loss": 1.266, + "step": 230 + }, + { + "epoch": 0.316622691292876, + "grad_norm": 0.3304787874221802, + "learning_rate": 0.0002, + "loss": 1.1533, + "step": 240 + }, + { + "epoch": 0.32981530343007914, + "grad_norm": 0.46139976382255554, + "learning_rate": 0.0002, + "loss": 1.3154, + "step": 250 + }, + { + "epoch": 0.34300791556728233, + "grad_norm": 0.37518271803855896, + "learning_rate": 0.0002, + "loss": 1.1365, + "step": 260 + }, + { + "epoch": 0.3562005277044855, + "grad_norm": 0.35586467385292053, + "learning_rate": 0.0002, + "loss": 1.2004, + "step": 270 + }, + { + "epoch": 0.36939313984168864, + "grad_norm": 0.32441186904907227, + "learning_rate": 0.0002, + "loss": 1.2609, + "step": 280 + }, + { + "epoch": 0.38258575197889183, + "grad_norm": 0.3198683261871338, + "learning_rate": 0.0002, + "loss": 1.212, + "step": 290 + }, + { + "epoch": 0.39577836411609496, + "grad_norm": 0.33663108944892883, + "learning_rate": 0.0002, + "loss": 1.2465, + "step": 300 + }, + { + "epoch": 0.40897097625329815, + "grad_norm": 0.3711244761943817, + "learning_rate": 0.0002, + "loss": 1.1702, + "step": 310 + }, + { + "epoch": 0.42216358839050133, + "grad_norm": 0.3209651708602905, + "learning_rate": 0.0002, + "loss": 1.0871, + "step": 320 + }, + { + "epoch": 0.43535620052770446, + "grad_norm": 0.5152716040611267, + "learning_rate": 0.0002, + "loss": 1.2728, + "step": 330 + }, + { + "epoch": 0.44854881266490765, + "grad_norm": 0.5431376695632935, + "learning_rate": 0.0002, + "loss": 1.1833, + "step": 340 + }, + { + "epoch": 0.46174142480211083, + "grad_norm": 0.3069997727870941, + "learning_rate": 0.0002, + "loss": 1.1344, + "step": 350 + }, + { + "epoch": 0.47493403693931396, + "grad_norm": 0.34260064363479614, + "learning_rate": 0.0002, + "loss": 1.2742, + "step": 360 + }, + { + "epoch": 0.48812664907651715, + "grad_norm": 0.345653235912323, + "learning_rate": 0.0002, + "loss": 1.3161, + "step": 370 + }, + { + "epoch": 0.5013192612137203, + "grad_norm": 0.46222734451293945, + "learning_rate": 0.0002, + "loss": 1.1552, + "step": 380 + }, + { + "epoch": 0.5145118733509235, + "grad_norm": 0.27301734685897827, + "learning_rate": 0.0002, + "loss": 1.0616, + "step": 390 + }, + { + "epoch": 0.5277044854881267, + "grad_norm": 0.29048439860343933, + "learning_rate": 0.0002, + "loss": 1.1944, + "step": 400 + }, + { + "epoch": 0.5408970976253298, + "grad_norm": 0.32927802205085754, + "learning_rate": 0.0002, + "loss": 1.0956, + "step": 410 + }, + { + "epoch": 0.554089709762533, + "grad_norm": 0.3336397409439087, + "learning_rate": 0.0002, + "loss": 1.2362, + "step": 420 + }, + { + "epoch": 0.5672823218997362, + "grad_norm": 0.4007597267627716, + "learning_rate": 0.0002, + "loss": 1.1445, + "step": 430 + }, + { + "epoch": 0.5804749340369393, + "grad_norm": 0.36144956946372986, + "learning_rate": 0.0002, + "loss": 1.2731, + "step": 440 + }, + { + "epoch": 0.5936675461741425, + "grad_norm": 0.6331009864807129, + "learning_rate": 0.0002, + "loss": 1.1604, + "step": 450 + }, + { + "epoch": 0.6068601583113457, + "grad_norm": 0.41469088196754456, + "learning_rate": 0.0002, + "loss": 1.1692, + "step": 460 + }, + { + "epoch": 0.6200527704485488, + "grad_norm": 0.4388185143470764, + "learning_rate": 0.0002, + "loss": 1.2372, + "step": 470 + }, + { + "epoch": 0.633245382585752, + "grad_norm": 0.3738141655921936, + "learning_rate": 0.0002, + "loss": 1.2541, + "step": 480 + }, + { + "epoch": 0.6464379947229552, + "grad_norm": 0.7212023138999939, + "learning_rate": 0.0002, + "loss": 1.1265, + "step": 490 + }, + { + "epoch": 0.6596306068601583, + "grad_norm": 0.2972351014614105, + "learning_rate": 0.0002, + "loss": 1.2786, + "step": 500 + }, + { + "epoch": 0.6728232189973615, + "grad_norm": 0.45293179154396057, + "learning_rate": 0.0002, + "loss": 1.1739, + "step": 510 + }, + { + "epoch": 0.6860158311345647, + "grad_norm": 0.4319860637187958, + "learning_rate": 0.0002, + "loss": 1.255, + "step": 520 + }, + { + "epoch": 0.6992084432717678, + "grad_norm": 0.3050215542316437, + "learning_rate": 0.0002, + "loss": 1.0959, + "step": 530 + }, + { + "epoch": 0.712401055408971, + "grad_norm": 0.3552611172199249, + "learning_rate": 0.0002, + "loss": 1.1608, + "step": 540 + }, + { + "epoch": 0.7255936675461742, + "grad_norm": 0.3631151020526886, + "learning_rate": 0.0002, + "loss": 1.2546, + "step": 550 + }, + { + "epoch": 0.7387862796833773, + "grad_norm": 0.28177931904792786, + "learning_rate": 0.0002, + "loss": 1.1371, + "step": 560 + }, + { + "epoch": 0.7519788918205804, + "grad_norm": 0.359764039516449, + "learning_rate": 0.0002, + "loss": 1.2081, + "step": 570 + }, + { + "epoch": 0.7651715039577837, + "grad_norm": 0.3970327377319336, + "learning_rate": 0.0002, + "loss": 1.1356, + "step": 580 + }, + { + "epoch": 0.7783641160949868, + "grad_norm": 0.3541001081466675, + "learning_rate": 0.0002, + "loss": 1.309, + "step": 590 + }, + { + "epoch": 0.7915567282321899, + "grad_norm": 0.3478573262691498, + "learning_rate": 0.0002, + "loss": 1.2061, + "step": 600 + }, + { + "epoch": 0.8047493403693932, + "grad_norm": 0.3900321424007416, + "learning_rate": 0.0002, + "loss": 1.1864, + "step": 610 + }, + { + "epoch": 0.8179419525065963, + "grad_norm": 0.3443238437175751, + "learning_rate": 0.0002, + "loss": 1.1358, + "step": 620 + }, + { + "epoch": 0.8311345646437994, + "grad_norm": 0.44238781929016113, + "learning_rate": 0.0002, + "loss": 1.3232, + "step": 630 + }, + { + "epoch": 0.8443271767810027, + "grad_norm": 0.36339467763900757, + "learning_rate": 0.0002, + "loss": 1.1247, + "step": 640 + }, + { + "epoch": 0.8575197889182058, + "grad_norm": 0.6243070363998413, + "learning_rate": 0.0002, + "loss": 1.1049, + "step": 650 + }, + { + "epoch": 0.8707124010554089, + "grad_norm": 0.3209173381328583, + "learning_rate": 0.0002, + "loss": 1.1943, + "step": 660 + }, + { + "epoch": 0.8839050131926122, + "grad_norm": 0.35017991065979004, + "learning_rate": 0.0002, + "loss": 1.1927, + "step": 670 + }, + { + "epoch": 0.8970976253298153, + "grad_norm": 0.3247159421443939, + "learning_rate": 0.0002, + "loss": 1.1603, + "step": 680 + }, + { + "epoch": 0.9102902374670184, + "grad_norm": 0.4091894030570984, + "learning_rate": 0.0002, + "loss": 1.2298, + "step": 690 + }, + { + "epoch": 0.9234828496042217, + "grad_norm": 0.3975585997104645, + "learning_rate": 0.0002, + "loss": 1.2756, + "step": 700 + }, + { + "epoch": 0.9366754617414248, + "grad_norm": 0.3666245937347412, + "learning_rate": 0.0002, + "loss": 1.281, + "step": 710 + }, + { + "epoch": 0.9498680738786279, + "grad_norm": 0.45216917991638184, + "learning_rate": 0.0002, + "loss": 1.2855, + "step": 720 + }, + { + "epoch": 0.9630606860158312, + "grad_norm": 0.36108118295669556, + "learning_rate": 0.0002, + "loss": 1.2217, + "step": 730 + }, + { + "epoch": 0.9762532981530343, + "grad_norm": 0.44550251960754395, + "learning_rate": 0.0002, + "loss": 1.17, + "step": 740 + }, + { + "epoch": 0.9894459102902374, + "grad_norm": 0.29801255464553833, + "learning_rate": 0.0002, + "loss": 1.0672, + "step": 750 + }, + { + "epoch": 1.0, + "eval_loss": 1.203244686126709, + "eval_runtime": 76.0457, + "eval_samples_per_second": 5.668, + "eval_steps_per_second": 0.71, + "step": 758 + }, + { + "epoch": 1.0026385224274406, + "grad_norm": 0.4096551239490509, + "learning_rate": 0.0002, + "loss": 1.0748, + "step": 760 + }, + { + "epoch": 1.0158311345646438, + "grad_norm": 0.2649582326412201, + "learning_rate": 0.0002, + "loss": 1.1537, + "step": 770 + }, + { + "epoch": 1.029023746701847, + "grad_norm": 0.3100722134113312, + "learning_rate": 0.0002, + "loss": 1.1676, + "step": 780 + }, + { + "epoch": 1.04221635883905, + "grad_norm": 0.3911755383014679, + "learning_rate": 0.0002, + "loss": 1.1736, + "step": 790 + }, + { + "epoch": 1.0554089709762533, + "grad_norm": 0.4600953757762909, + "learning_rate": 0.0002, + "loss": 1.0354, + "step": 800 + }, + { + "epoch": 1.0686015831134565, + "grad_norm": 0.28671619296073914, + "learning_rate": 0.0002, + "loss": 1.2869, + "step": 810 + }, + { + "epoch": 1.0817941952506596, + "grad_norm": 0.47282642126083374, + "learning_rate": 0.0002, + "loss": 1.1441, + "step": 820 + }, + { + "epoch": 1.0949868073878628, + "grad_norm": 0.690073549747467, + "learning_rate": 0.0002, + "loss": 1.1368, + "step": 830 + }, + { + "epoch": 1.108179419525066, + "grad_norm": 0.7317902445793152, + "learning_rate": 0.0002, + "loss": 0.9944, + "step": 840 + }, + { + "epoch": 1.121372031662269, + "grad_norm": 0.44215938448905945, + "learning_rate": 0.0002, + "loss": 1.108, + "step": 850 + }, + { + "epoch": 1.1345646437994723, + "grad_norm": 0.33875149488449097, + "learning_rate": 0.0002, + "loss": 1.0558, + "step": 860 + }, + { + "epoch": 1.1477572559366755, + "grad_norm": 0.3700002431869507, + "learning_rate": 0.0002, + "loss": 1.0471, + "step": 870 + }, + { + "epoch": 1.1609498680738786, + "grad_norm": 0.41173291206359863, + "learning_rate": 0.0002, + "loss": 1.0121, + "step": 880 + }, + { + "epoch": 1.1741424802110818, + "grad_norm": 0.5253589749336243, + "learning_rate": 0.0002, + "loss": 1.0312, + "step": 890 + }, + { + "epoch": 1.187335092348285, + "grad_norm": 0.3912237286567688, + "learning_rate": 0.0002, + "loss": 1.2238, + "step": 900 + }, + { + "epoch": 1.200527704485488, + "grad_norm": 0.40990331768989563, + "learning_rate": 0.0002, + "loss": 0.9434, + "step": 910 + }, + { + "epoch": 1.2137203166226913, + "grad_norm": 0.40377968549728394, + "learning_rate": 0.0002, + "loss": 1.0493, + "step": 920 + }, + { + "epoch": 1.2269129287598945, + "grad_norm": 0.4605846405029297, + "learning_rate": 0.0002, + "loss": 1.1496, + "step": 930 + }, + { + "epoch": 1.2401055408970976, + "grad_norm": 0.31564897298812866, + "learning_rate": 0.0002, + "loss": 1.1173, + "step": 940 + }, + { + "epoch": 1.2532981530343008, + "grad_norm": 0.39808550477027893, + "learning_rate": 0.0002, + "loss": 1.0547, + "step": 950 + }, + { + "epoch": 1.266490765171504, + "grad_norm": 0.3762115240097046, + "learning_rate": 0.0002, + "loss": 1.1367, + "step": 960 + }, + { + "epoch": 1.279683377308707, + "grad_norm": 0.4174984097480774, + "learning_rate": 0.0002, + "loss": 1.1596, + "step": 970 + }, + { + "epoch": 1.2928759894459103, + "grad_norm": 0.5263054966926575, + "learning_rate": 0.0002, + "loss": 1.1327, + "step": 980 + }, + { + "epoch": 1.3060686015831133, + "grad_norm": 0.41673699021339417, + "learning_rate": 0.0002, + "loss": 1.0339, + "step": 990 + }, + { + "epoch": 1.3192612137203166, + "grad_norm": 0.9613684415817261, + "learning_rate": 0.0002, + "loss": 1.1198, + "step": 1000 + }, + { + "epoch": 1.3324538258575198, + "grad_norm": 0.3690216839313507, + "learning_rate": 0.0002, + "loss": 1.0444, + "step": 1010 + }, + { + "epoch": 1.345646437994723, + "grad_norm": 0.521821141242981, + "learning_rate": 0.0002, + "loss": 1.0473, + "step": 1020 + }, + { + "epoch": 1.358839050131926, + "grad_norm": 0.3353094160556793, + "learning_rate": 0.0002, + "loss": 1.1065, + "step": 1030 + }, + { + "epoch": 1.3720316622691293, + "grad_norm": 0.3843843936920166, + "learning_rate": 0.0002, + "loss": 1.1286, + "step": 1040 + }, + { + "epoch": 1.3852242744063323, + "grad_norm": 0.372514545917511, + "learning_rate": 0.0002, + "loss": 1.1369, + "step": 1050 + }, + { + "epoch": 1.3984168865435356, + "grad_norm": 0.34537771344184875, + "learning_rate": 0.0002, + "loss": 1.0041, + "step": 1060 + }, + { + "epoch": 1.4116094986807388, + "grad_norm": 0.45349085330963135, + "learning_rate": 0.0002, + "loss": 1.0251, + "step": 1070 + }, + { + "epoch": 1.424802110817942, + "grad_norm": 0.5120177268981934, + "learning_rate": 0.0002, + "loss": 1.1873, + "step": 1080 + }, + { + "epoch": 1.437994722955145, + "grad_norm": 0.42800238728523254, + "learning_rate": 0.0002, + "loss": 1.0872, + "step": 1090 + }, + { + "epoch": 1.4511873350923483, + "grad_norm": 0.343832790851593, + "learning_rate": 0.0002, + "loss": 1.0734, + "step": 1100 + }, + { + "epoch": 1.4643799472295513, + "grad_norm": 0.3829841911792755, + "learning_rate": 0.0002, + "loss": 1.1286, + "step": 1110 + }, + { + "epoch": 1.4775725593667546, + "grad_norm": 0.4289931058883667, + "learning_rate": 0.0002, + "loss": 1.1268, + "step": 1120 + }, + { + "epoch": 1.4907651715039578, + "grad_norm": 0.42750850319862366, + "learning_rate": 0.0002, + "loss": 1.0676, + "step": 1130 + }, + { + "epoch": 1.503957783641161, + "grad_norm": 0.34328413009643555, + "learning_rate": 0.0002, + "loss": 1.072, + "step": 1140 + }, + { + "epoch": 1.517150395778364, + "grad_norm": 0.349096417427063, + "learning_rate": 0.0002, + "loss": 1.0863, + "step": 1150 + }, + { + "epoch": 1.5303430079155673, + "grad_norm": 0.7700717449188232, + "learning_rate": 0.0002, + "loss": 1.1462, + "step": 1160 + }, + { + "epoch": 1.5435356200527703, + "grad_norm": 0.39294949173927307, + "learning_rate": 0.0002, + "loss": 0.9528, + "step": 1170 + }, + { + "epoch": 1.5567282321899736, + "grad_norm": 0.36173608899116516, + "learning_rate": 0.0002, + "loss": 1.1603, + "step": 1180 + }, + { + "epoch": 1.5699208443271768, + "grad_norm": 0.6034277677536011, + "learning_rate": 0.0002, + "loss": 1.1508, + "step": 1190 + }, + { + "epoch": 1.58311345646438, + "grad_norm": 0.36694103479385376, + "learning_rate": 0.0002, + "loss": 1.0105, + "step": 1200 + }, + { + "epoch": 1.596306068601583, + "grad_norm": 0.4727209508419037, + "learning_rate": 0.0002, + "loss": 1.1479, + "step": 1210 + }, + { + "epoch": 1.6094986807387863, + "grad_norm": 0.6482883095741272, + "learning_rate": 0.0002, + "loss": 1.0689, + "step": 1220 + }, + { + "epoch": 1.6226912928759893, + "grad_norm": 0.5238035917282104, + "learning_rate": 0.0002, + "loss": 1.1405, + "step": 1230 + }, + { + "epoch": 1.6358839050131926, + "grad_norm": 0.4812222421169281, + "learning_rate": 0.0002, + "loss": 1.0596, + "step": 1240 + }, + { + "epoch": 1.6490765171503958, + "grad_norm": 0.7131702303886414, + "learning_rate": 0.0002, + "loss": 1.2729, + "step": 1250 + }, + { + "epoch": 1.662269129287599, + "grad_norm": 0.3803327977657318, + "learning_rate": 0.0002, + "loss": 0.9832, + "step": 1260 + }, + { + "epoch": 1.675461741424802, + "grad_norm": 0.3745088577270508, + "learning_rate": 0.0002, + "loss": 1.1433, + "step": 1270 + }, + { + "epoch": 1.6886543535620053, + "grad_norm": 0.4427378475666046, + "learning_rate": 0.0002, + "loss": 1.1018, + "step": 1280 + }, + { + "epoch": 1.7018469656992083, + "grad_norm": 0.797478973865509, + "learning_rate": 0.0002, + "loss": 1.0619, + "step": 1290 + }, + { + "epoch": 1.7150395778364116, + "grad_norm": 0.503620982170105, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 1300 + }, + { + "epoch": 1.7282321899736148, + "grad_norm": 0.4132426381111145, + "learning_rate": 0.0002, + "loss": 1.1324, + "step": 1310 + }, + { + "epoch": 1.741424802110818, + "grad_norm": 0.41811656951904297, + "learning_rate": 0.0002, + "loss": 1.172, + "step": 1320 + }, + { + "epoch": 1.754617414248021, + "grad_norm": 0.40647849440574646, + "learning_rate": 0.0002, + "loss": 1.0903, + "step": 1330 + }, + { + "epoch": 1.767810026385224, + "grad_norm": 0.42138347029685974, + "learning_rate": 0.0002, + "loss": 1.1316, + "step": 1340 + }, + { + "epoch": 1.7810026385224274, + "grad_norm": 0.46523579955101013, + "learning_rate": 0.0002, + "loss": 0.9955, + "step": 1350 + }, + { + "epoch": 1.7941952506596306, + "grad_norm": 0.39760419726371765, + "learning_rate": 0.0002, + "loss": 1.0307, + "step": 1360 + }, + { + "epoch": 1.8073878627968338, + "grad_norm": 0.37993717193603516, + "learning_rate": 0.0002, + "loss": 1.1218, + "step": 1370 + }, + { + "epoch": 1.820580474934037, + "grad_norm": 0.5404181480407715, + "learning_rate": 0.0002, + "loss": 1.0921, + "step": 1380 + }, + { + "epoch": 1.83377308707124, + "grad_norm": 0.4385245740413666, + "learning_rate": 0.0002, + "loss": 1.0903, + "step": 1390 + }, + { + "epoch": 1.8469656992084431, + "grad_norm": 0.529797375202179, + "learning_rate": 0.0002, + "loss": 1.1175, + "step": 1400 + }, + { + "epoch": 1.8601583113456464, + "grad_norm": 0.481567919254303, + "learning_rate": 0.0002, + "loss": 1.0577, + "step": 1410 + }, + { + "epoch": 1.8733509234828496, + "grad_norm": 0.34787362813949585, + "learning_rate": 0.0002, + "loss": 1.0581, + "step": 1420 + }, + { + "epoch": 1.8865435356200528, + "grad_norm": 0.6402362585067749, + "learning_rate": 0.0002, + "loss": 1.0536, + "step": 1430 + }, + { + "epoch": 1.899736147757256, + "grad_norm": 0.3461322784423828, + "learning_rate": 0.0002, + "loss": 1.0787, + "step": 1440 + }, + { + "epoch": 1.912928759894459, + "grad_norm": 0.44005653262138367, + "learning_rate": 0.0002, + "loss": 1.0925, + "step": 1450 + }, + { + "epoch": 1.9261213720316621, + "grad_norm": 0.4064280688762665, + "learning_rate": 0.0002, + "loss": 1.0414, + "step": 1460 + }, + { + "epoch": 1.9393139841688654, + "grad_norm": 0.5236523151397705, + "learning_rate": 0.0002, + "loss": 1.0608, + "step": 1470 + }, + { + "epoch": 1.9525065963060686, + "grad_norm": 0.41030219197273254, + "learning_rate": 0.0002, + "loss": 1.0572, + "step": 1480 + }, + { + "epoch": 1.9656992084432718, + "grad_norm": 0.39805835485458374, + "learning_rate": 0.0002, + "loss": 1.1204, + "step": 1490 + }, + { + "epoch": 1.978891820580475, + "grad_norm": 0.42974501848220825, + "learning_rate": 0.0002, + "loss": 1.0364, + "step": 1500 + }, + { + "epoch": 1.992084432717678, + "grad_norm": 0.4688243865966797, + "learning_rate": 0.0002, + "loss": 1.1169, + "step": 1510 + }, + { + "epoch": 2.0, + "eval_loss": 1.1874967813491821, + "eval_runtime": 71.9523, + "eval_samples_per_second": 5.99, + "eval_steps_per_second": 0.75, + "step": 1516 + }, + { + "epoch": 2.005277044854881, + "grad_norm": 0.4121631383895874, + "learning_rate": 0.0002, + "loss": 1.0401, + "step": 1520 + }, + { + "epoch": 2.0184696569920844, + "grad_norm": 0.4844197928905487, + "learning_rate": 0.0002, + "loss": 0.9384, + "step": 1530 + }, + { + "epoch": 2.0316622691292876, + "grad_norm": 0.45408546924591064, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 1540 + }, + { + "epoch": 2.044854881266491, + "grad_norm": 0.48662951588630676, + "learning_rate": 0.0002, + "loss": 1.0399, + "step": 1550 + }, + { + "epoch": 2.058047493403694, + "grad_norm": 0.7195899486541748, + "learning_rate": 0.0002, + "loss": 0.936, + "step": 1560 + }, + { + "epoch": 2.0712401055408973, + "grad_norm": 0.5071077346801758, + "learning_rate": 0.0002, + "loss": 0.9486, + "step": 1570 + }, + { + "epoch": 2.0844327176781, + "grad_norm": 0.7473958730697632, + "learning_rate": 0.0002, + "loss": 1.0055, + "step": 1580 + }, + { + "epoch": 2.0976253298153034, + "grad_norm": 0.5509232878684998, + "learning_rate": 0.0002, + "loss": 0.8309, + "step": 1590 + }, + { + "epoch": 2.1108179419525066, + "grad_norm": 0.5108042359352112, + "learning_rate": 0.0002, + "loss": 0.9181, + "step": 1600 + }, + { + "epoch": 2.12401055408971, + "grad_norm": 0.42331448197364807, + "learning_rate": 0.0002, + "loss": 0.9499, + "step": 1610 + }, + { + "epoch": 2.137203166226913, + "grad_norm": 0.46621623635292053, + "learning_rate": 0.0002, + "loss": 0.9359, + "step": 1620 + }, + { + "epoch": 2.150395778364116, + "grad_norm": 0.43802836537361145, + "learning_rate": 0.0002, + "loss": 0.9065, + "step": 1630 + }, + { + "epoch": 2.163588390501319, + "grad_norm": 0.49908021092414856, + "learning_rate": 0.0002, + "loss": 0.9375, + "step": 1640 + }, + { + "epoch": 2.1767810026385224, + "grad_norm": 0.4195636808872223, + "learning_rate": 0.0002, + "loss": 1.0389, + "step": 1650 + }, + { + "epoch": 2.1899736147757256, + "grad_norm": 0.49515822529792786, + "learning_rate": 0.0002, + "loss": 0.8501, + "step": 1660 + }, + { + "epoch": 2.203166226912929, + "grad_norm": 0.4607589542865753, + "learning_rate": 0.0002, + "loss": 1.0557, + "step": 1670 + }, + { + "epoch": 2.216358839050132, + "grad_norm": 0.4489196836948395, + "learning_rate": 0.0002, + "loss": 1.0206, + "step": 1680 + }, + { + "epoch": 2.229551451187335, + "grad_norm": 0.49300864338874817, + "learning_rate": 0.0002, + "loss": 0.9657, + "step": 1690 + }, + { + "epoch": 2.242744063324538, + "grad_norm": 0.6624954342842102, + "learning_rate": 0.0002, + "loss": 0.8371, + "step": 1700 + }, + { + "epoch": 2.2559366754617414, + "grad_norm": 0.8391500115394592, + "learning_rate": 0.0002, + "loss": 0.8555, + "step": 1710 + }, + { + "epoch": 2.2691292875989446, + "grad_norm": 0.5193073749542236, + "learning_rate": 0.0002, + "loss": 1.0113, + "step": 1720 + }, + { + "epoch": 2.282321899736148, + "grad_norm": 0.6180613040924072, + "learning_rate": 0.0002, + "loss": 0.9979, + "step": 1730 + }, + { + "epoch": 2.295514511873351, + "grad_norm": 0.591191291809082, + "learning_rate": 0.0002, + "loss": 0.9579, + "step": 1740 + }, + { + "epoch": 2.308707124010554, + "grad_norm": 0.546897828578949, + "learning_rate": 0.0002, + "loss": 0.8879, + "step": 1750 + }, + { + "epoch": 2.321899736147757, + "grad_norm": 0.5470401644706726, + "learning_rate": 0.0002, + "loss": 0.9321, + "step": 1760 + }, + { + "epoch": 2.3350923482849604, + "grad_norm": 0.4590282738208771, + "learning_rate": 0.0002, + "loss": 0.9104, + "step": 1770 + }, + { + "epoch": 2.3482849604221636, + "grad_norm": 0.622164785861969, + "learning_rate": 0.0002, + "loss": 1.0384, + "step": 1780 + }, + { + "epoch": 2.361477572559367, + "grad_norm": 0.5753812193870544, + "learning_rate": 0.0002, + "loss": 1.0297, + "step": 1790 + }, + { + "epoch": 2.37467018469657, + "grad_norm": 0.47958624362945557, + "learning_rate": 0.0002, + "loss": 0.9569, + "step": 1800 + }, + { + "epoch": 2.387862796833773, + "grad_norm": 0.48042672872543335, + "learning_rate": 0.0002, + "loss": 1.0519, + "step": 1810 + }, + { + "epoch": 2.401055408970976, + "grad_norm": 0.44586366415023804, + "learning_rate": 0.0002, + "loss": 0.9042, + "step": 1820 + }, + { + "epoch": 2.4142480211081794, + "grad_norm": 0.7239416837692261, + "learning_rate": 0.0002, + "loss": 0.9783, + "step": 1830 + }, + { + "epoch": 2.4274406332453826, + "grad_norm": 0.5515341758728027, + "learning_rate": 0.0002, + "loss": 0.8818, + "step": 1840 + }, + { + "epoch": 2.440633245382586, + "grad_norm": 0.6280064582824707, + "learning_rate": 0.0002, + "loss": 0.9503, + "step": 1850 + }, + { + "epoch": 2.453825857519789, + "grad_norm": 0.4832057058811188, + "learning_rate": 0.0002, + "loss": 0.8943, + "step": 1860 + }, + { + "epoch": 2.467018469656992, + "grad_norm": 0.5789321064949036, + "learning_rate": 0.0002, + "loss": 0.8744, + "step": 1870 + }, + { + "epoch": 2.480211081794195, + "grad_norm": 0.48491886258125305, + "learning_rate": 0.0002, + "loss": 0.9332, + "step": 1880 + }, + { + "epoch": 2.4934036939313984, + "grad_norm": 0.532365620136261, + "learning_rate": 0.0002, + "loss": 0.943, + "step": 1890 + }, + { + "epoch": 2.5065963060686016, + "grad_norm": 0.7087852954864502, + "learning_rate": 0.0002, + "loss": 0.9734, + "step": 1900 + }, + { + "epoch": 2.519788918205805, + "grad_norm": 0.48157402873039246, + "learning_rate": 0.0002, + "loss": 0.9767, + "step": 1910 + }, + { + "epoch": 2.532981530343008, + "grad_norm": 0.5886041522026062, + "learning_rate": 0.0002, + "loss": 0.9851, + "step": 1920 + }, + { + "epoch": 2.5461741424802113, + "grad_norm": 0.6332622766494751, + "learning_rate": 0.0002, + "loss": 1.0144, + "step": 1930 + }, + { + "epoch": 2.559366754617414, + "grad_norm": 0.5463117957115173, + "learning_rate": 0.0002, + "loss": 0.9516, + "step": 1940 + }, + { + "epoch": 2.5725593667546174, + "grad_norm": 0.5432228446006775, + "learning_rate": 0.0002, + "loss": 0.9373, + "step": 1950 + }, + { + "epoch": 2.5857519788918206, + "grad_norm": 0.5929186940193176, + "learning_rate": 0.0002, + "loss": 0.8974, + "step": 1960 + }, + { + "epoch": 2.598944591029024, + "grad_norm": 0.5120641589164734, + "learning_rate": 0.0002, + "loss": 1.0062, + "step": 1970 + }, + { + "epoch": 2.6121372031662267, + "grad_norm": 0.5372339487075806, + "learning_rate": 0.0002, + "loss": 0.9143, + "step": 1980 + }, + { + "epoch": 2.62532981530343, + "grad_norm": 0.5519838929176331, + "learning_rate": 0.0002, + "loss": 1.0786, + "step": 1990 + }, + { + "epoch": 2.638522427440633, + "grad_norm": 0.7304037809371948, + "learning_rate": 0.0002, + "loss": 1.021, + "step": 2000 + }, + { + "epoch": 2.6517150395778364, + "grad_norm": 0.6182340979576111, + "learning_rate": 0.0002, + "loss": 0.8708, + "step": 2010 + }, + { + "epoch": 2.6649076517150396, + "grad_norm": 0.4874444305896759, + "learning_rate": 0.0002, + "loss": 1.0501, + "step": 2020 + }, + { + "epoch": 2.678100263852243, + "grad_norm": 0.5850239396095276, + "learning_rate": 0.0002, + "loss": 0.9612, + "step": 2030 + }, + { + "epoch": 2.691292875989446, + "grad_norm": 0.6495311856269836, + "learning_rate": 0.0002, + "loss": 0.9808, + "step": 2040 + }, + { + "epoch": 2.7044854881266494, + "grad_norm": 1.002830147743225, + "learning_rate": 0.0002, + "loss": 1.0187, + "step": 2050 + }, + { + "epoch": 2.717678100263852, + "grad_norm": 0.49076753854751587, + "learning_rate": 0.0002, + "loss": 1.0386, + "step": 2060 + }, + { + "epoch": 2.7308707124010554, + "grad_norm": 0.4736326336860657, + "learning_rate": 0.0002, + "loss": 0.9416, + "step": 2070 + }, + { + "epoch": 2.7440633245382586, + "grad_norm": 0.5527601838111877, + "learning_rate": 0.0002, + "loss": 0.8424, + "step": 2080 + }, + { + "epoch": 2.757255936675462, + "grad_norm": 0.7295718193054199, + "learning_rate": 0.0002, + "loss": 0.9149, + "step": 2090 + }, + { + "epoch": 2.7704485488126647, + "grad_norm": 0.5437536835670471, + "learning_rate": 0.0002, + "loss": 0.9032, + "step": 2100 + }, + { + "epoch": 2.783641160949868, + "grad_norm": 0.5997128486633301, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 2110 + }, + { + "epoch": 2.796833773087071, + "grad_norm": 0.6498191356658936, + "learning_rate": 0.0002, + "loss": 0.976, + "step": 2120 + }, + { + "epoch": 2.8100263852242744, + "grad_norm": 0.5237268805503845, + "learning_rate": 0.0002, + "loss": 0.9543, + "step": 2130 + }, + { + "epoch": 2.8232189973614776, + "grad_norm": 0.6033027172088623, + "learning_rate": 0.0002, + "loss": 0.9302, + "step": 2140 + }, + { + "epoch": 2.836411609498681, + "grad_norm": 0.6077138781547546, + "learning_rate": 0.0002, + "loss": 0.9625, + "step": 2150 + }, + { + "epoch": 2.849604221635884, + "grad_norm": 0.4127797484397888, + "learning_rate": 0.0002, + "loss": 0.9347, + "step": 2160 + }, + { + "epoch": 2.862796833773087, + "grad_norm": 0.8448635339736938, + "learning_rate": 0.0002, + "loss": 1.0459, + "step": 2170 + }, + { + "epoch": 2.87598944591029, + "grad_norm": 0.5669729113578796, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 2180 + }, + { + "epoch": 2.8891820580474934, + "grad_norm": 0.510231077671051, + "learning_rate": 0.0002, + "loss": 0.8555, + "step": 2190 + }, + { + "epoch": 2.9023746701846966, + "grad_norm": 0.8072245121002197, + "learning_rate": 0.0002, + "loss": 0.9267, + "step": 2200 + }, + { + "epoch": 2.9155672823219, + "grad_norm": 0.6055923104286194, + "learning_rate": 0.0002, + "loss": 0.9685, + "step": 2210 + }, + { + "epoch": 2.9287598944591027, + "grad_norm": 0.7384416460990906, + "learning_rate": 0.0002, + "loss": 0.9157, + "step": 2220 + }, + { + "epoch": 2.941952506596306, + "grad_norm": 0.4922751784324646, + "learning_rate": 0.0002, + "loss": 0.9712, + "step": 2230 + }, + { + "epoch": 2.955145118733509, + "grad_norm": 0.6039906740188599, + "learning_rate": 0.0002, + "loss": 0.9487, + "step": 2240 + }, + { + "epoch": 2.9683377308707124, + "grad_norm": 0.4751701354980469, + "learning_rate": 0.0002, + "loss": 0.9776, + "step": 2250 + }, + { + "epoch": 2.9815303430079156, + "grad_norm": 0.5698353052139282, + "learning_rate": 0.0002, + "loss": 1.0619, + "step": 2260 + }, + { + "epoch": 2.994722955145119, + "grad_norm": 0.893563449382782, + "learning_rate": 0.0002, + "loss": 1.1184, + "step": 2270 + }, + { + "epoch": 3.0, + "eval_loss": 1.2046419382095337, + "eval_runtime": 71.5992, + "eval_samples_per_second": 6.02, + "eval_steps_per_second": 0.754, + "step": 2274 + }, + { + "epoch": 3.007915567282322, + "grad_norm": 0.41119325160980225, + "learning_rate": 0.0002, + "loss": 0.8269, + "step": 2280 + }, + { + "epoch": 3.021108179419525, + "grad_norm": 0.8169420957565308, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 2290 + }, + { + "epoch": 3.034300791556728, + "grad_norm": 0.6033818125724792, + "learning_rate": 0.0002, + "loss": 0.794, + "step": 2300 + }, + { + "epoch": 3.0474934036939314, + "grad_norm": 0.9600058197975159, + "learning_rate": 0.0002, + "loss": 0.7607, + "step": 2310 + }, + { + "epoch": 3.0606860158311346, + "grad_norm": 0.5859250426292419, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 2320 + }, + { + "epoch": 3.073878627968338, + "grad_norm": 0.6758618950843811, + "learning_rate": 0.0002, + "loss": 0.7598, + "step": 2330 + }, + { + "epoch": 3.0870712401055407, + "grad_norm": 0.8407140970230103, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 2340 + }, + { + "epoch": 3.100263852242744, + "grad_norm": 0.767779529094696, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 2350 + }, + { + "epoch": 3.113456464379947, + "grad_norm": 0.5572896599769592, + "learning_rate": 0.0002, + "loss": 0.7121, + "step": 2360 + }, + { + "epoch": 3.1266490765171504, + "grad_norm": 0.5908368825912476, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 2370 + }, + { + "epoch": 3.1398416886543536, + "grad_norm": 0.8047826290130615, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 2380 + }, + { + "epoch": 3.153034300791557, + "grad_norm": 0.8041718006134033, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 2390 + }, + { + "epoch": 3.16622691292876, + "grad_norm": 0.57078617811203, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 2400 + }, + { + "epoch": 3.179419525065963, + "grad_norm": 0.5125322937965393, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 2410 + }, + { + "epoch": 3.192612137203166, + "grad_norm": 0.6356934309005737, + "learning_rate": 0.0002, + "loss": 0.741, + "step": 2420 + }, + { + "epoch": 3.2058047493403694, + "grad_norm": 1.0129680633544922, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 2430 + }, + { + "epoch": 3.2189973614775726, + "grad_norm": 0.8104226589202881, + "learning_rate": 0.0002, + "loss": 0.8316, + "step": 2440 + }, + { + "epoch": 3.232189973614776, + "grad_norm": 0.7276079058647156, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 2450 + }, + { + "epoch": 3.2453825857519787, + "grad_norm": 0.9753884077072144, + "learning_rate": 0.0002, + "loss": 0.8183, + "step": 2460 + }, + { + "epoch": 3.258575197889182, + "grad_norm": 0.9753183722496033, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 2470 + }, + { + "epoch": 3.271767810026385, + "grad_norm": 0.6791225075721741, + "learning_rate": 0.0002, + "loss": 0.8815, + "step": 2480 + }, + { + "epoch": 3.2849604221635884, + "grad_norm": 0.6797150373458862, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 2490 + }, + { + "epoch": 3.2981530343007917, + "grad_norm": 0.8107194900512695, + "learning_rate": 0.0002, + "loss": 0.8395, + "step": 2500 + }, + { + "epoch": 3.311345646437995, + "grad_norm": 0.5878375172615051, + "learning_rate": 0.0002, + "loss": 0.7869, + "step": 2510 + }, + { + "epoch": 3.324538258575198, + "grad_norm": 0.5882975459098816, + "learning_rate": 0.0002, + "loss": 0.7992, + "step": 2520 + }, + { + "epoch": 3.337730870712401, + "grad_norm": 0.6180013418197632, + "learning_rate": 0.0002, + "loss": 0.7472, + "step": 2530 + }, + { + "epoch": 3.350923482849604, + "grad_norm": 1.0008151531219482, + "learning_rate": 0.0002, + "loss": 0.8033, + "step": 2540 + }, + { + "epoch": 3.3641160949868074, + "grad_norm": 0.6404656767845154, + "learning_rate": 0.0002, + "loss": 0.8464, + "step": 2550 + }, + { + "epoch": 3.3773087071240107, + "grad_norm": 0.8481354117393494, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 2560 + }, + { + "epoch": 3.390501319261214, + "grad_norm": 0.8068035244941711, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 2570 + }, + { + "epoch": 3.4036939313984167, + "grad_norm": 0.7477166056632996, + "learning_rate": 0.0002, + "loss": 0.8621, + "step": 2580 + }, + { + "epoch": 3.41688654353562, + "grad_norm": 0.6202635765075684, + "learning_rate": 0.0002, + "loss": 0.8352, + "step": 2590 + }, + { + "epoch": 3.430079155672823, + "grad_norm": 0.6981159448623657, + "learning_rate": 0.0002, + "loss": 0.7572, + "step": 2600 + }, + { + "epoch": 3.4432717678100264, + "grad_norm": 0.6611084342002869, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 2610 + }, + { + "epoch": 3.4564643799472297, + "grad_norm": 0.5727696418762207, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 2620 + }, + { + "epoch": 3.469656992084433, + "grad_norm": 1.2354545593261719, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 2630 + }, + { + "epoch": 3.4828496042216357, + "grad_norm": 0.6347638368606567, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 2640 + }, + { + "epoch": 3.496042216358839, + "grad_norm": 0.6975704431533813, + "learning_rate": 0.0002, + "loss": 0.8426, + "step": 2650 + }, + { + "epoch": 3.509234828496042, + "grad_norm": 0.6569573879241943, + "learning_rate": 0.0002, + "loss": 0.8773, + "step": 2660 + }, + { + "epoch": 3.5224274406332454, + "grad_norm": 0.6979609131813049, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 2670 + }, + { + "epoch": 3.5356200527704487, + "grad_norm": 0.6287988424301147, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 2680 + }, + { + "epoch": 3.5488126649076515, + "grad_norm": 0.8682637214660645, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 2690 + }, + { + "epoch": 3.5620052770448547, + "grad_norm": 0.7062831521034241, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 2700 + }, + { + "epoch": 3.575197889182058, + "grad_norm": 1.0061452388763428, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 2710 + }, + { + "epoch": 3.588390501319261, + "grad_norm": 0.719097375869751, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 2720 + }, + { + "epoch": 3.6015831134564644, + "grad_norm": 0.7583496570587158, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 2730 + }, + { + "epoch": 3.6147757255936677, + "grad_norm": 0.7543531060218811, + "learning_rate": 0.0002, + "loss": 0.91, + "step": 2740 + }, + { + "epoch": 3.627968337730871, + "grad_norm": 0.8873646855354309, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 2750 + }, + { + "epoch": 3.641160949868074, + "grad_norm": 1.0657562017440796, + "learning_rate": 0.0002, + "loss": 0.7116, + "step": 2760 + }, + { + "epoch": 3.654353562005277, + "grad_norm": 0.8641113638877869, + "learning_rate": 0.0002, + "loss": 0.8291, + "step": 2770 + }, + { + "epoch": 3.66754617414248, + "grad_norm": 0.6620645523071289, + "learning_rate": 0.0002, + "loss": 0.8302, + "step": 2780 + }, + { + "epoch": 3.6807387862796834, + "grad_norm": 0.6919541954994202, + "learning_rate": 0.0002, + "loss": 0.8261, + "step": 2790 + }, + { + "epoch": 3.6939313984168867, + "grad_norm": 0.7305743098258972, + "learning_rate": 0.0002, + "loss": 0.8388, + "step": 2800 + }, + { + "epoch": 3.7071240105540895, + "grad_norm": 0.7464777827262878, + "learning_rate": 0.0002, + "loss": 0.8053, + "step": 2810 + }, + { + "epoch": 3.7203166226912927, + "grad_norm": 0.8067063093185425, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 2820 + }, + { + "epoch": 3.733509234828496, + "grad_norm": 0.7789416313171387, + "learning_rate": 0.0002, + "loss": 0.8259, + "step": 2830 + }, + { + "epoch": 3.746701846965699, + "grad_norm": 0.507529079914093, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 2840 + }, + { + "epoch": 3.7598944591029024, + "grad_norm": 0.6509260535240173, + "learning_rate": 0.0002, + "loss": 0.832, + "step": 2850 + }, + { + "epoch": 3.7730870712401057, + "grad_norm": 0.9141367673873901, + "learning_rate": 0.0002, + "loss": 0.8257, + "step": 2860 + }, + { + "epoch": 3.786279683377309, + "grad_norm": 0.7852635979652405, + "learning_rate": 0.0002, + "loss": 0.9436, + "step": 2870 + }, + { + "epoch": 3.7994722955145117, + "grad_norm": 0.5340318083763123, + "learning_rate": 0.0002, + "loss": 0.8842, + "step": 2880 + }, + { + "epoch": 3.812664907651715, + "grad_norm": 0.6246042847633362, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 2890 + }, + { + "epoch": 3.825857519788918, + "grad_norm": 0.7064066529273987, + "learning_rate": 0.0002, + "loss": 0.8184, + "step": 2900 + }, + { + "epoch": 3.8390501319261214, + "grad_norm": 0.6144065856933594, + "learning_rate": 0.0002, + "loss": 0.8515, + "step": 2910 + }, + { + "epoch": 3.8522427440633247, + "grad_norm": 0.5268424153327942, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 2920 + }, + { + "epoch": 3.8654353562005275, + "grad_norm": 0.9508116841316223, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 2930 + }, + { + "epoch": 3.8786279683377307, + "grad_norm": 0.9133715629577637, + "learning_rate": 0.0002, + "loss": 0.8437, + "step": 2940 + }, + { + "epoch": 3.891820580474934, + "grad_norm": 1.0144646167755127, + "learning_rate": 0.0002, + "loss": 0.8611, + "step": 2950 + }, + { + "epoch": 3.905013192612137, + "grad_norm": 0.6397877931594849, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 2960 + }, + { + "epoch": 3.9182058047493404, + "grad_norm": 0.734835147857666, + "learning_rate": 0.0002, + "loss": 0.8285, + "step": 2970 + }, + { + "epoch": 3.9313984168865437, + "grad_norm": 0.784853994846344, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 2980 + }, + { + "epoch": 3.944591029023747, + "grad_norm": 0.805831789970398, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 2990 + }, + { + "epoch": 3.9577836411609497, + "grad_norm": 0.6299595236778259, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 3000 + }, + { + "epoch": 3.970976253298153, + "grad_norm": 0.6264058351516724, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 3010 + }, + { + "epoch": 3.984168865435356, + "grad_norm": 0.6419739723205566, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 3020 + }, + { + "epoch": 3.9973614775725594, + "grad_norm": 0.7737036943435669, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 3030 + }, + { + "epoch": 4.0, + "eval_loss": 1.2454297542572021, + "eval_runtime": 71.8558, + "eval_samples_per_second": 5.998, + "eval_steps_per_second": 0.752, + "step": 3032 + }, + { + "epoch": 4.010554089709762, + "grad_norm": 1.092727541923523, + "learning_rate": 0.0002, + "loss": 0.6716, + "step": 3040 + }, + { + "epoch": 4.0237467018469655, + "grad_norm": 0.8087759613990784, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 3050 + }, + { + "epoch": 4.036939313984169, + "grad_norm": 0.8106053471565247, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 3060 + }, + { + "epoch": 4.050131926121372, + "grad_norm": 0.8675326704978943, + "learning_rate": 0.0002, + "loss": 0.6846, + "step": 3070 + }, + { + "epoch": 4.063324538258575, + "grad_norm": 0.9620490074157715, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 3080 + }, + { + "epoch": 4.076517150395778, + "grad_norm": 0.8996296525001526, + "learning_rate": 0.0002, + "loss": 0.6047, + "step": 3090 + }, + { + "epoch": 4.089709762532982, + "grad_norm": 0.8648998737335205, + "learning_rate": 0.0002, + "loss": 0.6111, + "step": 3100 + }, + { + "epoch": 4.102902374670185, + "grad_norm": 1.0321335792541504, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 3110 + }, + { + "epoch": 4.116094986807388, + "grad_norm": 0.7949225306510925, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 3120 + }, + { + "epoch": 4.129287598944591, + "grad_norm": 0.9684646129608154, + "learning_rate": 0.0002, + "loss": 0.6354, + "step": 3130 + }, + { + "epoch": 4.142480211081795, + "grad_norm": 0.8698066473007202, + "learning_rate": 0.0002, + "loss": 0.6198, + "step": 3140 + }, + { + "epoch": 4.155672823218997, + "grad_norm": 0.7688450813293457, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 3150 + }, + { + "epoch": 4.1688654353562, + "grad_norm": 0.9682092070579529, + "learning_rate": 0.0002, + "loss": 0.6053, + "step": 3160 + }, + { + "epoch": 4.1820580474934035, + "grad_norm": 0.961561918258667, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 3170 + }, + { + "epoch": 4.195250659630607, + "grad_norm": 1.3962990045547485, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 3180 + }, + { + "epoch": 4.20844327176781, + "grad_norm": 0.9485045075416565, + "learning_rate": 0.0002, + "loss": 0.6319, + "step": 3190 + }, + { + "epoch": 4.221635883905013, + "grad_norm": 0.7768281698226929, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 3200 + }, + { + "epoch": 4.2348284960422165, + "grad_norm": 1.2685691118240356, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 3210 + }, + { + "epoch": 4.24802110817942, + "grad_norm": 0.6876471638679504, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 3220 + }, + { + "epoch": 4.261213720316623, + "grad_norm": 1.0074554681777954, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 3230 + }, + { + "epoch": 4.274406332453826, + "grad_norm": 0.8094777464866638, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 3240 + }, + { + "epoch": 4.287598944591029, + "grad_norm": 0.7906569242477417, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 3250 + }, + { + "epoch": 4.300791556728232, + "grad_norm": 0.840238630771637, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 3260 + }, + { + "epoch": 4.313984168865435, + "grad_norm": 1.0119295120239258, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 3270 + }, + { + "epoch": 4.327176781002638, + "grad_norm": 0.7943191528320312, + "learning_rate": 0.0002, + "loss": 0.5912, + "step": 3280 + }, + { + "epoch": 4.3403693931398415, + "grad_norm": 0.7691723704338074, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 3290 + }, + { + "epoch": 4.353562005277045, + "grad_norm": 0.7227770686149597, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 3300 + }, + { + "epoch": 4.366754617414248, + "grad_norm": 0.8512253165245056, + "learning_rate": 0.0002, + "loss": 0.6047, + "step": 3310 + }, + { + "epoch": 4.379947229551451, + "grad_norm": 0.7852529287338257, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 3320 + }, + { + "epoch": 4.3931398416886545, + "grad_norm": 0.8888797163963318, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 3330 + }, + { + "epoch": 4.406332453825858, + "grad_norm": 0.9522430896759033, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 3340 + }, + { + "epoch": 4.419525065963061, + "grad_norm": 0.900276780128479, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 3350 + }, + { + "epoch": 4.432717678100264, + "grad_norm": 1.181547999382019, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 3360 + }, + { + "epoch": 4.445910290237467, + "grad_norm": 0.903142511844635, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 3370 + }, + { + "epoch": 4.45910290237467, + "grad_norm": 0.8747565150260925, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 3380 + }, + { + "epoch": 4.472295514511873, + "grad_norm": 0.7838051319122314, + "learning_rate": 0.0002, + "loss": 0.7071, + "step": 3390 + }, + { + "epoch": 4.485488126649076, + "grad_norm": 0.8691313862800598, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 3400 + }, + { + "epoch": 4.4986807387862795, + "grad_norm": 0.8493868708610535, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 3410 + }, + { + "epoch": 4.511873350923483, + "grad_norm": 1.0104830265045166, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 3420 + }, + { + "epoch": 4.525065963060686, + "grad_norm": 1.1716967821121216, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 3430 + }, + { + "epoch": 4.538258575197889, + "grad_norm": 0.9122593998908997, + "learning_rate": 0.0002, + "loss": 0.6411, + "step": 3440 + }, + { + "epoch": 4.5514511873350925, + "grad_norm": 0.829090416431427, + "learning_rate": 0.0002, + "loss": 0.7047, + "step": 3450 + }, + { + "epoch": 4.564643799472296, + "grad_norm": 1.141662836074829, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 3460 + }, + { + "epoch": 4.577836411609499, + "grad_norm": 0.8423182368278503, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 3470 + }, + { + "epoch": 4.591029023746702, + "grad_norm": 0.8024184703826904, + "learning_rate": 0.0002, + "loss": 0.6797, + "step": 3480 + }, + { + "epoch": 4.6042216358839045, + "grad_norm": 0.7703381776809692, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 3490 + }, + { + "epoch": 4.617414248021108, + "grad_norm": 0.9883959293365479, + "learning_rate": 0.0002, + "loss": 0.7001, + "step": 3500 + }, + { + "epoch": 4.630606860158311, + "grad_norm": 0.9554709196090698, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 3510 + }, + { + "epoch": 4.643799472295514, + "grad_norm": 1.9949709177017212, + "learning_rate": 0.0002, + "loss": 0.7378, + "step": 3520 + }, + { + "epoch": 4.6569920844327175, + "grad_norm": 0.7762255072593689, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 3530 + }, + { + "epoch": 4.670184696569921, + "grad_norm": 0.9538425803184509, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 3540 + }, + { + "epoch": 4.683377308707124, + "grad_norm": 1.0279661417007446, + "learning_rate": 0.0002, + "loss": 0.6352, + "step": 3550 + }, + { + "epoch": 4.696569920844327, + "grad_norm": 0.7545472979545593, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 3560 + }, + { + "epoch": 4.7097625329815305, + "grad_norm": 0.8919376730918884, + "learning_rate": 0.0002, + "loss": 0.6887, + "step": 3570 + }, + { + "epoch": 4.722955145118734, + "grad_norm": 0.7621569633483887, + "learning_rate": 0.0002, + "loss": 0.6395, + "step": 3580 + }, + { + "epoch": 4.736147757255937, + "grad_norm": 1.205320119857788, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 3590 + }, + { + "epoch": 4.74934036939314, + "grad_norm": 1.0642725229263306, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 3600 + }, + { + "epoch": 4.762532981530343, + "grad_norm": 0.9402666687965393, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 3610 + }, + { + "epoch": 4.775725593667546, + "grad_norm": 1.254127025604248, + "learning_rate": 0.0002, + "loss": 0.6395, + "step": 3620 + }, + { + "epoch": 4.788918205804749, + "grad_norm": 0.7609598636627197, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 3630 + }, + { + "epoch": 4.802110817941952, + "grad_norm": 0.8240329623222351, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 3640 + }, + { + "epoch": 4.8153034300791555, + "grad_norm": 0.8356260657310486, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 3650 + }, + { + "epoch": 4.828496042216359, + "grad_norm": 0.9130708575248718, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 3660 + }, + { + "epoch": 4.841688654353562, + "grad_norm": 0.9384765028953552, + "learning_rate": 0.0002, + "loss": 0.7269, + "step": 3670 + }, + { + "epoch": 4.854881266490765, + "grad_norm": 0.9829966425895691, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 3680 + }, + { + "epoch": 4.8680738786279685, + "grad_norm": 1.0488632917404175, + "learning_rate": 0.0002, + "loss": 0.6311, + "step": 3690 + }, + { + "epoch": 4.881266490765172, + "grad_norm": 1.2278969287872314, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 3700 + }, + { + "epoch": 4.894459102902375, + "grad_norm": 0.8078970313072205, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 3710 + }, + { + "epoch": 4.907651715039578, + "grad_norm": 0.8081700205802917, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 3720 + }, + { + "epoch": 4.9208443271767806, + "grad_norm": 0.9204511046409607, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 3730 + }, + { + "epoch": 4.934036939313984, + "grad_norm": 0.9326391220092773, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 3740 + }, + { + "epoch": 4.947229551451187, + "grad_norm": 1.0089969635009766, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 3750 + }, + { + "epoch": 4.96042216358839, + "grad_norm": 0.7063466906547546, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 3760 + }, + { + "epoch": 4.9736147757255935, + "grad_norm": 1.2603905200958252, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 3770 + }, + { + "epoch": 4.986807387862797, + "grad_norm": 0.8418653607368469, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 3780 + }, + { + "epoch": 5.0, + "grad_norm": 0.9537181854248047, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 3790 + }, + { + "epoch": 5.0, + "eval_loss": 1.3319307565689087, + "eval_runtime": 71.7836, + "eval_samples_per_second": 6.004, + "eval_steps_per_second": 0.752, + "step": 3790 + }, + { + "epoch": 5.013192612137203, + "grad_norm": 0.8595899343490601, + "learning_rate": 0.0002, + "loss": 0.489, + "step": 3800 + }, + { + "epoch": 5.0263852242744065, + "grad_norm": 1.0023565292358398, + "learning_rate": 0.0002, + "loss": 0.5155, + "step": 3810 + }, + { + "epoch": 5.03957783641161, + "grad_norm": 1.2770460844039917, + "learning_rate": 0.0002, + "loss": 0.5321, + "step": 3820 + }, + { + "epoch": 5.052770448548813, + "grad_norm": 1.1701956987380981, + "learning_rate": 0.0002, + "loss": 0.5127, + "step": 3830 + }, + { + "epoch": 5.065963060686016, + "grad_norm": 0.812269926071167, + "learning_rate": 0.0002, + "loss": 0.5057, + "step": 3840 + }, + { + "epoch": 5.0791556728232194, + "grad_norm": 0.8186697363853455, + "learning_rate": 0.0002, + "loss": 0.4292, + "step": 3850 + }, + { + "epoch": 5.092348284960422, + "grad_norm": 1.052565097808838, + "learning_rate": 0.0002, + "loss": 0.4865, + "step": 3860 + }, + { + "epoch": 5.105540897097625, + "grad_norm": 0.9764705300331116, + "learning_rate": 0.0002, + "loss": 0.4947, + "step": 3870 + }, + { + "epoch": 5.118733509234828, + "grad_norm": 0.6973426938056946, + "learning_rate": 0.0002, + "loss": 0.471, + "step": 3880 + }, + { + "epoch": 5.1319261213720315, + "grad_norm": 1.2127928733825684, + "learning_rate": 0.0002, + "loss": 0.5565, + "step": 3890 + }, + { + "epoch": 5.145118733509235, + "grad_norm": 0.682807981967926, + "learning_rate": 0.0002, + "loss": 0.4122, + "step": 3900 + }, + { + "epoch": 5.158311345646438, + "grad_norm": 1.3575998544692993, + "learning_rate": 0.0002, + "loss": 0.6378, + "step": 3910 + }, + { + "epoch": 5.171503957783641, + "grad_norm": 1.2581931352615356, + "learning_rate": 0.0002, + "loss": 0.4624, + "step": 3920 + }, + { + "epoch": 5.1846965699208445, + "grad_norm": 1.0493637323379517, + "learning_rate": 0.0002, + "loss": 0.5092, + "step": 3930 + }, + { + "epoch": 5.197889182058048, + "grad_norm": 1.3519670963287354, + "learning_rate": 0.0002, + "loss": 0.4563, + "step": 3940 + }, + { + "epoch": 5.211081794195251, + "grad_norm": 1.0690566301345825, + "learning_rate": 0.0002, + "loss": 0.5414, + "step": 3950 + }, + { + "epoch": 5.224274406332454, + "grad_norm": 1.1171330213546753, + "learning_rate": 0.0002, + "loss": 0.5038, + "step": 3960 + }, + { + "epoch": 5.237467018469657, + "grad_norm": 1.055851697921753, + "learning_rate": 0.0002, + "loss": 0.4397, + "step": 3970 + }, + { + "epoch": 5.25065963060686, + "grad_norm": 0.8870180249214172, + "learning_rate": 0.0002, + "loss": 0.4964, + "step": 3980 + }, + { + "epoch": 5.263852242744063, + "grad_norm": 0.9688402414321899, + "learning_rate": 0.0002, + "loss": 0.5353, + "step": 3990 + }, + { + "epoch": 5.277044854881266, + "grad_norm": 0.8458422422409058, + "learning_rate": 0.0002, + "loss": 0.5192, + "step": 4000 + }, + { + "epoch": 5.2902374670184695, + "grad_norm": 0.908256471157074, + "learning_rate": 0.0002, + "loss": 0.5458, + "step": 4010 + }, + { + "epoch": 5.303430079155673, + "grad_norm": 1.0058149099349976, + "learning_rate": 0.0002, + "loss": 0.5102, + "step": 4020 + }, + { + "epoch": 5.316622691292876, + "grad_norm": 1.20364511013031, + "learning_rate": 0.0002, + "loss": 0.5322, + "step": 4030 + }, + { + "epoch": 5.329815303430079, + "grad_norm": 1.0135732889175415, + "learning_rate": 0.0002, + "loss": 0.5715, + "step": 4040 + }, + { + "epoch": 5.3430079155672825, + "grad_norm": 1.1094907522201538, + "learning_rate": 0.0002, + "loss": 0.4736, + "step": 4050 + }, + { + "epoch": 5.356200527704486, + "grad_norm": 1.0373083353042603, + "learning_rate": 0.0002, + "loss": 0.4912, + "step": 4060 + }, + { + "epoch": 5.369393139841689, + "grad_norm": 1.0952966213226318, + "learning_rate": 0.0002, + "loss": 0.5258, + "step": 4070 + }, + { + "epoch": 5.382585751978892, + "grad_norm": 1.1734952926635742, + "learning_rate": 0.0002, + "loss": 0.4892, + "step": 4080 + }, + { + "epoch": 5.395778364116095, + "grad_norm": 0.8217245936393738, + "learning_rate": 0.0002, + "loss": 0.4463, + "step": 4090 + }, + { + "epoch": 5.408970976253298, + "grad_norm": 1.0936307907104492, + "learning_rate": 0.0002, + "loss": 0.5271, + "step": 4100 + }, + { + "epoch": 5.422163588390501, + "grad_norm": 1.0198720693588257, + "learning_rate": 0.0002, + "loss": 0.509, + "step": 4110 + }, + { + "epoch": 5.435356200527704, + "grad_norm": 1.1105809211730957, + "learning_rate": 0.0002, + "loss": 0.5265, + "step": 4120 + }, + { + "epoch": 5.4485488126649075, + "grad_norm": 1.1817213296890259, + "learning_rate": 0.0002, + "loss": 0.4871, + "step": 4130 + }, + { + "epoch": 5.461741424802111, + "grad_norm": 1.126339077949524, + "learning_rate": 0.0002, + "loss": 0.4987, + "step": 4140 + }, + { + "epoch": 5.474934036939314, + "grad_norm": 0.9467914700508118, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 4150 + }, + { + "epoch": 5.488126649076517, + "grad_norm": 1.0335774421691895, + "learning_rate": 0.0002, + "loss": 0.5386, + "step": 4160 + }, + { + "epoch": 5.5013192612137205, + "grad_norm": 0.866211473941803, + "learning_rate": 0.0002, + "loss": 0.5122, + "step": 4170 + }, + { + "epoch": 5.514511873350924, + "grad_norm": 0.7422948479652405, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 4180 + }, + { + "epoch": 5.527704485488127, + "grad_norm": 1.2211135625839233, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 4190 + }, + { + "epoch": 5.540897097625329, + "grad_norm": 1.0371766090393066, + "learning_rate": 0.0002, + "loss": 0.5476, + "step": 4200 + }, + { + "epoch": 5.554089709762533, + "grad_norm": 0.9460630416870117, + "learning_rate": 0.0002, + "loss": 0.5941, + "step": 4210 + }, + { + "epoch": 5.567282321899736, + "grad_norm": 0.7972197532653809, + "learning_rate": 0.0002, + "loss": 0.4645, + "step": 4220 + }, + { + "epoch": 5.580474934036939, + "grad_norm": 1.0654675960540771, + "learning_rate": 0.0002, + "loss": 0.5087, + "step": 4230 + }, + { + "epoch": 5.593667546174142, + "grad_norm": 1.0776735544204712, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 4240 + }, + { + "epoch": 5.6068601583113455, + "grad_norm": 1.498723030090332, + "learning_rate": 0.0002, + "loss": 0.53, + "step": 4250 + }, + { + "epoch": 5.620052770448549, + "grad_norm": 1.006768822669983, + "learning_rate": 0.0002, + "loss": 0.4788, + "step": 4260 + }, + { + "epoch": 5.633245382585752, + "grad_norm": 0.9194242358207703, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 4270 + }, + { + "epoch": 5.646437994722955, + "grad_norm": 1.1028380393981934, + "learning_rate": 0.0002, + "loss": 0.5722, + "step": 4280 + }, + { + "epoch": 5.6596306068601585, + "grad_norm": 0.9972755312919617, + "learning_rate": 0.0002, + "loss": 0.5319, + "step": 4290 + }, + { + "epoch": 5.672823218997362, + "grad_norm": 1.0509438514709473, + "learning_rate": 0.0002, + "loss": 0.53, + "step": 4300 + }, + { + "epoch": 5.686015831134565, + "grad_norm": 1.064039945602417, + "learning_rate": 0.0002, + "loss": 0.4738, + "step": 4310 + }, + { + "epoch": 5.699208443271768, + "grad_norm": 0.9572229981422424, + "learning_rate": 0.0002, + "loss": 0.5401, + "step": 4320 + }, + { + "epoch": 5.7124010554089715, + "grad_norm": 0.9956564903259277, + "learning_rate": 0.0002, + "loss": 0.5173, + "step": 4330 + }, + { + "epoch": 5.725593667546174, + "grad_norm": 1.01974618434906, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 4340 + }, + { + "epoch": 5.738786279683377, + "grad_norm": 1.101328730583191, + "learning_rate": 0.0002, + "loss": 0.5111, + "step": 4350 + }, + { + "epoch": 5.75197889182058, + "grad_norm": 0.9971756935119629, + "learning_rate": 0.0002, + "loss": 0.5921, + "step": 4360 + }, + { + "epoch": 5.7651715039577835, + "grad_norm": 0.8579474687576294, + "learning_rate": 0.0002, + "loss": 0.5262, + "step": 4370 + }, + { + "epoch": 5.778364116094987, + "grad_norm": 0.9927367568016052, + "learning_rate": 0.0002, + "loss": 0.5106, + "step": 4380 + }, + { + "epoch": 5.79155672823219, + "grad_norm": 1.1183884143829346, + "learning_rate": 0.0002, + "loss": 0.5354, + "step": 4390 + }, + { + "epoch": 5.804749340369393, + "grad_norm": 0.7695905566215515, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 4400 + }, + { + "epoch": 5.8179419525065965, + "grad_norm": 1.1102122068405151, + "learning_rate": 0.0002, + "loss": 0.5137, + "step": 4410 + }, + { + "epoch": 5.8311345646438, + "grad_norm": 1.3201336860656738, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 4420 + }, + { + "epoch": 5.844327176781003, + "grad_norm": 1.1934558153152466, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 4430 + }, + { + "epoch": 5.857519788918205, + "grad_norm": 1.390870451927185, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 4440 + }, + { + "epoch": 5.870712401055409, + "grad_norm": 1.056314468383789, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 4450 + }, + { + "epoch": 5.883905013192612, + "grad_norm": 0.9797437191009521, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 4460 + }, + { + "epoch": 5.897097625329815, + "grad_norm": 1.2368146181106567, + "learning_rate": 0.0002, + "loss": 0.5479, + "step": 4470 + }, + { + "epoch": 5.910290237467018, + "grad_norm": 0.9062654376029968, + "learning_rate": 0.0002, + "loss": 0.5453, + "step": 4480 + }, + { + "epoch": 5.923482849604222, + "grad_norm": 1.8643536567687988, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 4490 + }, + { + "epoch": 5.936675461741425, + "grad_norm": 1.2977997064590454, + "learning_rate": 0.0002, + "loss": 0.5858, + "step": 4500 + }, + { + "epoch": 5.949868073878628, + "grad_norm": 0.8366201519966125, + "learning_rate": 0.0002, + "loss": 0.4815, + "step": 4510 + }, + { + "epoch": 5.963060686015831, + "grad_norm": 1.0210131406784058, + "learning_rate": 0.0002, + "loss": 0.5126, + "step": 4520 + }, + { + "epoch": 5.9762532981530345, + "grad_norm": 1.1287827491760254, + "learning_rate": 0.0002, + "loss": 0.5577, + "step": 4530 + }, + { + "epoch": 5.989445910290238, + "grad_norm": 1.0480493307113647, + "learning_rate": 0.0002, + "loss": 0.5053, + "step": 4540 + }, + { + "epoch": 6.0, + "eval_loss": 1.450880765914917, + "eval_runtime": 71.8135, + "eval_samples_per_second": 6.002, + "eval_steps_per_second": 0.752, + "step": 4548 + }, + { + "epoch": 6.002638522427441, + "grad_norm": 0.8589069247245789, + "learning_rate": 0.0002, + "loss": 0.5072, + "step": 4550 + }, + { + "epoch": 6.015831134564644, + "grad_norm": 1.467134714126587, + "learning_rate": 0.0002, + "loss": 0.4129, + "step": 4560 + }, + { + "epoch": 6.029023746701847, + "grad_norm": 1.1477625370025635, + "learning_rate": 0.0002, + "loss": 0.3739, + "step": 4570 + }, + { + "epoch": 6.04221635883905, + "grad_norm": 1.4254094362258911, + "learning_rate": 0.0002, + "loss": 0.3958, + "step": 4580 + }, + { + "epoch": 6.055408970976253, + "grad_norm": 1.3656290769577026, + "learning_rate": 0.0002, + "loss": 0.356, + "step": 4590 + }, + { + "epoch": 6.068601583113456, + "grad_norm": 0.9638674855232239, + "learning_rate": 0.0002, + "loss": 0.3626, + "step": 4600 + }, + { + "epoch": 6.08179419525066, + "grad_norm": 1.2654615640640259, + "learning_rate": 0.0002, + "loss": 0.3884, + "step": 4610 + }, + { + "epoch": 6.094986807387863, + "grad_norm": 1.4506969451904297, + "learning_rate": 0.0002, + "loss": 0.4659, + "step": 4620 + }, + { + "epoch": 6.108179419525066, + "grad_norm": 1.6596732139587402, + "learning_rate": 0.0002, + "loss": 0.3096, + "step": 4630 + }, + { + "epoch": 6.121372031662269, + "grad_norm": 1.5335280895233154, + "learning_rate": 0.0002, + "loss": 0.4005, + "step": 4640 + }, + { + "epoch": 6.1345646437994725, + "grad_norm": 1.0815565586090088, + "learning_rate": 0.0002, + "loss": 0.3999, + "step": 4650 + }, + { + "epoch": 6.147757255936676, + "grad_norm": 0.9995638132095337, + "learning_rate": 0.0002, + "loss": 0.4026, + "step": 4660 + }, + { + "epoch": 6.160949868073879, + "grad_norm": 0.8809106349945068, + "learning_rate": 0.0002, + "loss": 0.3548, + "step": 4670 + }, + { + "epoch": 6.174142480211081, + "grad_norm": 1.2946726083755493, + "learning_rate": 0.0002, + "loss": 0.4505, + "step": 4680 + }, + { + "epoch": 6.187335092348285, + "grad_norm": 1.311298131942749, + "learning_rate": 0.0002, + "loss": 0.4447, + "step": 4690 + }, + { + "epoch": 6.200527704485488, + "grad_norm": 1.229204535484314, + "learning_rate": 0.0002, + "loss": 0.4108, + "step": 4700 + }, + { + "epoch": 6.213720316622691, + "grad_norm": 1.0193822383880615, + "learning_rate": 0.0002, + "loss": 0.3764, + "step": 4710 + }, + { + "epoch": 6.226912928759894, + "grad_norm": 1.4438618421554565, + "learning_rate": 0.0002, + "loss": 0.3696, + "step": 4720 + }, + { + "epoch": 6.240105540897098, + "grad_norm": 1.4315637350082397, + "learning_rate": 0.0002, + "loss": 0.3979, + "step": 4730 + }, + { + "epoch": 6.253298153034301, + "grad_norm": 1.1291239261627197, + "learning_rate": 0.0002, + "loss": 0.4124, + "step": 4740 + }, + { + "epoch": 6.266490765171504, + "grad_norm": 0.9358022809028625, + "learning_rate": 0.0002, + "loss": 0.4337, + "step": 4750 + }, + { + "epoch": 6.279683377308707, + "grad_norm": 1.1260714530944824, + "learning_rate": 0.0002, + "loss": 0.3758, + "step": 4760 + }, + { + "epoch": 6.2928759894459105, + "grad_norm": 1.5400320291519165, + "learning_rate": 0.0002, + "loss": 0.4262, + "step": 4770 + }, + { + "epoch": 6.306068601583114, + "grad_norm": 1.6820714473724365, + "learning_rate": 0.0002, + "loss": 0.4105, + "step": 4780 + }, + { + "epoch": 6.319261213720317, + "grad_norm": 1.1937718391418457, + "learning_rate": 0.0002, + "loss": 0.4192, + "step": 4790 + }, + { + "epoch": 6.33245382585752, + "grad_norm": 1.4330145120620728, + "learning_rate": 0.0002, + "loss": 0.4519, + "step": 4800 + }, + { + "epoch": 6.345646437994723, + "grad_norm": 1.083373785018921, + "learning_rate": 0.0002, + "loss": 0.4173, + "step": 4810 + }, + { + "epoch": 6.358839050131926, + "grad_norm": 1.3013869524002075, + "learning_rate": 0.0002, + "loss": 0.4054, + "step": 4820 + }, + { + "epoch": 6.372031662269129, + "grad_norm": 1.1075547933578491, + "learning_rate": 0.0002, + "loss": 0.4177, + "step": 4830 + }, + { + "epoch": 6.385224274406332, + "grad_norm": 1.0480214357376099, + "learning_rate": 0.0002, + "loss": 0.3846, + "step": 4840 + }, + { + "epoch": 6.398416886543536, + "grad_norm": 1.3625658750534058, + "learning_rate": 0.0002, + "loss": 0.3924, + "step": 4850 + }, + { + "epoch": 6.411609498680739, + "grad_norm": 1.16606605052948, + "learning_rate": 0.0002, + "loss": 0.3964, + "step": 4860 + }, + { + "epoch": 6.424802110817942, + "grad_norm": 1.2435568571090698, + "learning_rate": 0.0002, + "loss": 0.4845, + "step": 4870 + }, + { + "epoch": 6.437994722955145, + "grad_norm": 1.4471954107284546, + "learning_rate": 0.0002, + "loss": 0.3847, + "step": 4880 + }, + { + "epoch": 6.4511873350923485, + "grad_norm": 1.2302275896072388, + "learning_rate": 0.0002, + "loss": 0.443, + "step": 4890 + }, + { + "epoch": 6.464379947229552, + "grad_norm": 1.2392226457595825, + "learning_rate": 0.0002, + "loss": 0.4458, + "step": 4900 + }, + { + "epoch": 6.477572559366755, + "grad_norm": 1.0497277975082397, + "learning_rate": 0.0002, + "loss": 0.4114, + "step": 4910 + }, + { + "epoch": 6.490765171503957, + "grad_norm": 1.3509557247161865, + "learning_rate": 0.0002, + "loss": 0.426, + "step": 4920 + }, + { + "epoch": 6.503957783641161, + "grad_norm": 1.340214729309082, + "learning_rate": 0.0002, + "loss": 0.4089, + "step": 4930 + }, + { + "epoch": 6.517150395778364, + "grad_norm": 1.283220648765564, + "learning_rate": 0.0002, + "loss": 0.4655, + "step": 4940 + }, + { + "epoch": 6.530343007915567, + "grad_norm": 1.0693278312683105, + "learning_rate": 0.0002, + "loss": 0.4205, + "step": 4950 + }, + { + "epoch": 6.54353562005277, + "grad_norm": 1.307997226715088, + "learning_rate": 0.0002, + "loss": 0.398, + "step": 4960 + }, + { + "epoch": 6.556728232189974, + "grad_norm": 1.1739027500152588, + "learning_rate": 0.0002, + "loss": 0.3844, + "step": 4970 + }, + { + "epoch": 6.569920844327177, + "grad_norm": 1.5694327354431152, + "learning_rate": 0.0002, + "loss": 0.4494, + "step": 4980 + }, + { + "epoch": 6.58311345646438, + "grad_norm": 0.9978346824645996, + "learning_rate": 0.0002, + "loss": 0.4535, + "step": 4990 + }, + { + "epoch": 6.596306068601583, + "grad_norm": 1.183057427406311, + "learning_rate": 0.0002, + "loss": 0.4755, + "step": 5000 + }, + { + "epoch": 6.6094986807387865, + "grad_norm": 1.1033718585968018, + "learning_rate": 0.0002, + "loss": 0.4688, + "step": 5010 + }, + { + "epoch": 6.62269129287599, + "grad_norm": 1.0699188709259033, + "learning_rate": 0.0002, + "loss": 0.4233, + "step": 5020 + }, + { + "epoch": 6.635883905013193, + "grad_norm": 1.491031289100647, + "learning_rate": 0.0002, + "loss": 0.4049, + "step": 5030 + }, + { + "epoch": 6.649076517150396, + "grad_norm": 0.7939618825912476, + "learning_rate": 0.0002, + "loss": 0.4257, + "step": 5040 + }, + { + "epoch": 6.662269129287599, + "grad_norm": 1.2883116006851196, + "learning_rate": 0.0002, + "loss": 0.4273, + "step": 5050 + }, + { + "epoch": 6.675461741424802, + "grad_norm": 1.3844388723373413, + "learning_rate": 0.0002, + "loss": 0.4376, + "step": 5060 + }, + { + "epoch": 6.688654353562005, + "grad_norm": 1.1823489665985107, + "learning_rate": 0.0002, + "loss": 0.4078, + "step": 5070 + }, + { + "epoch": 6.701846965699208, + "grad_norm": 1.310214638710022, + "learning_rate": 0.0002, + "loss": 0.4811, + "step": 5080 + }, + { + "epoch": 6.715039577836412, + "grad_norm": 1.6253955364227295, + "learning_rate": 0.0002, + "loss": 0.4675, + "step": 5090 + }, + { + "epoch": 6.728232189973615, + "grad_norm": 1.3344792127609253, + "learning_rate": 0.0002, + "loss": 0.4749, + "step": 5100 + }, + { + "epoch": 6.741424802110818, + "grad_norm": 1.3900614976882935, + "learning_rate": 0.0002, + "loss": 0.4051, + "step": 5110 + }, + { + "epoch": 6.754617414248021, + "grad_norm": 1.5122374296188354, + "learning_rate": 0.0002, + "loss": 0.3782, + "step": 5120 + }, + { + "epoch": 6.7678100263852246, + "grad_norm": 1.4738229513168335, + "learning_rate": 0.0002, + "loss": 0.4439, + "step": 5130 + }, + { + "epoch": 6.781002638522428, + "grad_norm": 1.0417664051055908, + "learning_rate": 0.0002, + "loss": 0.4237, + "step": 5140 + }, + { + "epoch": 6.79419525065963, + "grad_norm": 1.1339401006698608, + "learning_rate": 0.0002, + "loss": 0.486, + "step": 5150 + }, + { + "epoch": 6.807387862796833, + "grad_norm": 1.4377150535583496, + "learning_rate": 0.0002, + "loss": 0.4387, + "step": 5160 + }, + { + "epoch": 6.820580474934037, + "grad_norm": 1.3321975469589233, + "learning_rate": 0.0002, + "loss": 0.4375, + "step": 5170 + }, + { + "epoch": 6.83377308707124, + "grad_norm": 1.3799545764923096, + "learning_rate": 0.0002, + "loss": 0.4369, + "step": 5180 + }, + { + "epoch": 6.846965699208443, + "grad_norm": 0.864224374294281, + "learning_rate": 0.0002, + "loss": 0.4266, + "step": 5190 + }, + { + "epoch": 6.860158311345646, + "grad_norm": 1.0666139125823975, + "learning_rate": 0.0002, + "loss": 0.4455, + "step": 5200 + }, + { + "epoch": 6.87335092348285, + "grad_norm": 1.2926141023635864, + "learning_rate": 0.0002, + "loss": 0.4545, + "step": 5210 + }, + { + "epoch": 6.886543535620053, + "grad_norm": 1.2046207189559937, + "learning_rate": 0.0002, + "loss": 0.4441, + "step": 5220 + }, + { + "epoch": 6.899736147757256, + "grad_norm": 1.3961530923843384, + "learning_rate": 0.0002, + "loss": 0.4458, + "step": 5230 + }, + { + "epoch": 6.912928759894459, + "grad_norm": 1.1340336799621582, + "learning_rate": 0.0002, + "loss": 0.4343, + "step": 5240 + }, + { + "epoch": 6.926121372031663, + "grad_norm": 1.1756815910339355, + "learning_rate": 0.0002, + "loss": 0.4491, + "step": 5250 + }, + { + "epoch": 6.939313984168866, + "grad_norm": 1.146964192390442, + "learning_rate": 0.0002, + "loss": 0.4077, + "step": 5260 + }, + { + "epoch": 6.952506596306069, + "grad_norm": 1.2974623441696167, + "learning_rate": 0.0002, + "loss": 0.4232, + "step": 5270 + }, + { + "epoch": 6.965699208443271, + "grad_norm": 1.342126727104187, + "learning_rate": 0.0002, + "loss": 0.4126, + "step": 5280 + }, + { + "epoch": 6.978891820580475, + "grad_norm": 1.2475614547729492, + "learning_rate": 0.0002, + "loss": 0.4537, + "step": 5290 + }, + { + "epoch": 6.992084432717678, + "grad_norm": 1.254935622215271, + "learning_rate": 0.0002, + "loss": 0.456, + "step": 5300 + }, + { + "epoch": 7.0, + "eval_loss": 1.5579944849014282, + "eval_runtime": 71.7131, + "eval_samples_per_second": 6.01, + "eval_steps_per_second": 0.753, + "step": 5306 + } + ], + "logging_steps": 10, + "max_steps": 6064, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.4554979938480947e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..0c0289389bcb2236bf804b0a82c50e67c11a1379 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-5306/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e04722e246164eebe6b4f70dd50fe946c936a2bd25edd75bcedcfd4c7ff9474e +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/README.md b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..672c8381911599cc53896a97a150fb1b26fe7240 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36ddbc72ccf15fc0e505c9297ea1d183d8ad0119ada897f511433f8f247cf626 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..23542db155e94cbc223af8dbec981aba2735f918 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:567d45079f2f1db9581139595be0b8ac595a87572a0412a56385825f4fc776ec +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f76faf9968648405a42ff88717b5bc04cdef82ef --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5578c2e197530fb47ddcdeba5ec95f6ffdb60b5806597bada019f37792813ae8 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..752e89f11d9e61bb7e1d340c6802f57275b5c6a5 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d9dc04f79bb371d7deefbb5d31d363d98b4768442c1ba0f1b683c80138e4b7a +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..23daf99964b001e889dc1f25a608775c965dee08 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/trainer_state.json @@ -0,0 +1,4339 @@ +{ + "best_metric": 1.1874967813491821, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516", + "epoch": 8.0, + "eval_steps": 10, + "global_step": 6064, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013192612137203167, + "grad_norm": 0.7545632123947144, + "learning_rate": 0.0002, + "loss": 1.9466, + "step": 10 + }, + { + "epoch": 0.026385224274406333, + "grad_norm": 0.5787661075592041, + "learning_rate": 0.0002, + "loss": 1.4909, + "step": 20 + }, + { + "epoch": 0.0395778364116095, + "grad_norm": 0.8616093993186951, + "learning_rate": 0.0002, + "loss": 1.3906, + "step": 30 + }, + { + "epoch": 0.052770448548812667, + "grad_norm": 0.42088547348976135, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 40 + }, + { + "epoch": 0.06596306068601583, + "grad_norm": 0.47704678773880005, + "learning_rate": 0.0002, + "loss": 1.3446, + "step": 50 + }, + { + "epoch": 0.079155672823219, + "grad_norm": 0.5763994455337524, + "learning_rate": 0.0002, + "loss": 1.2476, + "step": 60 + }, + { + "epoch": 0.09234828496042216, + "grad_norm": 0.4579846262931824, + "learning_rate": 0.0002, + "loss": 1.2268, + "step": 70 + }, + { + "epoch": 0.10554089709762533, + "grad_norm": 0.46623846888542175, + "learning_rate": 0.0002, + "loss": 1.415, + "step": 80 + }, + { + "epoch": 0.11873350923482849, + "grad_norm": 0.4206956624984741, + "learning_rate": 0.0002, + "loss": 1.2849, + "step": 90 + }, + { + "epoch": 0.13192612137203166, + "grad_norm": 0.41896629333496094, + "learning_rate": 0.0002, + "loss": 1.1608, + "step": 100 + }, + { + "epoch": 0.14511873350923482, + "grad_norm": 0.3459089398384094, + "learning_rate": 0.0002, + "loss": 1.2617, + "step": 110 + }, + { + "epoch": 0.158311345646438, + "grad_norm": 0.4587327837944031, + "learning_rate": 0.0002, + "loss": 1.2858, + "step": 120 + }, + { + "epoch": 0.17150395778364116, + "grad_norm": 0.433525413274765, + "learning_rate": 0.0002, + "loss": 1.2241, + "step": 130 + }, + { + "epoch": 0.18469656992084432, + "grad_norm": 0.39253175258636475, + "learning_rate": 0.0002, + "loss": 1.2269, + "step": 140 + }, + { + "epoch": 0.19788918205804748, + "grad_norm": 0.3602290749549866, + "learning_rate": 0.0002, + "loss": 1.2224, + "step": 150 + }, + { + "epoch": 0.21108179419525067, + "grad_norm": 0.41160839796066284, + "learning_rate": 0.0002, + "loss": 1.2834, + "step": 160 + }, + { + "epoch": 0.22427440633245382, + "grad_norm": 0.7213630080223083, + "learning_rate": 0.0002, + "loss": 1.1986, + "step": 170 + }, + { + "epoch": 0.23746701846965698, + "grad_norm": 0.39086055755615234, + "learning_rate": 0.0002, + "loss": 1.2215, + "step": 180 + }, + { + "epoch": 0.25065963060686014, + "grad_norm": 0.4465520977973938, + "learning_rate": 0.0002, + "loss": 1.3315, + "step": 190 + }, + { + "epoch": 0.2638522427440633, + "grad_norm": 1.814679741859436, + "learning_rate": 0.0002, + "loss": 1.3798, + "step": 200 + }, + { + "epoch": 0.2770448548812665, + "grad_norm": 0.5026423931121826, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 210 + }, + { + "epoch": 0.29023746701846964, + "grad_norm": 0.4156292974948883, + "learning_rate": 0.0002, + "loss": 1.3306, + "step": 220 + }, + { + "epoch": 0.3034300791556728, + "grad_norm": 0.40813493728637695, + "learning_rate": 0.0002, + "loss": 1.266, + "step": 230 + }, + { + "epoch": 0.316622691292876, + "grad_norm": 0.3304787874221802, + "learning_rate": 0.0002, + "loss": 1.1533, + "step": 240 + }, + { + "epoch": 0.32981530343007914, + "grad_norm": 0.46139976382255554, + "learning_rate": 0.0002, + "loss": 1.3154, + "step": 250 + }, + { + "epoch": 0.34300791556728233, + "grad_norm": 0.37518271803855896, + "learning_rate": 0.0002, + "loss": 1.1365, + "step": 260 + }, + { + "epoch": 0.3562005277044855, + "grad_norm": 0.35586467385292053, + "learning_rate": 0.0002, + "loss": 1.2004, + "step": 270 + }, + { + "epoch": 0.36939313984168864, + "grad_norm": 0.32441186904907227, + "learning_rate": 0.0002, + "loss": 1.2609, + "step": 280 + }, + { + "epoch": 0.38258575197889183, + "grad_norm": 0.3198683261871338, + "learning_rate": 0.0002, + "loss": 1.212, + "step": 290 + }, + { + "epoch": 0.39577836411609496, + "grad_norm": 0.33663108944892883, + "learning_rate": 0.0002, + "loss": 1.2465, + "step": 300 + }, + { + "epoch": 0.40897097625329815, + "grad_norm": 0.3711244761943817, + "learning_rate": 0.0002, + "loss": 1.1702, + "step": 310 + }, + { + "epoch": 0.42216358839050133, + "grad_norm": 0.3209651708602905, + "learning_rate": 0.0002, + "loss": 1.0871, + "step": 320 + }, + { + "epoch": 0.43535620052770446, + "grad_norm": 0.5152716040611267, + "learning_rate": 0.0002, + "loss": 1.2728, + "step": 330 + }, + { + "epoch": 0.44854881266490765, + "grad_norm": 0.5431376695632935, + "learning_rate": 0.0002, + "loss": 1.1833, + "step": 340 + }, + { + "epoch": 0.46174142480211083, + "grad_norm": 0.3069997727870941, + "learning_rate": 0.0002, + "loss": 1.1344, + "step": 350 + }, + { + "epoch": 0.47493403693931396, + "grad_norm": 0.34260064363479614, + "learning_rate": 0.0002, + "loss": 1.2742, + "step": 360 + }, + { + "epoch": 0.48812664907651715, + "grad_norm": 0.345653235912323, + "learning_rate": 0.0002, + "loss": 1.3161, + "step": 370 + }, + { + "epoch": 0.5013192612137203, + "grad_norm": 0.46222734451293945, + "learning_rate": 0.0002, + "loss": 1.1552, + "step": 380 + }, + { + "epoch": 0.5145118733509235, + "grad_norm": 0.27301734685897827, + "learning_rate": 0.0002, + "loss": 1.0616, + "step": 390 + }, + { + "epoch": 0.5277044854881267, + "grad_norm": 0.29048439860343933, + "learning_rate": 0.0002, + "loss": 1.1944, + "step": 400 + }, + { + "epoch": 0.5408970976253298, + "grad_norm": 0.32927802205085754, + "learning_rate": 0.0002, + "loss": 1.0956, + "step": 410 + }, + { + "epoch": 0.554089709762533, + "grad_norm": 0.3336397409439087, + "learning_rate": 0.0002, + "loss": 1.2362, + "step": 420 + }, + { + "epoch": 0.5672823218997362, + "grad_norm": 0.4007597267627716, + "learning_rate": 0.0002, + "loss": 1.1445, + "step": 430 + }, + { + "epoch": 0.5804749340369393, + "grad_norm": 0.36144956946372986, + "learning_rate": 0.0002, + "loss": 1.2731, + "step": 440 + }, + { + "epoch": 0.5936675461741425, + "grad_norm": 0.6331009864807129, + "learning_rate": 0.0002, + "loss": 1.1604, + "step": 450 + }, + { + "epoch": 0.6068601583113457, + "grad_norm": 0.41469088196754456, + "learning_rate": 0.0002, + "loss": 1.1692, + "step": 460 + }, + { + "epoch": 0.6200527704485488, + "grad_norm": 0.4388185143470764, + "learning_rate": 0.0002, + "loss": 1.2372, + "step": 470 + }, + { + "epoch": 0.633245382585752, + "grad_norm": 0.3738141655921936, + "learning_rate": 0.0002, + "loss": 1.2541, + "step": 480 + }, + { + "epoch": 0.6464379947229552, + "grad_norm": 0.7212023138999939, + "learning_rate": 0.0002, + "loss": 1.1265, + "step": 490 + }, + { + "epoch": 0.6596306068601583, + "grad_norm": 0.2972351014614105, + "learning_rate": 0.0002, + "loss": 1.2786, + "step": 500 + }, + { + "epoch": 0.6728232189973615, + "grad_norm": 0.45293179154396057, + "learning_rate": 0.0002, + "loss": 1.1739, + "step": 510 + }, + { + "epoch": 0.6860158311345647, + "grad_norm": 0.4319860637187958, + "learning_rate": 0.0002, + "loss": 1.255, + "step": 520 + }, + { + "epoch": 0.6992084432717678, + "grad_norm": 0.3050215542316437, + "learning_rate": 0.0002, + "loss": 1.0959, + "step": 530 + }, + { + "epoch": 0.712401055408971, + "grad_norm": 0.3552611172199249, + "learning_rate": 0.0002, + "loss": 1.1608, + "step": 540 + }, + { + "epoch": 0.7255936675461742, + "grad_norm": 0.3631151020526886, + "learning_rate": 0.0002, + "loss": 1.2546, + "step": 550 + }, + { + "epoch": 0.7387862796833773, + "grad_norm": 0.28177931904792786, + "learning_rate": 0.0002, + "loss": 1.1371, + "step": 560 + }, + { + "epoch": 0.7519788918205804, + "grad_norm": 0.359764039516449, + "learning_rate": 0.0002, + "loss": 1.2081, + "step": 570 + }, + { + "epoch": 0.7651715039577837, + "grad_norm": 0.3970327377319336, + "learning_rate": 0.0002, + "loss": 1.1356, + "step": 580 + }, + { + "epoch": 0.7783641160949868, + "grad_norm": 0.3541001081466675, + "learning_rate": 0.0002, + "loss": 1.309, + "step": 590 + }, + { + "epoch": 0.7915567282321899, + "grad_norm": 0.3478573262691498, + "learning_rate": 0.0002, + "loss": 1.2061, + "step": 600 + }, + { + "epoch": 0.8047493403693932, + "grad_norm": 0.3900321424007416, + "learning_rate": 0.0002, + "loss": 1.1864, + "step": 610 + }, + { + "epoch": 0.8179419525065963, + "grad_norm": 0.3443238437175751, + "learning_rate": 0.0002, + "loss": 1.1358, + "step": 620 + }, + { + "epoch": 0.8311345646437994, + "grad_norm": 0.44238781929016113, + "learning_rate": 0.0002, + "loss": 1.3232, + "step": 630 + }, + { + "epoch": 0.8443271767810027, + "grad_norm": 0.36339467763900757, + "learning_rate": 0.0002, + "loss": 1.1247, + "step": 640 + }, + { + "epoch": 0.8575197889182058, + "grad_norm": 0.6243070363998413, + "learning_rate": 0.0002, + "loss": 1.1049, + "step": 650 + }, + { + "epoch": 0.8707124010554089, + "grad_norm": 0.3209173381328583, + "learning_rate": 0.0002, + "loss": 1.1943, + "step": 660 + }, + { + "epoch": 0.8839050131926122, + "grad_norm": 0.35017991065979004, + "learning_rate": 0.0002, + "loss": 1.1927, + "step": 670 + }, + { + "epoch": 0.8970976253298153, + "grad_norm": 0.3247159421443939, + "learning_rate": 0.0002, + "loss": 1.1603, + "step": 680 + }, + { + "epoch": 0.9102902374670184, + "grad_norm": 0.4091894030570984, + "learning_rate": 0.0002, + "loss": 1.2298, + "step": 690 + }, + { + "epoch": 0.9234828496042217, + "grad_norm": 0.3975585997104645, + "learning_rate": 0.0002, + "loss": 1.2756, + "step": 700 + }, + { + "epoch": 0.9366754617414248, + "grad_norm": 0.3666245937347412, + "learning_rate": 0.0002, + "loss": 1.281, + "step": 710 + }, + { + "epoch": 0.9498680738786279, + "grad_norm": 0.45216917991638184, + "learning_rate": 0.0002, + "loss": 1.2855, + "step": 720 + }, + { + "epoch": 0.9630606860158312, + "grad_norm": 0.36108118295669556, + "learning_rate": 0.0002, + "loss": 1.2217, + "step": 730 + }, + { + "epoch": 0.9762532981530343, + "grad_norm": 0.44550251960754395, + "learning_rate": 0.0002, + "loss": 1.17, + "step": 740 + }, + { + "epoch": 0.9894459102902374, + "grad_norm": 0.29801255464553833, + "learning_rate": 0.0002, + "loss": 1.0672, + "step": 750 + }, + { + "epoch": 1.0, + "eval_loss": 1.203244686126709, + "eval_runtime": 76.0457, + "eval_samples_per_second": 5.668, + "eval_steps_per_second": 0.71, + "step": 758 + }, + { + "epoch": 1.0026385224274406, + "grad_norm": 0.4096551239490509, + "learning_rate": 0.0002, + "loss": 1.0748, + "step": 760 + }, + { + "epoch": 1.0158311345646438, + "grad_norm": 0.2649582326412201, + "learning_rate": 0.0002, + "loss": 1.1537, + "step": 770 + }, + { + "epoch": 1.029023746701847, + "grad_norm": 0.3100722134113312, + "learning_rate": 0.0002, + "loss": 1.1676, + "step": 780 + }, + { + "epoch": 1.04221635883905, + "grad_norm": 0.3911755383014679, + "learning_rate": 0.0002, + "loss": 1.1736, + "step": 790 + }, + { + "epoch": 1.0554089709762533, + "grad_norm": 0.4600953757762909, + "learning_rate": 0.0002, + "loss": 1.0354, + "step": 800 + }, + { + "epoch": 1.0686015831134565, + "grad_norm": 0.28671619296073914, + "learning_rate": 0.0002, + "loss": 1.2869, + "step": 810 + }, + { + "epoch": 1.0817941952506596, + "grad_norm": 0.47282642126083374, + "learning_rate": 0.0002, + "loss": 1.1441, + "step": 820 + }, + { + "epoch": 1.0949868073878628, + "grad_norm": 0.690073549747467, + "learning_rate": 0.0002, + "loss": 1.1368, + "step": 830 + }, + { + "epoch": 1.108179419525066, + "grad_norm": 0.7317902445793152, + "learning_rate": 0.0002, + "loss": 0.9944, + "step": 840 + }, + { + "epoch": 1.121372031662269, + "grad_norm": 0.44215938448905945, + "learning_rate": 0.0002, + "loss": 1.108, + "step": 850 + }, + { + "epoch": 1.1345646437994723, + "grad_norm": 0.33875149488449097, + "learning_rate": 0.0002, + "loss": 1.0558, + "step": 860 + }, + { + "epoch": 1.1477572559366755, + "grad_norm": 0.3700002431869507, + "learning_rate": 0.0002, + "loss": 1.0471, + "step": 870 + }, + { + "epoch": 1.1609498680738786, + "grad_norm": 0.41173291206359863, + "learning_rate": 0.0002, + "loss": 1.0121, + "step": 880 + }, + { + "epoch": 1.1741424802110818, + "grad_norm": 0.5253589749336243, + "learning_rate": 0.0002, + "loss": 1.0312, + "step": 890 + }, + { + "epoch": 1.187335092348285, + "grad_norm": 0.3912237286567688, + "learning_rate": 0.0002, + "loss": 1.2238, + "step": 900 + }, + { + "epoch": 1.200527704485488, + "grad_norm": 0.40990331768989563, + "learning_rate": 0.0002, + "loss": 0.9434, + "step": 910 + }, + { + "epoch": 1.2137203166226913, + "grad_norm": 0.40377968549728394, + "learning_rate": 0.0002, + "loss": 1.0493, + "step": 920 + }, + { + "epoch": 1.2269129287598945, + "grad_norm": 0.4605846405029297, + "learning_rate": 0.0002, + "loss": 1.1496, + "step": 930 + }, + { + "epoch": 1.2401055408970976, + "grad_norm": 0.31564897298812866, + "learning_rate": 0.0002, + "loss": 1.1173, + "step": 940 + }, + { + "epoch": 1.2532981530343008, + "grad_norm": 0.39808550477027893, + "learning_rate": 0.0002, + "loss": 1.0547, + "step": 950 + }, + { + "epoch": 1.266490765171504, + "grad_norm": 0.3762115240097046, + "learning_rate": 0.0002, + "loss": 1.1367, + "step": 960 + }, + { + "epoch": 1.279683377308707, + "grad_norm": 0.4174984097480774, + "learning_rate": 0.0002, + "loss": 1.1596, + "step": 970 + }, + { + "epoch": 1.2928759894459103, + "grad_norm": 0.5263054966926575, + "learning_rate": 0.0002, + "loss": 1.1327, + "step": 980 + }, + { + "epoch": 1.3060686015831133, + "grad_norm": 0.41673699021339417, + "learning_rate": 0.0002, + "loss": 1.0339, + "step": 990 + }, + { + "epoch": 1.3192612137203166, + "grad_norm": 0.9613684415817261, + "learning_rate": 0.0002, + "loss": 1.1198, + "step": 1000 + }, + { + "epoch": 1.3324538258575198, + "grad_norm": 0.3690216839313507, + "learning_rate": 0.0002, + "loss": 1.0444, + "step": 1010 + }, + { + "epoch": 1.345646437994723, + "grad_norm": 0.521821141242981, + "learning_rate": 0.0002, + "loss": 1.0473, + "step": 1020 + }, + { + "epoch": 1.358839050131926, + "grad_norm": 0.3353094160556793, + "learning_rate": 0.0002, + "loss": 1.1065, + "step": 1030 + }, + { + "epoch": 1.3720316622691293, + "grad_norm": 0.3843843936920166, + "learning_rate": 0.0002, + "loss": 1.1286, + "step": 1040 + }, + { + "epoch": 1.3852242744063323, + "grad_norm": 0.372514545917511, + "learning_rate": 0.0002, + "loss": 1.1369, + "step": 1050 + }, + { + "epoch": 1.3984168865435356, + "grad_norm": 0.34537771344184875, + "learning_rate": 0.0002, + "loss": 1.0041, + "step": 1060 + }, + { + "epoch": 1.4116094986807388, + "grad_norm": 0.45349085330963135, + "learning_rate": 0.0002, + "loss": 1.0251, + "step": 1070 + }, + { + "epoch": 1.424802110817942, + "grad_norm": 0.5120177268981934, + "learning_rate": 0.0002, + "loss": 1.1873, + "step": 1080 + }, + { + "epoch": 1.437994722955145, + "grad_norm": 0.42800238728523254, + "learning_rate": 0.0002, + "loss": 1.0872, + "step": 1090 + }, + { + "epoch": 1.4511873350923483, + "grad_norm": 0.343832790851593, + "learning_rate": 0.0002, + "loss": 1.0734, + "step": 1100 + }, + { + "epoch": 1.4643799472295513, + "grad_norm": 0.3829841911792755, + "learning_rate": 0.0002, + "loss": 1.1286, + "step": 1110 + }, + { + "epoch": 1.4775725593667546, + "grad_norm": 0.4289931058883667, + "learning_rate": 0.0002, + "loss": 1.1268, + "step": 1120 + }, + { + "epoch": 1.4907651715039578, + "grad_norm": 0.42750850319862366, + "learning_rate": 0.0002, + "loss": 1.0676, + "step": 1130 + }, + { + "epoch": 1.503957783641161, + "grad_norm": 0.34328413009643555, + "learning_rate": 0.0002, + "loss": 1.072, + "step": 1140 + }, + { + "epoch": 1.517150395778364, + "grad_norm": 0.349096417427063, + "learning_rate": 0.0002, + "loss": 1.0863, + "step": 1150 + }, + { + "epoch": 1.5303430079155673, + "grad_norm": 0.7700717449188232, + "learning_rate": 0.0002, + "loss": 1.1462, + "step": 1160 + }, + { + "epoch": 1.5435356200527703, + "grad_norm": 0.39294949173927307, + "learning_rate": 0.0002, + "loss": 0.9528, + "step": 1170 + }, + { + "epoch": 1.5567282321899736, + "grad_norm": 0.36173608899116516, + "learning_rate": 0.0002, + "loss": 1.1603, + "step": 1180 + }, + { + "epoch": 1.5699208443271768, + "grad_norm": 0.6034277677536011, + "learning_rate": 0.0002, + "loss": 1.1508, + "step": 1190 + }, + { + "epoch": 1.58311345646438, + "grad_norm": 0.36694103479385376, + "learning_rate": 0.0002, + "loss": 1.0105, + "step": 1200 + }, + { + "epoch": 1.596306068601583, + "grad_norm": 0.4727209508419037, + "learning_rate": 0.0002, + "loss": 1.1479, + "step": 1210 + }, + { + "epoch": 1.6094986807387863, + "grad_norm": 0.6482883095741272, + "learning_rate": 0.0002, + "loss": 1.0689, + "step": 1220 + }, + { + "epoch": 1.6226912928759893, + "grad_norm": 0.5238035917282104, + "learning_rate": 0.0002, + "loss": 1.1405, + "step": 1230 + }, + { + "epoch": 1.6358839050131926, + "grad_norm": 0.4812222421169281, + "learning_rate": 0.0002, + "loss": 1.0596, + "step": 1240 + }, + { + "epoch": 1.6490765171503958, + "grad_norm": 0.7131702303886414, + "learning_rate": 0.0002, + "loss": 1.2729, + "step": 1250 + }, + { + "epoch": 1.662269129287599, + "grad_norm": 0.3803327977657318, + "learning_rate": 0.0002, + "loss": 0.9832, + "step": 1260 + }, + { + "epoch": 1.675461741424802, + "grad_norm": 0.3745088577270508, + "learning_rate": 0.0002, + "loss": 1.1433, + "step": 1270 + }, + { + "epoch": 1.6886543535620053, + "grad_norm": 0.4427378475666046, + "learning_rate": 0.0002, + "loss": 1.1018, + "step": 1280 + }, + { + "epoch": 1.7018469656992083, + "grad_norm": 0.797478973865509, + "learning_rate": 0.0002, + "loss": 1.0619, + "step": 1290 + }, + { + "epoch": 1.7150395778364116, + "grad_norm": 0.503620982170105, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 1300 + }, + { + "epoch": 1.7282321899736148, + "grad_norm": 0.4132426381111145, + "learning_rate": 0.0002, + "loss": 1.1324, + "step": 1310 + }, + { + "epoch": 1.741424802110818, + "grad_norm": 0.41811656951904297, + "learning_rate": 0.0002, + "loss": 1.172, + "step": 1320 + }, + { + "epoch": 1.754617414248021, + "grad_norm": 0.40647849440574646, + "learning_rate": 0.0002, + "loss": 1.0903, + "step": 1330 + }, + { + "epoch": 1.767810026385224, + "grad_norm": 0.42138347029685974, + "learning_rate": 0.0002, + "loss": 1.1316, + "step": 1340 + }, + { + "epoch": 1.7810026385224274, + "grad_norm": 0.46523579955101013, + "learning_rate": 0.0002, + "loss": 0.9955, + "step": 1350 + }, + { + "epoch": 1.7941952506596306, + "grad_norm": 0.39760419726371765, + "learning_rate": 0.0002, + "loss": 1.0307, + "step": 1360 + }, + { + "epoch": 1.8073878627968338, + "grad_norm": 0.37993717193603516, + "learning_rate": 0.0002, + "loss": 1.1218, + "step": 1370 + }, + { + "epoch": 1.820580474934037, + "grad_norm": 0.5404181480407715, + "learning_rate": 0.0002, + "loss": 1.0921, + "step": 1380 + }, + { + "epoch": 1.83377308707124, + "grad_norm": 0.4385245740413666, + "learning_rate": 0.0002, + "loss": 1.0903, + "step": 1390 + }, + { + "epoch": 1.8469656992084431, + "grad_norm": 0.529797375202179, + "learning_rate": 0.0002, + "loss": 1.1175, + "step": 1400 + }, + { + "epoch": 1.8601583113456464, + "grad_norm": 0.481567919254303, + "learning_rate": 0.0002, + "loss": 1.0577, + "step": 1410 + }, + { + "epoch": 1.8733509234828496, + "grad_norm": 0.34787362813949585, + "learning_rate": 0.0002, + "loss": 1.0581, + "step": 1420 + }, + { + "epoch": 1.8865435356200528, + "grad_norm": 0.6402362585067749, + "learning_rate": 0.0002, + "loss": 1.0536, + "step": 1430 + }, + { + "epoch": 1.899736147757256, + "grad_norm": 0.3461322784423828, + "learning_rate": 0.0002, + "loss": 1.0787, + "step": 1440 + }, + { + "epoch": 1.912928759894459, + "grad_norm": 0.44005653262138367, + "learning_rate": 0.0002, + "loss": 1.0925, + "step": 1450 + }, + { + "epoch": 1.9261213720316621, + "grad_norm": 0.4064280688762665, + "learning_rate": 0.0002, + "loss": 1.0414, + "step": 1460 + }, + { + "epoch": 1.9393139841688654, + "grad_norm": 0.5236523151397705, + "learning_rate": 0.0002, + "loss": 1.0608, + "step": 1470 + }, + { + "epoch": 1.9525065963060686, + "grad_norm": 0.41030219197273254, + "learning_rate": 0.0002, + "loss": 1.0572, + "step": 1480 + }, + { + "epoch": 1.9656992084432718, + "grad_norm": 0.39805835485458374, + "learning_rate": 0.0002, + "loss": 1.1204, + "step": 1490 + }, + { + "epoch": 1.978891820580475, + "grad_norm": 0.42974501848220825, + "learning_rate": 0.0002, + "loss": 1.0364, + "step": 1500 + }, + { + "epoch": 1.992084432717678, + "grad_norm": 0.4688243865966797, + "learning_rate": 0.0002, + "loss": 1.1169, + "step": 1510 + }, + { + "epoch": 2.0, + "eval_loss": 1.1874967813491821, + "eval_runtime": 71.9523, + "eval_samples_per_second": 5.99, + "eval_steps_per_second": 0.75, + "step": 1516 + }, + { + "epoch": 2.005277044854881, + "grad_norm": 0.4121631383895874, + "learning_rate": 0.0002, + "loss": 1.0401, + "step": 1520 + }, + { + "epoch": 2.0184696569920844, + "grad_norm": 0.4844197928905487, + "learning_rate": 0.0002, + "loss": 0.9384, + "step": 1530 + }, + { + "epoch": 2.0316622691292876, + "grad_norm": 0.45408546924591064, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 1540 + }, + { + "epoch": 2.044854881266491, + "grad_norm": 0.48662951588630676, + "learning_rate": 0.0002, + "loss": 1.0399, + "step": 1550 + }, + { + "epoch": 2.058047493403694, + "grad_norm": 0.7195899486541748, + "learning_rate": 0.0002, + "loss": 0.936, + "step": 1560 + }, + { + "epoch": 2.0712401055408973, + "grad_norm": 0.5071077346801758, + "learning_rate": 0.0002, + "loss": 0.9486, + "step": 1570 + }, + { + "epoch": 2.0844327176781, + "grad_norm": 0.7473958730697632, + "learning_rate": 0.0002, + "loss": 1.0055, + "step": 1580 + }, + { + "epoch": 2.0976253298153034, + "grad_norm": 0.5509232878684998, + "learning_rate": 0.0002, + "loss": 0.8309, + "step": 1590 + }, + { + "epoch": 2.1108179419525066, + "grad_norm": 0.5108042359352112, + "learning_rate": 0.0002, + "loss": 0.9181, + "step": 1600 + }, + { + "epoch": 2.12401055408971, + "grad_norm": 0.42331448197364807, + "learning_rate": 0.0002, + "loss": 0.9499, + "step": 1610 + }, + { + "epoch": 2.137203166226913, + "grad_norm": 0.46621623635292053, + "learning_rate": 0.0002, + "loss": 0.9359, + "step": 1620 + }, + { + "epoch": 2.150395778364116, + "grad_norm": 0.43802836537361145, + "learning_rate": 0.0002, + "loss": 0.9065, + "step": 1630 + }, + { + "epoch": 2.163588390501319, + "grad_norm": 0.49908021092414856, + "learning_rate": 0.0002, + "loss": 0.9375, + "step": 1640 + }, + { + "epoch": 2.1767810026385224, + "grad_norm": 0.4195636808872223, + "learning_rate": 0.0002, + "loss": 1.0389, + "step": 1650 + }, + { + "epoch": 2.1899736147757256, + "grad_norm": 0.49515822529792786, + "learning_rate": 0.0002, + "loss": 0.8501, + "step": 1660 + }, + { + "epoch": 2.203166226912929, + "grad_norm": 0.4607589542865753, + "learning_rate": 0.0002, + "loss": 1.0557, + "step": 1670 + }, + { + "epoch": 2.216358839050132, + "grad_norm": 0.4489196836948395, + "learning_rate": 0.0002, + "loss": 1.0206, + "step": 1680 + }, + { + "epoch": 2.229551451187335, + "grad_norm": 0.49300864338874817, + "learning_rate": 0.0002, + "loss": 0.9657, + "step": 1690 + }, + { + "epoch": 2.242744063324538, + "grad_norm": 0.6624954342842102, + "learning_rate": 0.0002, + "loss": 0.8371, + "step": 1700 + }, + { + "epoch": 2.2559366754617414, + "grad_norm": 0.8391500115394592, + "learning_rate": 0.0002, + "loss": 0.8555, + "step": 1710 + }, + { + "epoch": 2.2691292875989446, + "grad_norm": 0.5193073749542236, + "learning_rate": 0.0002, + "loss": 1.0113, + "step": 1720 + }, + { + "epoch": 2.282321899736148, + "grad_norm": 0.6180613040924072, + "learning_rate": 0.0002, + "loss": 0.9979, + "step": 1730 + }, + { + "epoch": 2.295514511873351, + "grad_norm": 0.591191291809082, + "learning_rate": 0.0002, + "loss": 0.9579, + "step": 1740 + }, + { + "epoch": 2.308707124010554, + "grad_norm": 0.546897828578949, + "learning_rate": 0.0002, + "loss": 0.8879, + "step": 1750 + }, + { + "epoch": 2.321899736147757, + "grad_norm": 0.5470401644706726, + "learning_rate": 0.0002, + "loss": 0.9321, + "step": 1760 + }, + { + "epoch": 2.3350923482849604, + "grad_norm": 0.4590282738208771, + "learning_rate": 0.0002, + "loss": 0.9104, + "step": 1770 + }, + { + "epoch": 2.3482849604221636, + "grad_norm": 0.622164785861969, + "learning_rate": 0.0002, + "loss": 1.0384, + "step": 1780 + }, + { + "epoch": 2.361477572559367, + "grad_norm": 0.5753812193870544, + "learning_rate": 0.0002, + "loss": 1.0297, + "step": 1790 + }, + { + "epoch": 2.37467018469657, + "grad_norm": 0.47958624362945557, + "learning_rate": 0.0002, + "loss": 0.9569, + "step": 1800 + }, + { + "epoch": 2.387862796833773, + "grad_norm": 0.48042672872543335, + "learning_rate": 0.0002, + "loss": 1.0519, + "step": 1810 + }, + { + "epoch": 2.401055408970976, + "grad_norm": 0.44586366415023804, + "learning_rate": 0.0002, + "loss": 0.9042, + "step": 1820 + }, + { + "epoch": 2.4142480211081794, + "grad_norm": 0.7239416837692261, + "learning_rate": 0.0002, + "loss": 0.9783, + "step": 1830 + }, + { + "epoch": 2.4274406332453826, + "grad_norm": 0.5515341758728027, + "learning_rate": 0.0002, + "loss": 0.8818, + "step": 1840 + }, + { + "epoch": 2.440633245382586, + "grad_norm": 0.6280064582824707, + "learning_rate": 0.0002, + "loss": 0.9503, + "step": 1850 + }, + { + "epoch": 2.453825857519789, + "grad_norm": 0.4832057058811188, + "learning_rate": 0.0002, + "loss": 0.8943, + "step": 1860 + }, + { + "epoch": 2.467018469656992, + "grad_norm": 0.5789321064949036, + "learning_rate": 0.0002, + "loss": 0.8744, + "step": 1870 + }, + { + "epoch": 2.480211081794195, + "grad_norm": 0.48491886258125305, + "learning_rate": 0.0002, + "loss": 0.9332, + "step": 1880 + }, + { + "epoch": 2.4934036939313984, + "grad_norm": 0.532365620136261, + "learning_rate": 0.0002, + "loss": 0.943, + "step": 1890 + }, + { + "epoch": 2.5065963060686016, + "grad_norm": 0.7087852954864502, + "learning_rate": 0.0002, + "loss": 0.9734, + "step": 1900 + }, + { + "epoch": 2.519788918205805, + "grad_norm": 0.48157402873039246, + "learning_rate": 0.0002, + "loss": 0.9767, + "step": 1910 + }, + { + "epoch": 2.532981530343008, + "grad_norm": 0.5886041522026062, + "learning_rate": 0.0002, + "loss": 0.9851, + "step": 1920 + }, + { + "epoch": 2.5461741424802113, + "grad_norm": 0.6332622766494751, + "learning_rate": 0.0002, + "loss": 1.0144, + "step": 1930 + }, + { + "epoch": 2.559366754617414, + "grad_norm": 0.5463117957115173, + "learning_rate": 0.0002, + "loss": 0.9516, + "step": 1940 + }, + { + "epoch": 2.5725593667546174, + "grad_norm": 0.5432228446006775, + "learning_rate": 0.0002, + "loss": 0.9373, + "step": 1950 + }, + { + "epoch": 2.5857519788918206, + "grad_norm": 0.5929186940193176, + "learning_rate": 0.0002, + "loss": 0.8974, + "step": 1960 + }, + { + "epoch": 2.598944591029024, + "grad_norm": 0.5120641589164734, + "learning_rate": 0.0002, + "loss": 1.0062, + "step": 1970 + }, + { + "epoch": 2.6121372031662267, + "grad_norm": 0.5372339487075806, + "learning_rate": 0.0002, + "loss": 0.9143, + "step": 1980 + }, + { + "epoch": 2.62532981530343, + "grad_norm": 0.5519838929176331, + "learning_rate": 0.0002, + "loss": 1.0786, + "step": 1990 + }, + { + "epoch": 2.638522427440633, + "grad_norm": 0.7304037809371948, + "learning_rate": 0.0002, + "loss": 1.021, + "step": 2000 + }, + { + "epoch": 2.6517150395778364, + "grad_norm": 0.6182340979576111, + "learning_rate": 0.0002, + "loss": 0.8708, + "step": 2010 + }, + { + "epoch": 2.6649076517150396, + "grad_norm": 0.4874444305896759, + "learning_rate": 0.0002, + "loss": 1.0501, + "step": 2020 + }, + { + "epoch": 2.678100263852243, + "grad_norm": 0.5850239396095276, + "learning_rate": 0.0002, + "loss": 0.9612, + "step": 2030 + }, + { + "epoch": 2.691292875989446, + "grad_norm": 0.6495311856269836, + "learning_rate": 0.0002, + "loss": 0.9808, + "step": 2040 + }, + { + "epoch": 2.7044854881266494, + "grad_norm": 1.002830147743225, + "learning_rate": 0.0002, + "loss": 1.0187, + "step": 2050 + }, + { + "epoch": 2.717678100263852, + "grad_norm": 0.49076753854751587, + "learning_rate": 0.0002, + "loss": 1.0386, + "step": 2060 + }, + { + "epoch": 2.7308707124010554, + "grad_norm": 0.4736326336860657, + "learning_rate": 0.0002, + "loss": 0.9416, + "step": 2070 + }, + { + "epoch": 2.7440633245382586, + "grad_norm": 0.5527601838111877, + "learning_rate": 0.0002, + "loss": 0.8424, + "step": 2080 + }, + { + "epoch": 2.757255936675462, + "grad_norm": 0.7295718193054199, + "learning_rate": 0.0002, + "loss": 0.9149, + "step": 2090 + }, + { + "epoch": 2.7704485488126647, + "grad_norm": 0.5437536835670471, + "learning_rate": 0.0002, + "loss": 0.9032, + "step": 2100 + }, + { + "epoch": 2.783641160949868, + "grad_norm": 0.5997128486633301, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 2110 + }, + { + "epoch": 2.796833773087071, + "grad_norm": 0.6498191356658936, + "learning_rate": 0.0002, + "loss": 0.976, + "step": 2120 + }, + { + "epoch": 2.8100263852242744, + "grad_norm": 0.5237268805503845, + "learning_rate": 0.0002, + "loss": 0.9543, + "step": 2130 + }, + { + "epoch": 2.8232189973614776, + "grad_norm": 0.6033027172088623, + "learning_rate": 0.0002, + "loss": 0.9302, + "step": 2140 + }, + { + "epoch": 2.836411609498681, + "grad_norm": 0.6077138781547546, + "learning_rate": 0.0002, + "loss": 0.9625, + "step": 2150 + }, + { + "epoch": 2.849604221635884, + "grad_norm": 0.4127797484397888, + "learning_rate": 0.0002, + "loss": 0.9347, + "step": 2160 + }, + { + "epoch": 2.862796833773087, + "grad_norm": 0.8448635339736938, + "learning_rate": 0.0002, + "loss": 1.0459, + "step": 2170 + }, + { + "epoch": 2.87598944591029, + "grad_norm": 0.5669729113578796, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 2180 + }, + { + "epoch": 2.8891820580474934, + "grad_norm": 0.510231077671051, + "learning_rate": 0.0002, + "loss": 0.8555, + "step": 2190 + }, + { + "epoch": 2.9023746701846966, + "grad_norm": 0.8072245121002197, + "learning_rate": 0.0002, + "loss": 0.9267, + "step": 2200 + }, + { + "epoch": 2.9155672823219, + "grad_norm": 0.6055923104286194, + "learning_rate": 0.0002, + "loss": 0.9685, + "step": 2210 + }, + { + "epoch": 2.9287598944591027, + "grad_norm": 0.7384416460990906, + "learning_rate": 0.0002, + "loss": 0.9157, + "step": 2220 + }, + { + "epoch": 2.941952506596306, + "grad_norm": 0.4922751784324646, + "learning_rate": 0.0002, + "loss": 0.9712, + "step": 2230 + }, + { + "epoch": 2.955145118733509, + "grad_norm": 0.6039906740188599, + "learning_rate": 0.0002, + "loss": 0.9487, + "step": 2240 + }, + { + "epoch": 2.9683377308707124, + "grad_norm": 0.4751701354980469, + "learning_rate": 0.0002, + "loss": 0.9776, + "step": 2250 + }, + { + "epoch": 2.9815303430079156, + "grad_norm": 0.5698353052139282, + "learning_rate": 0.0002, + "loss": 1.0619, + "step": 2260 + }, + { + "epoch": 2.994722955145119, + "grad_norm": 0.893563449382782, + "learning_rate": 0.0002, + "loss": 1.1184, + "step": 2270 + }, + { + "epoch": 3.0, + "eval_loss": 1.2046419382095337, + "eval_runtime": 71.5992, + "eval_samples_per_second": 6.02, + "eval_steps_per_second": 0.754, + "step": 2274 + }, + { + "epoch": 3.007915567282322, + "grad_norm": 0.41119325160980225, + "learning_rate": 0.0002, + "loss": 0.8269, + "step": 2280 + }, + { + "epoch": 3.021108179419525, + "grad_norm": 0.8169420957565308, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 2290 + }, + { + "epoch": 3.034300791556728, + "grad_norm": 0.6033818125724792, + "learning_rate": 0.0002, + "loss": 0.794, + "step": 2300 + }, + { + "epoch": 3.0474934036939314, + "grad_norm": 0.9600058197975159, + "learning_rate": 0.0002, + "loss": 0.7607, + "step": 2310 + }, + { + "epoch": 3.0606860158311346, + "grad_norm": 0.5859250426292419, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 2320 + }, + { + "epoch": 3.073878627968338, + "grad_norm": 0.6758618950843811, + "learning_rate": 0.0002, + "loss": 0.7598, + "step": 2330 + }, + { + "epoch": 3.0870712401055407, + "grad_norm": 0.8407140970230103, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 2340 + }, + { + "epoch": 3.100263852242744, + "grad_norm": 0.767779529094696, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 2350 + }, + { + "epoch": 3.113456464379947, + "grad_norm": 0.5572896599769592, + "learning_rate": 0.0002, + "loss": 0.7121, + "step": 2360 + }, + { + "epoch": 3.1266490765171504, + "grad_norm": 0.5908368825912476, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 2370 + }, + { + "epoch": 3.1398416886543536, + "grad_norm": 0.8047826290130615, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 2380 + }, + { + "epoch": 3.153034300791557, + "grad_norm": 0.8041718006134033, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 2390 + }, + { + "epoch": 3.16622691292876, + "grad_norm": 0.57078617811203, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 2400 + }, + { + "epoch": 3.179419525065963, + "grad_norm": 0.5125322937965393, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 2410 + }, + { + "epoch": 3.192612137203166, + "grad_norm": 0.6356934309005737, + "learning_rate": 0.0002, + "loss": 0.741, + "step": 2420 + }, + { + "epoch": 3.2058047493403694, + "grad_norm": 1.0129680633544922, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 2430 + }, + { + "epoch": 3.2189973614775726, + "grad_norm": 0.8104226589202881, + "learning_rate": 0.0002, + "loss": 0.8316, + "step": 2440 + }, + { + "epoch": 3.232189973614776, + "grad_norm": 0.7276079058647156, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 2450 + }, + { + "epoch": 3.2453825857519787, + "grad_norm": 0.9753884077072144, + "learning_rate": 0.0002, + "loss": 0.8183, + "step": 2460 + }, + { + "epoch": 3.258575197889182, + "grad_norm": 0.9753183722496033, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 2470 + }, + { + "epoch": 3.271767810026385, + "grad_norm": 0.6791225075721741, + "learning_rate": 0.0002, + "loss": 0.8815, + "step": 2480 + }, + { + "epoch": 3.2849604221635884, + "grad_norm": 0.6797150373458862, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 2490 + }, + { + "epoch": 3.2981530343007917, + "grad_norm": 0.8107194900512695, + "learning_rate": 0.0002, + "loss": 0.8395, + "step": 2500 + }, + { + "epoch": 3.311345646437995, + "grad_norm": 0.5878375172615051, + "learning_rate": 0.0002, + "loss": 0.7869, + "step": 2510 + }, + { + "epoch": 3.324538258575198, + "grad_norm": 0.5882975459098816, + "learning_rate": 0.0002, + "loss": 0.7992, + "step": 2520 + }, + { + "epoch": 3.337730870712401, + "grad_norm": 0.6180013418197632, + "learning_rate": 0.0002, + "loss": 0.7472, + "step": 2530 + }, + { + "epoch": 3.350923482849604, + "grad_norm": 1.0008151531219482, + "learning_rate": 0.0002, + "loss": 0.8033, + "step": 2540 + }, + { + "epoch": 3.3641160949868074, + "grad_norm": 0.6404656767845154, + "learning_rate": 0.0002, + "loss": 0.8464, + "step": 2550 + }, + { + "epoch": 3.3773087071240107, + "grad_norm": 0.8481354117393494, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 2560 + }, + { + "epoch": 3.390501319261214, + "grad_norm": 0.8068035244941711, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 2570 + }, + { + "epoch": 3.4036939313984167, + "grad_norm": 0.7477166056632996, + "learning_rate": 0.0002, + "loss": 0.8621, + "step": 2580 + }, + { + "epoch": 3.41688654353562, + "grad_norm": 0.6202635765075684, + "learning_rate": 0.0002, + "loss": 0.8352, + "step": 2590 + }, + { + "epoch": 3.430079155672823, + "grad_norm": 0.6981159448623657, + "learning_rate": 0.0002, + "loss": 0.7572, + "step": 2600 + }, + { + "epoch": 3.4432717678100264, + "grad_norm": 0.6611084342002869, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 2610 + }, + { + "epoch": 3.4564643799472297, + "grad_norm": 0.5727696418762207, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 2620 + }, + { + "epoch": 3.469656992084433, + "grad_norm": 1.2354545593261719, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 2630 + }, + { + "epoch": 3.4828496042216357, + "grad_norm": 0.6347638368606567, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 2640 + }, + { + "epoch": 3.496042216358839, + "grad_norm": 0.6975704431533813, + "learning_rate": 0.0002, + "loss": 0.8426, + "step": 2650 + }, + { + "epoch": 3.509234828496042, + "grad_norm": 0.6569573879241943, + "learning_rate": 0.0002, + "loss": 0.8773, + "step": 2660 + }, + { + "epoch": 3.5224274406332454, + "grad_norm": 0.6979609131813049, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 2670 + }, + { + "epoch": 3.5356200527704487, + "grad_norm": 0.6287988424301147, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 2680 + }, + { + "epoch": 3.5488126649076515, + "grad_norm": 0.8682637214660645, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 2690 + }, + { + "epoch": 3.5620052770448547, + "grad_norm": 0.7062831521034241, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 2700 + }, + { + "epoch": 3.575197889182058, + "grad_norm": 1.0061452388763428, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 2710 + }, + { + "epoch": 3.588390501319261, + "grad_norm": 0.719097375869751, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 2720 + }, + { + "epoch": 3.6015831134564644, + "grad_norm": 0.7583496570587158, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 2730 + }, + { + "epoch": 3.6147757255936677, + "grad_norm": 0.7543531060218811, + "learning_rate": 0.0002, + "loss": 0.91, + "step": 2740 + }, + { + "epoch": 3.627968337730871, + "grad_norm": 0.8873646855354309, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 2750 + }, + { + "epoch": 3.641160949868074, + "grad_norm": 1.0657562017440796, + "learning_rate": 0.0002, + "loss": 0.7116, + "step": 2760 + }, + { + "epoch": 3.654353562005277, + "grad_norm": 0.8641113638877869, + "learning_rate": 0.0002, + "loss": 0.8291, + "step": 2770 + }, + { + "epoch": 3.66754617414248, + "grad_norm": 0.6620645523071289, + "learning_rate": 0.0002, + "loss": 0.8302, + "step": 2780 + }, + { + "epoch": 3.6807387862796834, + "grad_norm": 0.6919541954994202, + "learning_rate": 0.0002, + "loss": 0.8261, + "step": 2790 + }, + { + "epoch": 3.6939313984168867, + "grad_norm": 0.7305743098258972, + "learning_rate": 0.0002, + "loss": 0.8388, + "step": 2800 + }, + { + "epoch": 3.7071240105540895, + "grad_norm": 0.7464777827262878, + "learning_rate": 0.0002, + "loss": 0.8053, + "step": 2810 + }, + { + "epoch": 3.7203166226912927, + "grad_norm": 0.8067063093185425, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 2820 + }, + { + "epoch": 3.733509234828496, + "grad_norm": 0.7789416313171387, + "learning_rate": 0.0002, + "loss": 0.8259, + "step": 2830 + }, + { + "epoch": 3.746701846965699, + "grad_norm": 0.507529079914093, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 2840 + }, + { + "epoch": 3.7598944591029024, + "grad_norm": 0.6509260535240173, + "learning_rate": 0.0002, + "loss": 0.832, + "step": 2850 + }, + { + "epoch": 3.7730870712401057, + "grad_norm": 0.9141367673873901, + "learning_rate": 0.0002, + "loss": 0.8257, + "step": 2860 + }, + { + "epoch": 3.786279683377309, + "grad_norm": 0.7852635979652405, + "learning_rate": 0.0002, + "loss": 0.9436, + "step": 2870 + }, + { + "epoch": 3.7994722955145117, + "grad_norm": 0.5340318083763123, + "learning_rate": 0.0002, + "loss": 0.8842, + "step": 2880 + }, + { + "epoch": 3.812664907651715, + "grad_norm": 0.6246042847633362, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 2890 + }, + { + "epoch": 3.825857519788918, + "grad_norm": 0.7064066529273987, + "learning_rate": 0.0002, + "loss": 0.8184, + "step": 2900 + }, + { + "epoch": 3.8390501319261214, + "grad_norm": 0.6144065856933594, + "learning_rate": 0.0002, + "loss": 0.8515, + "step": 2910 + }, + { + "epoch": 3.8522427440633247, + "grad_norm": 0.5268424153327942, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 2920 + }, + { + "epoch": 3.8654353562005275, + "grad_norm": 0.9508116841316223, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 2930 + }, + { + "epoch": 3.8786279683377307, + "grad_norm": 0.9133715629577637, + "learning_rate": 0.0002, + "loss": 0.8437, + "step": 2940 + }, + { + "epoch": 3.891820580474934, + "grad_norm": 1.0144646167755127, + "learning_rate": 0.0002, + "loss": 0.8611, + "step": 2950 + }, + { + "epoch": 3.905013192612137, + "grad_norm": 0.6397877931594849, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 2960 + }, + { + "epoch": 3.9182058047493404, + "grad_norm": 0.734835147857666, + "learning_rate": 0.0002, + "loss": 0.8285, + "step": 2970 + }, + { + "epoch": 3.9313984168865437, + "grad_norm": 0.784853994846344, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 2980 + }, + { + "epoch": 3.944591029023747, + "grad_norm": 0.805831789970398, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 2990 + }, + { + "epoch": 3.9577836411609497, + "grad_norm": 0.6299595236778259, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 3000 + }, + { + "epoch": 3.970976253298153, + "grad_norm": 0.6264058351516724, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 3010 + }, + { + "epoch": 3.984168865435356, + "grad_norm": 0.6419739723205566, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 3020 + }, + { + "epoch": 3.9973614775725594, + "grad_norm": 0.7737036943435669, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 3030 + }, + { + "epoch": 4.0, + "eval_loss": 1.2454297542572021, + "eval_runtime": 71.8558, + "eval_samples_per_second": 5.998, + "eval_steps_per_second": 0.752, + "step": 3032 + }, + { + "epoch": 4.010554089709762, + "grad_norm": 1.092727541923523, + "learning_rate": 0.0002, + "loss": 0.6716, + "step": 3040 + }, + { + "epoch": 4.0237467018469655, + "grad_norm": 0.8087759613990784, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 3050 + }, + { + "epoch": 4.036939313984169, + "grad_norm": 0.8106053471565247, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 3060 + }, + { + "epoch": 4.050131926121372, + "grad_norm": 0.8675326704978943, + "learning_rate": 0.0002, + "loss": 0.6846, + "step": 3070 + }, + { + "epoch": 4.063324538258575, + "grad_norm": 0.9620490074157715, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 3080 + }, + { + "epoch": 4.076517150395778, + "grad_norm": 0.8996296525001526, + "learning_rate": 0.0002, + "loss": 0.6047, + "step": 3090 + }, + { + "epoch": 4.089709762532982, + "grad_norm": 0.8648998737335205, + "learning_rate": 0.0002, + "loss": 0.6111, + "step": 3100 + }, + { + "epoch": 4.102902374670185, + "grad_norm": 1.0321335792541504, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 3110 + }, + { + "epoch": 4.116094986807388, + "grad_norm": 0.7949225306510925, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 3120 + }, + { + "epoch": 4.129287598944591, + "grad_norm": 0.9684646129608154, + "learning_rate": 0.0002, + "loss": 0.6354, + "step": 3130 + }, + { + "epoch": 4.142480211081795, + "grad_norm": 0.8698066473007202, + "learning_rate": 0.0002, + "loss": 0.6198, + "step": 3140 + }, + { + "epoch": 4.155672823218997, + "grad_norm": 0.7688450813293457, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 3150 + }, + { + "epoch": 4.1688654353562, + "grad_norm": 0.9682092070579529, + "learning_rate": 0.0002, + "loss": 0.6053, + "step": 3160 + }, + { + "epoch": 4.1820580474934035, + "grad_norm": 0.961561918258667, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 3170 + }, + { + "epoch": 4.195250659630607, + "grad_norm": 1.3962990045547485, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 3180 + }, + { + "epoch": 4.20844327176781, + "grad_norm": 0.9485045075416565, + "learning_rate": 0.0002, + "loss": 0.6319, + "step": 3190 + }, + { + "epoch": 4.221635883905013, + "grad_norm": 0.7768281698226929, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 3200 + }, + { + "epoch": 4.2348284960422165, + "grad_norm": 1.2685691118240356, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 3210 + }, + { + "epoch": 4.24802110817942, + "grad_norm": 0.6876471638679504, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 3220 + }, + { + "epoch": 4.261213720316623, + "grad_norm": 1.0074554681777954, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 3230 + }, + { + "epoch": 4.274406332453826, + "grad_norm": 0.8094777464866638, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 3240 + }, + { + "epoch": 4.287598944591029, + "grad_norm": 0.7906569242477417, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 3250 + }, + { + "epoch": 4.300791556728232, + "grad_norm": 0.840238630771637, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 3260 + }, + { + "epoch": 4.313984168865435, + "grad_norm": 1.0119295120239258, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 3270 + }, + { + "epoch": 4.327176781002638, + "grad_norm": 0.7943191528320312, + "learning_rate": 0.0002, + "loss": 0.5912, + "step": 3280 + }, + { + "epoch": 4.3403693931398415, + "grad_norm": 0.7691723704338074, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 3290 + }, + { + "epoch": 4.353562005277045, + "grad_norm": 0.7227770686149597, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 3300 + }, + { + "epoch": 4.366754617414248, + "grad_norm": 0.8512253165245056, + "learning_rate": 0.0002, + "loss": 0.6047, + "step": 3310 + }, + { + "epoch": 4.379947229551451, + "grad_norm": 0.7852529287338257, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 3320 + }, + { + "epoch": 4.3931398416886545, + "grad_norm": 0.8888797163963318, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 3330 + }, + { + "epoch": 4.406332453825858, + "grad_norm": 0.9522430896759033, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 3340 + }, + { + "epoch": 4.419525065963061, + "grad_norm": 0.900276780128479, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 3350 + }, + { + "epoch": 4.432717678100264, + "grad_norm": 1.181547999382019, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 3360 + }, + { + "epoch": 4.445910290237467, + "grad_norm": 0.903142511844635, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 3370 + }, + { + "epoch": 4.45910290237467, + "grad_norm": 0.8747565150260925, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 3380 + }, + { + "epoch": 4.472295514511873, + "grad_norm": 0.7838051319122314, + "learning_rate": 0.0002, + "loss": 0.7071, + "step": 3390 + }, + { + "epoch": 4.485488126649076, + "grad_norm": 0.8691313862800598, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 3400 + }, + { + "epoch": 4.4986807387862795, + "grad_norm": 0.8493868708610535, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 3410 + }, + { + "epoch": 4.511873350923483, + "grad_norm": 1.0104830265045166, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 3420 + }, + { + "epoch": 4.525065963060686, + "grad_norm": 1.1716967821121216, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 3430 + }, + { + "epoch": 4.538258575197889, + "grad_norm": 0.9122593998908997, + "learning_rate": 0.0002, + "loss": 0.6411, + "step": 3440 + }, + { + "epoch": 4.5514511873350925, + "grad_norm": 0.829090416431427, + "learning_rate": 0.0002, + "loss": 0.7047, + "step": 3450 + }, + { + "epoch": 4.564643799472296, + "grad_norm": 1.141662836074829, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 3460 + }, + { + "epoch": 4.577836411609499, + "grad_norm": 0.8423182368278503, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 3470 + }, + { + "epoch": 4.591029023746702, + "grad_norm": 0.8024184703826904, + "learning_rate": 0.0002, + "loss": 0.6797, + "step": 3480 + }, + { + "epoch": 4.6042216358839045, + "grad_norm": 0.7703381776809692, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 3490 + }, + { + "epoch": 4.617414248021108, + "grad_norm": 0.9883959293365479, + "learning_rate": 0.0002, + "loss": 0.7001, + "step": 3500 + }, + { + "epoch": 4.630606860158311, + "grad_norm": 0.9554709196090698, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 3510 + }, + { + "epoch": 4.643799472295514, + "grad_norm": 1.9949709177017212, + "learning_rate": 0.0002, + "loss": 0.7378, + "step": 3520 + }, + { + "epoch": 4.6569920844327175, + "grad_norm": 0.7762255072593689, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 3530 + }, + { + "epoch": 4.670184696569921, + "grad_norm": 0.9538425803184509, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 3540 + }, + { + "epoch": 4.683377308707124, + "grad_norm": 1.0279661417007446, + "learning_rate": 0.0002, + "loss": 0.6352, + "step": 3550 + }, + { + "epoch": 4.696569920844327, + "grad_norm": 0.7545472979545593, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 3560 + }, + { + "epoch": 4.7097625329815305, + "grad_norm": 0.8919376730918884, + "learning_rate": 0.0002, + "loss": 0.6887, + "step": 3570 + }, + { + "epoch": 4.722955145118734, + "grad_norm": 0.7621569633483887, + "learning_rate": 0.0002, + "loss": 0.6395, + "step": 3580 + }, + { + "epoch": 4.736147757255937, + "grad_norm": 1.205320119857788, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 3590 + }, + { + "epoch": 4.74934036939314, + "grad_norm": 1.0642725229263306, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 3600 + }, + { + "epoch": 4.762532981530343, + "grad_norm": 0.9402666687965393, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 3610 + }, + { + "epoch": 4.775725593667546, + "grad_norm": 1.254127025604248, + "learning_rate": 0.0002, + "loss": 0.6395, + "step": 3620 + }, + { + "epoch": 4.788918205804749, + "grad_norm": 0.7609598636627197, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 3630 + }, + { + "epoch": 4.802110817941952, + "grad_norm": 0.8240329623222351, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 3640 + }, + { + "epoch": 4.8153034300791555, + "grad_norm": 0.8356260657310486, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 3650 + }, + { + "epoch": 4.828496042216359, + "grad_norm": 0.9130708575248718, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 3660 + }, + { + "epoch": 4.841688654353562, + "grad_norm": 0.9384765028953552, + "learning_rate": 0.0002, + "loss": 0.7269, + "step": 3670 + }, + { + "epoch": 4.854881266490765, + "grad_norm": 0.9829966425895691, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 3680 + }, + { + "epoch": 4.8680738786279685, + "grad_norm": 1.0488632917404175, + "learning_rate": 0.0002, + "loss": 0.6311, + "step": 3690 + }, + { + "epoch": 4.881266490765172, + "grad_norm": 1.2278969287872314, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 3700 + }, + { + "epoch": 4.894459102902375, + "grad_norm": 0.8078970313072205, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 3710 + }, + { + "epoch": 4.907651715039578, + "grad_norm": 0.8081700205802917, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 3720 + }, + { + "epoch": 4.9208443271767806, + "grad_norm": 0.9204511046409607, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 3730 + }, + { + "epoch": 4.934036939313984, + "grad_norm": 0.9326391220092773, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 3740 + }, + { + "epoch": 4.947229551451187, + "grad_norm": 1.0089969635009766, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 3750 + }, + { + "epoch": 4.96042216358839, + "grad_norm": 0.7063466906547546, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 3760 + }, + { + "epoch": 4.9736147757255935, + "grad_norm": 1.2603905200958252, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 3770 + }, + { + "epoch": 4.986807387862797, + "grad_norm": 0.8418653607368469, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 3780 + }, + { + "epoch": 5.0, + "grad_norm": 0.9537181854248047, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 3790 + }, + { + "epoch": 5.0, + "eval_loss": 1.3319307565689087, + "eval_runtime": 71.7836, + "eval_samples_per_second": 6.004, + "eval_steps_per_second": 0.752, + "step": 3790 + }, + { + "epoch": 5.013192612137203, + "grad_norm": 0.8595899343490601, + "learning_rate": 0.0002, + "loss": 0.489, + "step": 3800 + }, + { + "epoch": 5.0263852242744065, + "grad_norm": 1.0023565292358398, + "learning_rate": 0.0002, + "loss": 0.5155, + "step": 3810 + }, + { + "epoch": 5.03957783641161, + "grad_norm": 1.2770460844039917, + "learning_rate": 0.0002, + "loss": 0.5321, + "step": 3820 + }, + { + "epoch": 5.052770448548813, + "grad_norm": 1.1701956987380981, + "learning_rate": 0.0002, + "loss": 0.5127, + "step": 3830 + }, + { + "epoch": 5.065963060686016, + "grad_norm": 0.812269926071167, + "learning_rate": 0.0002, + "loss": 0.5057, + "step": 3840 + }, + { + "epoch": 5.0791556728232194, + "grad_norm": 0.8186697363853455, + "learning_rate": 0.0002, + "loss": 0.4292, + "step": 3850 + }, + { + "epoch": 5.092348284960422, + "grad_norm": 1.052565097808838, + "learning_rate": 0.0002, + "loss": 0.4865, + "step": 3860 + }, + { + "epoch": 5.105540897097625, + "grad_norm": 0.9764705300331116, + "learning_rate": 0.0002, + "loss": 0.4947, + "step": 3870 + }, + { + "epoch": 5.118733509234828, + "grad_norm": 0.6973426938056946, + "learning_rate": 0.0002, + "loss": 0.471, + "step": 3880 + }, + { + "epoch": 5.1319261213720315, + "grad_norm": 1.2127928733825684, + "learning_rate": 0.0002, + "loss": 0.5565, + "step": 3890 + }, + { + "epoch": 5.145118733509235, + "grad_norm": 0.682807981967926, + "learning_rate": 0.0002, + "loss": 0.4122, + "step": 3900 + }, + { + "epoch": 5.158311345646438, + "grad_norm": 1.3575998544692993, + "learning_rate": 0.0002, + "loss": 0.6378, + "step": 3910 + }, + { + "epoch": 5.171503957783641, + "grad_norm": 1.2581931352615356, + "learning_rate": 0.0002, + "loss": 0.4624, + "step": 3920 + }, + { + "epoch": 5.1846965699208445, + "grad_norm": 1.0493637323379517, + "learning_rate": 0.0002, + "loss": 0.5092, + "step": 3930 + }, + { + "epoch": 5.197889182058048, + "grad_norm": 1.3519670963287354, + "learning_rate": 0.0002, + "loss": 0.4563, + "step": 3940 + }, + { + "epoch": 5.211081794195251, + "grad_norm": 1.0690566301345825, + "learning_rate": 0.0002, + "loss": 0.5414, + "step": 3950 + }, + { + "epoch": 5.224274406332454, + "grad_norm": 1.1171330213546753, + "learning_rate": 0.0002, + "loss": 0.5038, + "step": 3960 + }, + { + "epoch": 5.237467018469657, + "grad_norm": 1.055851697921753, + "learning_rate": 0.0002, + "loss": 0.4397, + "step": 3970 + }, + { + "epoch": 5.25065963060686, + "grad_norm": 0.8870180249214172, + "learning_rate": 0.0002, + "loss": 0.4964, + "step": 3980 + }, + { + "epoch": 5.263852242744063, + "grad_norm": 0.9688402414321899, + "learning_rate": 0.0002, + "loss": 0.5353, + "step": 3990 + }, + { + "epoch": 5.277044854881266, + "grad_norm": 0.8458422422409058, + "learning_rate": 0.0002, + "loss": 0.5192, + "step": 4000 + }, + { + "epoch": 5.2902374670184695, + "grad_norm": 0.908256471157074, + "learning_rate": 0.0002, + "loss": 0.5458, + "step": 4010 + }, + { + "epoch": 5.303430079155673, + "grad_norm": 1.0058149099349976, + "learning_rate": 0.0002, + "loss": 0.5102, + "step": 4020 + }, + { + "epoch": 5.316622691292876, + "grad_norm": 1.20364511013031, + "learning_rate": 0.0002, + "loss": 0.5322, + "step": 4030 + }, + { + "epoch": 5.329815303430079, + "grad_norm": 1.0135732889175415, + "learning_rate": 0.0002, + "loss": 0.5715, + "step": 4040 + }, + { + "epoch": 5.3430079155672825, + "grad_norm": 1.1094907522201538, + "learning_rate": 0.0002, + "loss": 0.4736, + "step": 4050 + }, + { + "epoch": 5.356200527704486, + "grad_norm": 1.0373083353042603, + "learning_rate": 0.0002, + "loss": 0.4912, + "step": 4060 + }, + { + "epoch": 5.369393139841689, + "grad_norm": 1.0952966213226318, + "learning_rate": 0.0002, + "loss": 0.5258, + "step": 4070 + }, + { + "epoch": 5.382585751978892, + "grad_norm": 1.1734952926635742, + "learning_rate": 0.0002, + "loss": 0.4892, + "step": 4080 + }, + { + "epoch": 5.395778364116095, + "grad_norm": 0.8217245936393738, + "learning_rate": 0.0002, + "loss": 0.4463, + "step": 4090 + }, + { + "epoch": 5.408970976253298, + "grad_norm": 1.0936307907104492, + "learning_rate": 0.0002, + "loss": 0.5271, + "step": 4100 + }, + { + "epoch": 5.422163588390501, + "grad_norm": 1.0198720693588257, + "learning_rate": 0.0002, + "loss": 0.509, + "step": 4110 + }, + { + "epoch": 5.435356200527704, + "grad_norm": 1.1105809211730957, + "learning_rate": 0.0002, + "loss": 0.5265, + "step": 4120 + }, + { + "epoch": 5.4485488126649075, + "grad_norm": 1.1817213296890259, + "learning_rate": 0.0002, + "loss": 0.4871, + "step": 4130 + }, + { + "epoch": 5.461741424802111, + "grad_norm": 1.126339077949524, + "learning_rate": 0.0002, + "loss": 0.4987, + "step": 4140 + }, + { + "epoch": 5.474934036939314, + "grad_norm": 0.9467914700508118, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 4150 + }, + { + "epoch": 5.488126649076517, + "grad_norm": 1.0335774421691895, + "learning_rate": 0.0002, + "loss": 0.5386, + "step": 4160 + }, + { + "epoch": 5.5013192612137205, + "grad_norm": 0.866211473941803, + "learning_rate": 0.0002, + "loss": 0.5122, + "step": 4170 + }, + { + "epoch": 5.514511873350924, + "grad_norm": 0.7422948479652405, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 4180 + }, + { + "epoch": 5.527704485488127, + "grad_norm": 1.2211135625839233, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 4190 + }, + { + "epoch": 5.540897097625329, + "grad_norm": 1.0371766090393066, + "learning_rate": 0.0002, + "loss": 0.5476, + "step": 4200 + }, + { + "epoch": 5.554089709762533, + "grad_norm": 0.9460630416870117, + "learning_rate": 0.0002, + "loss": 0.5941, + "step": 4210 + }, + { + "epoch": 5.567282321899736, + "grad_norm": 0.7972197532653809, + "learning_rate": 0.0002, + "loss": 0.4645, + "step": 4220 + }, + { + "epoch": 5.580474934036939, + "grad_norm": 1.0654675960540771, + "learning_rate": 0.0002, + "loss": 0.5087, + "step": 4230 + }, + { + "epoch": 5.593667546174142, + "grad_norm": 1.0776735544204712, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 4240 + }, + { + "epoch": 5.6068601583113455, + "grad_norm": 1.498723030090332, + "learning_rate": 0.0002, + "loss": 0.53, + "step": 4250 + }, + { + "epoch": 5.620052770448549, + "grad_norm": 1.006768822669983, + "learning_rate": 0.0002, + "loss": 0.4788, + "step": 4260 + }, + { + "epoch": 5.633245382585752, + "grad_norm": 0.9194242358207703, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 4270 + }, + { + "epoch": 5.646437994722955, + "grad_norm": 1.1028380393981934, + "learning_rate": 0.0002, + "loss": 0.5722, + "step": 4280 + }, + { + "epoch": 5.6596306068601585, + "grad_norm": 0.9972755312919617, + "learning_rate": 0.0002, + "loss": 0.5319, + "step": 4290 + }, + { + "epoch": 5.672823218997362, + "grad_norm": 1.0509438514709473, + "learning_rate": 0.0002, + "loss": 0.53, + "step": 4300 + }, + { + "epoch": 5.686015831134565, + "grad_norm": 1.064039945602417, + "learning_rate": 0.0002, + "loss": 0.4738, + "step": 4310 + }, + { + "epoch": 5.699208443271768, + "grad_norm": 0.9572229981422424, + "learning_rate": 0.0002, + "loss": 0.5401, + "step": 4320 + }, + { + "epoch": 5.7124010554089715, + "grad_norm": 0.9956564903259277, + "learning_rate": 0.0002, + "loss": 0.5173, + "step": 4330 + }, + { + "epoch": 5.725593667546174, + "grad_norm": 1.01974618434906, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 4340 + }, + { + "epoch": 5.738786279683377, + "grad_norm": 1.101328730583191, + "learning_rate": 0.0002, + "loss": 0.5111, + "step": 4350 + }, + { + "epoch": 5.75197889182058, + "grad_norm": 0.9971756935119629, + "learning_rate": 0.0002, + "loss": 0.5921, + "step": 4360 + }, + { + "epoch": 5.7651715039577835, + "grad_norm": 0.8579474687576294, + "learning_rate": 0.0002, + "loss": 0.5262, + "step": 4370 + }, + { + "epoch": 5.778364116094987, + "grad_norm": 0.9927367568016052, + "learning_rate": 0.0002, + "loss": 0.5106, + "step": 4380 + }, + { + "epoch": 5.79155672823219, + "grad_norm": 1.1183884143829346, + "learning_rate": 0.0002, + "loss": 0.5354, + "step": 4390 + }, + { + "epoch": 5.804749340369393, + "grad_norm": 0.7695905566215515, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 4400 + }, + { + "epoch": 5.8179419525065965, + "grad_norm": 1.1102122068405151, + "learning_rate": 0.0002, + "loss": 0.5137, + "step": 4410 + }, + { + "epoch": 5.8311345646438, + "grad_norm": 1.3201336860656738, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 4420 + }, + { + "epoch": 5.844327176781003, + "grad_norm": 1.1934558153152466, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 4430 + }, + { + "epoch": 5.857519788918205, + "grad_norm": 1.390870451927185, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 4440 + }, + { + "epoch": 5.870712401055409, + "grad_norm": 1.056314468383789, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 4450 + }, + { + "epoch": 5.883905013192612, + "grad_norm": 0.9797437191009521, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 4460 + }, + { + "epoch": 5.897097625329815, + "grad_norm": 1.2368146181106567, + "learning_rate": 0.0002, + "loss": 0.5479, + "step": 4470 + }, + { + "epoch": 5.910290237467018, + "grad_norm": 0.9062654376029968, + "learning_rate": 0.0002, + "loss": 0.5453, + "step": 4480 + }, + { + "epoch": 5.923482849604222, + "grad_norm": 1.8643536567687988, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 4490 + }, + { + "epoch": 5.936675461741425, + "grad_norm": 1.2977997064590454, + "learning_rate": 0.0002, + "loss": 0.5858, + "step": 4500 + }, + { + "epoch": 5.949868073878628, + "grad_norm": 0.8366201519966125, + "learning_rate": 0.0002, + "loss": 0.4815, + "step": 4510 + }, + { + "epoch": 5.963060686015831, + "grad_norm": 1.0210131406784058, + "learning_rate": 0.0002, + "loss": 0.5126, + "step": 4520 + }, + { + "epoch": 5.9762532981530345, + "grad_norm": 1.1287827491760254, + "learning_rate": 0.0002, + "loss": 0.5577, + "step": 4530 + }, + { + "epoch": 5.989445910290238, + "grad_norm": 1.0480493307113647, + "learning_rate": 0.0002, + "loss": 0.5053, + "step": 4540 + }, + { + "epoch": 6.0, + "eval_loss": 1.450880765914917, + "eval_runtime": 71.8135, + "eval_samples_per_second": 6.002, + "eval_steps_per_second": 0.752, + "step": 4548 + }, + { + "epoch": 6.002638522427441, + "grad_norm": 0.8589069247245789, + "learning_rate": 0.0002, + "loss": 0.5072, + "step": 4550 + }, + { + "epoch": 6.015831134564644, + "grad_norm": 1.467134714126587, + "learning_rate": 0.0002, + "loss": 0.4129, + "step": 4560 + }, + { + "epoch": 6.029023746701847, + "grad_norm": 1.1477625370025635, + "learning_rate": 0.0002, + "loss": 0.3739, + "step": 4570 + }, + { + "epoch": 6.04221635883905, + "grad_norm": 1.4254094362258911, + "learning_rate": 0.0002, + "loss": 0.3958, + "step": 4580 + }, + { + "epoch": 6.055408970976253, + "grad_norm": 1.3656290769577026, + "learning_rate": 0.0002, + "loss": 0.356, + "step": 4590 + }, + { + "epoch": 6.068601583113456, + "grad_norm": 0.9638674855232239, + "learning_rate": 0.0002, + "loss": 0.3626, + "step": 4600 + }, + { + "epoch": 6.08179419525066, + "grad_norm": 1.2654615640640259, + "learning_rate": 0.0002, + "loss": 0.3884, + "step": 4610 + }, + { + "epoch": 6.094986807387863, + "grad_norm": 1.4506969451904297, + "learning_rate": 0.0002, + "loss": 0.4659, + "step": 4620 + }, + { + "epoch": 6.108179419525066, + "grad_norm": 1.6596732139587402, + "learning_rate": 0.0002, + "loss": 0.3096, + "step": 4630 + }, + { + "epoch": 6.121372031662269, + "grad_norm": 1.5335280895233154, + "learning_rate": 0.0002, + "loss": 0.4005, + "step": 4640 + }, + { + "epoch": 6.1345646437994725, + "grad_norm": 1.0815565586090088, + "learning_rate": 0.0002, + "loss": 0.3999, + "step": 4650 + }, + { + "epoch": 6.147757255936676, + "grad_norm": 0.9995638132095337, + "learning_rate": 0.0002, + "loss": 0.4026, + "step": 4660 + }, + { + "epoch": 6.160949868073879, + "grad_norm": 0.8809106349945068, + "learning_rate": 0.0002, + "loss": 0.3548, + "step": 4670 + }, + { + "epoch": 6.174142480211081, + "grad_norm": 1.2946726083755493, + "learning_rate": 0.0002, + "loss": 0.4505, + "step": 4680 + }, + { + "epoch": 6.187335092348285, + "grad_norm": 1.311298131942749, + "learning_rate": 0.0002, + "loss": 0.4447, + "step": 4690 + }, + { + "epoch": 6.200527704485488, + "grad_norm": 1.229204535484314, + "learning_rate": 0.0002, + "loss": 0.4108, + "step": 4700 + }, + { + "epoch": 6.213720316622691, + "grad_norm": 1.0193822383880615, + "learning_rate": 0.0002, + "loss": 0.3764, + "step": 4710 + }, + { + "epoch": 6.226912928759894, + "grad_norm": 1.4438618421554565, + "learning_rate": 0.0002, + "loss": 0.3696, + "step": 4720 + }, + { + "epoch": 6.240105540897098, + "grad_norm": 1.4315637350082397, + "learning_rate": 0.0002, + "loss": 0.3979, + "step": 4730 + }, + { + "epoch": 6.253298153034301, + "grad_norm": 1.1291239261627197, + "learning_rate": 0.0002, + "loss": 0.4124, + "step": 4740 + }, + { + "epoch": 6.266490765171504, + "grad_norm": 0.9358022809028625, + "learning_rate": 0.0002, + "loss": 0.4337, + "step": 4750 + }, + { + "epoch": 6.279683377308707, + "grad_norm": 1.1260714530944824, + "learning_rate": 0.0002, + "loss": 0.3758, + "step": 4760 + }, + { + "epoch": 6.2928759894459105, + "grad_norm": 1.5400320291519165, + "learning_rate": 0.0002, + "loss": 0.4262, + "step": 4770 + }, + { + "epoch": 6.306068601583114, + "grad_norm": 1.6820714473724365, + "learning_rate": 0.0002, + "loss": 0.4105, + "step": 4780 + }, + { + "epoch": 6.319261213720317, + "grad_norm": 1.1937718391418457, + "learning_rate": 0.0002, + "loss": 0.4192, + "step": 4790 + }, + { + "epoch": 6.33245382585752, + "grad_norm": 1.4330145120620728, + "learning_rate": 0.0002, + "loss": 0.4519, + "step": 4800 + }, + { + "epoch": 6.345646437994723, + "grad_norm": 1.083373785018921, + "learning_rate": 0.0002, + "loss": 0.4173, + "step": 4810 + }, + { + "epoch": 6.358839050131926, + "grad_norm": 1.3013869524002075, + "learning_rate": 0.0002, + "loss": 0.4054, + "step": 4820 + }, + { + "epoch": 6.372031662269129, + "grad_norm": 1.1075547933578491, + "learning_rate": 0.0002, + "loss": 0.4177, + "step": 4830 + }, + { + "epoch": 6.385224274406332, + "grad_norm": 1.0480214357376099, + "learning_rate": 0.0002, + "loss": 0.3846, + "step": 4840 + }, + { + "epoch": 6.398416886543536, + "grad_norm": 1.3625658750534058, + "learning_rate": 0.0002, + "loss": 0.3924, + "step": 4850 + }, + { + "epoch": 6.411609498680739, + "grad_norm": 1.16606605052948, + "learning_rate": 0.0002, + "loss": 0.3964, + "step": 4860 + }, + { + "epoch": 6.424802110817942, + "grad_norm": 1.2435568571090698, + "learning_rate": 0.0002, + "loss": 0.4845, + "step": 4870 + }, + { + "epoch": 6.437994722955145, + "grad_norm": 1.4471954107284546, + "learning_rate": 0.0002, + "loss": 0.3847, + "step": 4880 + }, + { + "epoch": 6.4511873350923485, + "grad_norm": 1.2302275896072388, + "learning_rate": 0.0002, + "loss": 0.443, + "step": 4890 + }, + { + "epoch": 6.464379947229552, + "grad_norm": 1.2392226457595825, + "learning_rate": 0.0002, + "loss": 0.4458, + "step": 4900 + }, + { + "epoch": 6.477572559366755, + "grad_norm": 1.0497277975082397, + "learning_rate": 0.0002, + "loss": 0.4114, + "step": 4910 + }, + { + "epoch": 6.490765171503957, + "grad_norm": 1.3509557247161865, + "learning_rate": 0.0002, + "loss": 0.426, + "step": 4920 + }, + { + "epoch": 6.503957783641161, + "grad_norm": 1.340214729309082, + "learning_rate": 0.0002, + "loss": 0.4089, + "step": 4930 + }, + { + "epoch": 6.517150395778364, + "grad_norm": 1.283220648765564, + "learning_rate": 0.0002, + "loss": 0.4655, + "step": 4940 + }, + { + "epoch": 6.530343007915567, + "grad_norm": 1.0693278312683105, + "learning_rate": 0.0002, + "loss": 0.4205, + "step": 4950 + }, + { + "epoch": 6.54353562005277, + "grad_norm": 1.307997226715088, + "learning_rate": 0.0002, + "loss": 0.398, + "step": 4960 + }, + { + "epoch": 6.556728232189974, + "grad_norm": 1.1739027500152588, + "learning_rate": 0.0002, + "loss": 0.3844, + "step": 4970 + }, + { + "epoch": 6.569920844327177, + "grad_norm": 1.5694327354431152, + "learning_rate": 0.0002, + "loss": 0.4494, + "step": 4980 + }, + { + "epoch": 6.58311345646438, + "grad_norm": 0.9978346824645996, + "learning_rate": 0.0002, + "loss": 0.4535, + "step": 4990 + }, + { + "epoch": 6.596306068601583, + "grad_norm": 1.183057427406311, + "learning_rate": 0.0002, + "loss": 0.4755, + "step": 5000 + }, + { + "epoch": 6.6094986807387865, + "grad_norm": 1.1033718585968018, + "learning_rate": 0.0002, + "loss": 0.4688, + "step": 5010 + }, + { + "epoch": 6.62269129287599, + "grad_norm": 1.0699188709259033, + "learning_rate": 0.0002, + "loss": 0.4233, + "step": 5020 + }, + { + "epoch": 6.635883905013193, + "grad_norm": 1.491031289100647, + "learning_rate": 0.0002, + "loss": 0.4049, + "step": 5030 + }, + { + "epoch": 6.649076517150396, + "grad_norm": 0.7939618825912476, + "learning_rate": 0.0002, + "loss": 0.4257, + "step": 5040 + }, + { + "epoch": 6.662269129287599, + "grad_norm": 1.2883116006851196, + "learning_rate": 0.0002, + "loss": 0.4273, + "step": 5050 + }, + { + "epoch": 6.675461741424802, + "grad_norm": 1.3844388723373413, + "learning_rate": 0.0002, + "loss": 0.4376, + "step": 5060 + }, + { + "epoch": 6.688654353562005, + "grad_norm": 1.1823489665985107, + "learning_rate": 0.0002, + "loss": 0.4078, + "step": 5070 + }, + { + "epoch": 6.701846965699208, + "grad_norm": 1.310214638710022, + "learning_rate": 0.0002, + "loss": 0.4811, + "step": 5080 + }, + { + "epoch": 6.715039577836412, + "grad_norm": 1.6253955364227295, + "learning_rate": 0.0002, + "loss": 0.4675, + "step": 5090 + }, + { + "epoch": 6.728232189973615, + "grad_norm": 1.3344792127609253, + "learning_rate": 0.0002, + "loss": 0.4749, + "step": 5100 + }, + { + "epoch": 6.741424802110818, + "grad_norm": 1.3900614976882935, + "learning_rate": 0.0002, + "loss": 0.4051, + "step": 5110 + }, + { + "epoch": 6.754617414248021, + "grad_norm": 1.5122374296188354, + "learning_rate": 0.0002, + "loss": 0.3782, + "step": 5120 + }, + { + "epoch": 6.7678100263852246, + "grad_norm": 1.4738229513168335, + "learning_rate": 0.0002, + "loss": 0.4439, + "step": 5130 + }, + { + "epoch": 6.781002638522428, + "grad_norm": 1.0417664051055908, + "learning_rate": 0.0002, + "loss": 0.4237, + "step": 5140 + }, + { + "epoch": 6.79419525065963, + "grad_norm": 1.1339401006698608, + "learning_rate": 0.0002, + "loss": 0.486, + "step": 5150 + }, + { + "epoch": 6.807387862796833, + "grad_norm": 1.4377150535583496, + "learning_rate": 0.0002, + "loss": 0.4387, + "step": 5160 + }, + { + "epoch": 6.820580474934037, + "grad_norm": 1.3321975469589233, + "learning_rate": 0.0002, + "loss": 0.4375, + "step": 5170 + }, + { + "epoch": 6.83377308707124, + "grad_norm": 1.3799545764923096, + "learning_rate": 0.0002, + "loss": 0.4369, + "step": 5180 + }, + { + "epoch": 6.846965699208443, + "grad_norm": 0.864224374294281, + "learning_rate": 0.0002, + "loss": 0.4266, + "step": 5190 + }, + { + "epoch": 6.860158311345646, + "grad_norm": 1.0666139125823975, + "learning_rate": 0.0002, + "loss": 0.4455, + "step": 5200 + }, + { + "epoch": 6.87335092348285, + "grad_norm": 1.2926141023635864, + "learning_rate": 0.0002, + "loss": 0.4545, + "step": 5210 + }, + { + "epoch": 6.886543535620053, + "grad_norm": 1.2046207189559937, + "learning_rate": 0.0002, + "loss": 0.4441, + "step": 5220 + }, + { + "epoch": 6.899736147757256, + "grad_norm": 1.3961530923843384, + "learning_rate": 0.0002, + "loss": 0.4458, + "step": 5230 + }, + { + "epoch": 6.912928759894459, + "grad_norm": 1.1340336799621582, + "learning_rate": 0.0002, + "loss": 0.4343, + "step": 5240 + }, + { + "epoch": 6.926121372031663, + "grad_norm": 1.1756815910339355, + "learning_rate": 0.0002, + "loss": 0.4491, + "step": 5250 + }, + { + "epoch": 6.939313984168866, + "grad_norm": 1.146964192390442, + "learning_rate": 0.0002, + "loss": 0.4077, + "step": 5260 + }, + { + "epoch": 6.952506596306069, + "grad_norm": 1.2974623441696167, + "learning_rate": 0.0002, + "loss": 0.4232, + "step": 5270 + }, + { + "epoch": 6.965699208443271, + "grad_norm": 1.342126727104187, + "learning_rate": 0.0002, + "loss": 0.4126, + "step": 5280 + }, + { + "epoch": 6.978891820580475, + "grad_norm": 1.2475614547729492, + "learning_rate": 0.0002, + "loss": 0.4537, + "step": 5290 + }, + { + "epoch": 6.992084432717678, + "grad_norm": 1.254935622215271, + "learning_rate": 0.0002, + "loss": 0.456, + "step": 5300 + }, + { + "epoch": 7.0, + "eval_loss": 1.5579944849014282, + "eval_runtime": 71.7131, + "eval_samples_per_second": 6.01, + "eval_steps_per_second": 0.753, + "step": 5306 + }, + { + "epoch": 7.005277044854881, + "grad_norm": 0.7949880361557007, + "learning_rate": 0.0002, + "loss": 0.3784, + "step": 5310 + }, + { + "epoch": 7.018469656992084, + "grad_norm": 2.0586414337158203, + "learning_rate": 0.0002, + "loss": 0.3216, + "step": 5320 + }, + { + "epoch": 7.031662269129288, + "grad_norm": 1.0757979154586792, + "learning_rate": 0.0002, + "loss": 0.3071, + "step": 5330 + }, + { + "epoch": 7.044854881266491, + "grad_norm": 0.9700984358787537, + "learning_rate": 0.0002, + "loss": 0.2836, + "step": 5340 + }, + { + "epoch": 7.058047493403694, + "grad_norm": 1.016965389251709, + "learning_rate": 0.0002, + "loss": 0.2536, + "step": 5350 + }, + { + "epoch": 7.071240105540897, + "grad_norm": 1.223994493484497, + "learning_rate": 0.0002, + "loss": 0.3233, + "step": 5360 + }, + { + "epoch": 7.084432717678101, + "grad_norm": 2.044800043106079, + "learning_rate": 0.0002, + "loss": 0.3012, + "step": 5370 + }, + { + "epoch": 7.097625329815304, + "grad_norm": 1.1677180528640747, + "learning_rate": 0.0002, + "loss": 0.304, + "step": 5380 + }, + { + "epoch": 7.110817941952506, + "grad_norm": 1.8017300367355347, + "learning_rate": 0.0002, + "loss": 0.3193, + "step": 5390 + }, + { + "epoch": 7.124010554089709, + "grad_norm": 1.1814491748809814, + "learning_rate": 0.0002, + "loss": 0.3322, + "step": 5400 + }, + { + "epoch": 7.137203166226913, + "grad_norm": 1.835221767425537, + "learning_rate": 0.0002, + "loss": 0.3423, + "step": 5410 + }, + { + "epoch": 7.150395778364116, + "grad_norm": 1.7413564920425415, + "learning_rate": 0.0002, + "loss": 0.3179, + "step": 5420 + }, + { + "epoch": 7.163588390501319, + "grad_norm": 1.4341952800750732, + "learning_rate": 0.0002, + "loss": 0.2946, + "step": 5430 + }, + { + "epoch": 7.176781002638522, + "grad_norm": 1.1618049144744873, + "learning_rate": 0.0002, + "loss": 0.3111, + "step": 5440 + }, + { + "epoch": 7.189973614775726, + "grad_norm": 1.2117347717285156, + "learning_rate": 0.0002, + "loss": 0.3326, + "step": 5450 + }, + { + "epoch": 7.203166226912929, + "grad_norm": 1.4826463460922241, + "learning_rate": 0.0002, + "loss": 0.3403, + "step": 5460 + }, + { + "epoch": 7.216358839050132, + "grad_norm": 1.112357497215271, + "learning_rate": 0.0002, + "loss": 0.3087, + "step": 5470 + }, + { + "epoch": 7.229551451187335, + "grad_norm": 1.1144609451293945, + "learning_rate": 0.0002, + "loss": 0.3162, + "step": 5480 + }, + { + "epoch": 7.242744063324539, + "grad_norm": 1.2441258430480957, + "learning_rate": 0.0002, + "loss": 0.3446, + "step": 5490 + }, + { + "epoch": 7.255936675461742, + "grad_norm": 1.0532526969909668, + "learning_rate": 0.0002, + "loss": 0.341, + "step": 5500 + }, + { + "epoch": 7.269129287598945, + "grad_norm": 1.4295402765274048, + "learning_rate": 0.0002, + "loss": 0.3251, + "step": 5510 + }, + { + "epoch": 7.282321899736147, + "grad_norm": 1.3890503644943237, + "learning_rate": 0.0002, + "loss": 0.3254, + "step": 5520 + }, + { + "epoch": 7.295514511873351, + "grad_norm": 0.919006884098053, + "learning_rate": 0.0002, + "loss": 0.3459, + "step": 5530 + }, + { + "epoch": 7.308707124010554, + "grad_norm": 1.2184085845947266, + "learning_rate": 0.0002, + "loss": 0.3313, + "step": 5540 + }, + { + "epoch": 7.321899736147757, + "grad_norm": 1.0661242008209229, + "learning_rate": 0.0002, + "loss": 0.3581, + "step": 5550 + }, + { + "epoch": 7.33509234828496, + "grad_norm": 1.331189751625061, + "learning_rate": 0.0002, + "loss": 0.3211, + "step": 5560 + }, + { + "epoch": 7.348284960422164, + "grad_norm": 1.1899065971374512, + "learning_rate": 0.0002, + "loss": 0.3303, + "step": 5570 + }, + { + "epoch": 7.361477572559367, + "grad_norm": 0.9958152174949646, + "learning_rate": 0.0002, + "loss": 0.3345, + "step": 5580 + }, + { + "epoch": 7.37467018469657, + "grad_norm": 1.2326462268829346, + "learning_rate": 0.0002, + "loss": 0.311, + "step": 5590 + }, + { + "epoch": 7.387862796833773, + "grad_norm": 1.4610025882720947, + "learning_rate": 0.0002, + "loss": 0.3459, + "step": 5600 + }, + { + "epoch": 7.401055408970977, + "grad_norm": 1.0228832960128784, + "learning_rate": 0.0002, + "loss": 0.3343, + "step": 5610 + }, + { + "epoch": 7.41424802110818, + "grad_norm": 1.2726085186004639, + "learning_rate": 0.0002, + "loss": 0.331, + "step": 5620 + }, + { + "epoch": 7.427440633245382, + "grad_norm": 1.1658830642700195, + "learning_rate": 0.0002, + "loss": 0.3156, + "step": 5630 + }, + { + "epoch": 7.440633245382585, + "grad_norm": 1.0791388750076294, + "learning_rate": 0.0002, + "loss": 0.3463, + "step": 5640 + }, + { + "epoch": 7.453825857519789, + "grad_norm": 1.4051549434661865, + "learning_rate": 0.0002, + "loss": 0.3457, + "step": 5650 + }, + { + "epoch": 7.467018469656992, + "grad_norm": 1.7039124965667725, + "learning_rate": 0.0002, + "loss": 0.3251, + "step": 5660 + }, + { + "epoch": 7.480211081794195, + "grad_norm": 1.5712453126907349, + "learning_rate": 0.0002, + "loss": 0.3655, + "step": 5670 + }, + { + "epoch": 7.493403693931398, + "grad_norm": 1.1755692958831787, + "learning_rate": 0.0002, + "loss": 0.3759, + "step": 5680 + }, + { + "epoch": 7.506596306068602, + "grad_norm": 0.7768910527229309, + "learning_rate": 0.0002, + "loss": 0.3212, + "step": 5690 + }, + { + "epoch": 7.519788918205805, + "grad_norm": 1.34855318069458, + "learning_rate": 0.0002, + "loss": 0.3953, + "step": 5700 + }, + { + "epoch": 7.532981530343008, + "grad_norm": 1.326443076133728, + "learning_rate": 0.0002, + "loss": 0.3139, + "step": 5710 + }, + { + "epoch": 7.546174142480211, + "grad_norm": 1.2597885131835938, + "learning_rate": 0.0002, + "loss": 0.3536, + "step": 5720 + }, + { + "epoch": 7.559366754617415, + "grad_norm": 1.0863240957260132, + "learning_rate": 0.0002, + "loss": 0.334, + "step": 5730 + }, + { + "epoch": 7.572559366754618, + "grad_norm": 1.2254612445831299, + "learning_rate": 0.0002, + "loss": 0.3408, + "step": 5740 + }, + { + "epoch": 7.585751978891821, + "grad_norm": 1.4157414436340332, + "learning_rate": 0.0002, + "loss": 0.3675, + "step": 5750 + }, + { + "epoch": 7.598944591029023, + "grad_norm": 1.1378470659255981, + "learning_rate": 0.0002, + "loss": 0.3843, + "step": 5760 + }, + { + "epoch": 7.612137203166227, + "grad_norm": 1.1139744520187378, + "learning_rate": 0.0002, + "loss": 0.3812, + "step": 5770 + }, + { + "epoch": 7.62532981530343, + "grad_norm": 1.3163728713989258, + "learning_rate": 0.0002, + "loss": 0.3238, + "step": 5780 + }, + { + "epoch": 7.638522427440633, + "grad_norm": 1.0113680362701416, + "learning_rate": 0.0002, + "loss": 0.3459, + "step": 5790 + }, + { + "epoch": 7.651715039577836, + "grad_norm": 0.918424665927887, + "learning_rate": 0.0002, + "loss": 0.3554, + "step": 5800 + }, + { + "epoch": 7.66490765171504, + "grad_norm": 1.1702263355255127, + "learning_rate": 0.0002, + "loss": 0.3949, + "step": 5810 + }, + { + "epoch": 7.678100263852243, + "grad_norm": 1.4807580709457397, + "learning_rate": 0.0002, + "loss": 0.3378, + "step": 5820 + }, + { + "epoch": 7.691292875989446, + "grad_norm": 1.0703623294830322, + "learning_rate": 0.0002, + "loss": 0.3677, + "step": 5830 + }, + { + "epoch": 7.704485488126649, + "grad_norm": 1.2308809757232666, + "learning_rate": 0.0002, + "loss": 0.3524, + "step": 5840 + }, + { + "epoch": 7.717678100263853, + "grad_norm": 1.212863564491272, + "learning_rate": 0.0002, + "loss": 0.3326, + "step": 5850 + }, + { + "epoch": 7.730870712401055, + "grad_norm": 1.0400227308273315, + "learning_rate": 0.0002, + "loss": 0.3909, + "step": 5860 + }, + { + "epoch": 7.744063324538258, + "grad_norm": 1.2876183986663818, + "learning_rate": 0.0002, + "loss": 0.3312, + "step": 5870 + }, + { + "epoch": 7.757255936675461, + "grad_norm": 1.0517319440841675, + "learning_rate": 0.0002, + "loss": 0.3149, + "step": 5880 + }, + { + "epoch": 7.770448548812665, + "grad_norm": 1.091901183128357, + "learning_rate": 0.0002, + "loss": 0.3777, + "step": 5890 + }, + { + "epoch": 7.783641160949868, + "grad_norm": 1.3892148733139038, + "learning_rate": 0.0002, + "loss": 0.3959, + "step": 5900 + }, + { + "epoch": 7.796833773087071, + "grad_norm": 1.4618996381759644, + "learning_rate": 0.0002, + "loss": 0.3991, + "step": 5910 + }, + { + "epoch": 7.810026385224274, + "grad_norm": 1.3962730169296265, + "learning_rate": 0.0002, + "loss": 0.3611, + "step": 5920 + }, + { + "epoch": 7.823218997361478, + "grad_norm": 1.249474048614502, + "learning_rate": 0.0002, + "loss": 0.3597, + "step": 5930 + }, + { + "epoch": 7.836411609498681, + "grad_norm": 1.3841967582702637, + "learning_rate": 0.0002, + "loss": 0.3604, + "step": 5940 + }, + { + "epoch": 7.849604221635884, + "grad_norm": 1.2477777004241943, + "learning_rate": 0.0002, + "loss": 0.3417, + "step": 5950 + }, + { + "epoch": 7.862796833773087, + "grad_norm": 1.3400548696517944, + "learning_rate": 0.0002, + "loss": 0.3568, + "step": 5960 + }, + { + "epoch": 7.875989445910291, + "grad_norm": 1.383649468421936, + "learning_rate": 0.0002, + "loss": 0.372, + "step": 5970 + }, + { + "epoch": 7.889182058047494, + "grad_norm": 1.124591588973999, + "learning_rate": 0.0002, + "loss": 0.3554, + "step": 5980 + }, + { + "epoch": 7.902374670184696, + "grad_norm": 1.2731496095657349, + "learning_rate": 0.0002, + "loss": 0.3458, + "step": 5990 + }, + { + "epoch": 7.915567282321899, + "grad_norm": 1.61614990234375, + "learning_rate": 0.0002, + "loss": 0.3558, + "step": 6000 + }, + { + "epoch": 7.928759894459103, + "grad_norm": 1.0083316564559937, + "learning_rate": 0.0002, + "loss": 0.35, + "step": 6010 + }, + { + "epoch": 7.941952506596306, + "grad_norm": 1.3074530363082886, + "learning_rate": 0.0002, + "loss": 0.3657, + "step": 6020 + }, + { + "epoch": 7.955145118733509, + "grad_norm": 1.3631811141967773, + "learning_rate": 0.0002, + "loss": 0.3872, + "step": 6030 + }, + { + "epoch": 7.968337730870712, + "grad_norm": 1.3127434253692627, + "learning_rate": 0.0002, + "loss": 0.3371, + "step": 6040 + }, + { + "epoch": 7.981530343007916, + "grad_norm": 1.6356911659240723, + "learning_rate": 0.0002, + "loss": 0.3446, + "step": 6050 + }, + { + "epoch": 7.994722955145119, + "grad_norm": 1.2134562730789185, + "learning_rate": 0.0002, + "loss": 0.3513, + "step": 6060 + }, + { + "epoch": 8.0, + "eval_loss": 1.6480025053024292, + "eval_runtime": 71.5419, + "eval_samples_per_second": 6.024, + "eval_steps_per_second": 0.755, + "step": 6064 + } + ], + "logging_steps": 10, + "max_steps": 6064, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.8062834215406797e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..0c0289389bcb2236bf804b0a82c50e67c11a1379 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-6064/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e04722e246164eebe6b4f70dd50fe946c936a2bd25edd75bcedcfd4c7ff9474e +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/README.md b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1dda90d5cdf5953c305537a5c4dc2a7e97c1e4c0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89a0bd43fb22cc13a9768677096e302fb2d76a95e2ff828ae829a567e0763cea +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..bab08f6ef2c9b54dbc425e3bb1edc419e5e08206 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11c21807ab5168ac0a4ead2be60f4c13042ddf52c66a8b8ca69bc002c1e8dea7 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..09514235cff1c0ea3c7ce6052bbae6b193af6217 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fba17ad1a8d948c05f3e58f0bec6dd0e8164f67341e090c9f51e6e482c7b6db +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..38631c6be45038fde619ee85a912e3ed4ee682d2 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c155da7d7721b1c1612882c998cef1b0caf8c39541aff6b4e61f7d33dc46f52c +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d6bfde5994654e9b2566b4fd18c54b438e8f4872 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/trainer_state.json @@ -0,0 +1,566 @@ +{ + "best_metric": 1.203244686126709, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758", + "epoch": 1.0, + "eval_steps": 10, + "global_step": 758, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013192612137203167, + "grad_norm": 0.7545632123947144, + "learning_rate": 0.0002, + "loss": 1.9466, + "step": 10 + }, + { + "epoch": 0.026385224274406333, + "grad_norm": 0.5787661075592041, + "learning_rate": 0.0002, + "loss": 1.4909, + "step": 20 + }, + { + "epoch": 0.0395778364116095, + "grad_norm": 0.8616093993186951, + "learning_rate": 0.0002, + "loss": 1.3906, + "step": 30 + }, + { + "epoch": 0.052770448548812667, + "grad_norm": 0.42088547348976135, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 40 + }, + { + "epoch": 0.06596306068601583, + "grad_norm": 0.47704678773880005, + "learning_rate": 0.0002, + "loss": 1.3446, + "step": 50 + }, + { + "epoch": 0.079155672823219, + "grad_norm": 0.5763994455337524, + "learning_rate": 0.0002, + "loss": 1.2476, + "step": 60 + }, + { + "epoch": 0.09234828496042216, + "grad_norm": 0.4579846262931824, + "learning_rate": 0.0002, + "loss": 1.2268, + "step": 70 + }, + { + "epoch": 0.10554089709762533, + "grad_norm": 0.46623846888542175, + "learning_rate": 0.0002, + "loss": 1.415, + "step": 80 + }, + { + "epoch": 0.11873350923482849, + "grad_norm": 0.4206956624984741, + "learning_rate": 0.0002, + "loss": 1.2849, + "step": 90 + }, + { + "epoch": 0.13192612137203166, + "grad_norm": 0.41896629333496094, + "learning_rate": 0.0002, + "loss": 1.1608, + "step": 100 + }, + { + "epoch": 0.14511873350923482, + "grad_norm": 0.3459089398384094, + "learning_rate": 0.0002, + "loss": 1.2617, + "step": 110 + }, + { + "epoch": 0.158311345646438, + "grad_norm": 0.4587327837944031, + "learning_rate": 0.0002, + "loss": 1.2858, + "step": 120 + }, + { + "epoch": 0.17150395778364116, + "grad_norm": 0.433525413274765, + "learning_rate": 0.0002, + "loss": 1.2241, + "step": 130 + }, + { + "epoch": 0.18469656992084432, + "grad_norm": 0.39253175258636475, + "learning_rate": 0.0002, + "loss": 1.2269, + "step": 140 + }, + { + "epoch": 0.19788918205804748, + "grad_norm": 0.3602290749549866, + "learning_rate": 0.0002, + "loss": 1.2224, + "step": 150 + }, + { + "epoch": 0.21108179419525067, + "grad_norm": 0.41160839796066284, + "learning_rate": 0.0002, + "loss": 1.2834, + "step": 160 + }, + { + "epoch": 0.22427440633245382, + "grad_norm": 0.7213630080223083, + "learning_rate": 0.0002, + "loss": 1.1986, + "step": 170 + }, + { + "epoch": 0.23746701846965698, + "grad_norm": 0.39086055755615234, + "learning_rate": 0.0002, + "loss": 1.2215, + "step": 180 + }, + { + "epoch": 0.25065963060686014, + "grad_norm": 0.4465520977973938, + "learning_rate": 0.0002, + "loss": 1.3315, + "step": 190 + }, + { + "epoch": 0.2638522427440633, + "grad_norm": 1.814679741859436, + "learning_rate": 0.0002, + "loss": 1.3798, + "step": 200 + }, + { + "epoch": 0.2770448548812665, + "grad_norm": 0.5026423931121826, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 210 + }, + { + "epoch": 0.29023746701846964, + "grad_norm": 0.4156292974948883, + "learning_rate": 0.0002, + "loss": 1.3306, + "step": 220 + }, + { + "epoch": 0.3034300791556728, + "grad_norm": 0.40813493728637695, + "learning_rate": 0.0002, + "loss": 1.266, + "step": 230 + }, + { + "epoch": 0.316622691292876, + "grad_norm": 0.3304787874221802, + "learning_rate": 0.0002, + "loss": 1.1533, + "step": 240 + }, + { + "epoch": 0.32981530343007914, + "grad_norm": 0.46139976382255554, + "learning_rate": 0.0002, + "loss": 1.3154, + "step": 250 + }, + { + "epoch": 0.34300791556728233, + "grad_norm": 0.37518271803855896, + "learning_rate": 0.0002, + "loss": 1.1365, + "step": 260 + }, + { + "epoch": 0.3562005277044855, + "grad_norm": 0.35586467385292053, + "learning_rate": 0.0002, + "loss": 1.2004, + "step": 270 + }, + { + "epoch": 0.36939313984168864, + "grad_norm": 0.32441186904907227, + "learning_rate": 0.0002, + "loss": 1.2609, + "step": 280 + }, + { + "epoch": 0.38258575197889183, + "grad_norm": 0.3198683261871338, + "learning_rate": 0.0002, + "loss": 1.212, + "step": 290 + }, + { + "epoch": 0.39577836411609496, + "grad_norm": 0.33663108944892883, + "learning_rate": 0.0002, + "loss": 1.2465, + "step": 300 + }, + { + "epoch": 0.40897097625329815, + "grad_norm": 0.3711244761943817, + "learning_rate": 0.0002, + "loss": 1.1702, + "step": 310 + }, + { + "epoch": 0.42216358839050133, + "grad_norm": 0.3209651708602905, + "learning_rate": 0.0002, + "loss": 1.0871, + "step": 320 + }, + { + "epoch": 0.43535620052770446, + "grad_norm": 0.5152716040611267, + "learning_rate": 0.0002, + "loss": 1.2728, + "step": 330 + }, + { + "epoch": 0.44854881266490765, + "grad_norm": 0.5431376695632935, + "learning_rate": 0.0002, + "loss": 1.1833, + "step": 340 + }, + { + "epoch": 0.46174142480211083, + "grad_norm": 0.3069997727870941, + "learning_rate": 0.0002, + "loss": 1.1344, + "step": 350 + }, + { + "epoch": 0.47493403693931396, + "grad_norm": 0.34260064363479614, + "learning_rate": 0.0002, + "loss": 1.2742, + "step": 360 + }, + { + "epoch": 0.48812664907651715, + "grad_norm": 0.345653235912323, + "learning_rate": 0.0002, + "loss": 1.3161, + "step": 370 + }, + { + "epoch": 0.5013192612137203, + "grad_norm": 0.46222734451293945, + "learning_rate": 0.0002, + "loss": 1.1552, + "step": 380 + }, + { + "epoch": 0.5145118733509235, + "grad_norm": 0.27301734685897827, + "learning_rate": 0.0002, + "loss": 1.0616, + "step": 390 + }, + { + "epoch": 0.5277044854881267, + "grad_norm": 0.29048439860343933, + "learning_rate": 0.0002, + "loss": 1.1944, + "step": 400 + }, + { + "epoch": 0.5408970976253298, + "grad_norm": 0.32927802205085754, + "learning_rate": 0.0002, + "loss": 1.0956, + "step": 410 + }, + { + "epoch": 0.554089709762533, + "grad_norm": 0.3336397409439087, + "learning_rate": 0.0002, + "loss": 1.2362, + "step": 420 + }, + { + "epoch": 0.5672823218997362, + "grad_norm": 0.4007597267627716, + "learning_rate": 0.0002, + "loss": 1.1445, + "step": 430 + }, + { + "epoch": 0.5804749340369393, + "grad_norm": 0.36144956946372986, + "learning_rate": 0.0002, + "loss": 1.2731, + "step": 440 + }, + { + "epoch": 0.5936675461741425, + "grad_norm": 0.6331009864807129, + "learning_rate": 0.0002, + "loss": 1.1604, + "step": 450 + }, + { + "epoch": 0.6068601583113457, + "grad_norm": 0.41469088196754456, + "learning_rate": 0.0002, + "loss": 1.1692, + "step": 460 + }, + { + "epoch": 0.6200527704485488, + "grad_norm": 0.4388185143470764, + "learning_rate": 0.0002, + "loss": 1.2372, + "step": 470 + }, + { + "epoch": 0.633245382585752, + "grad_norm": 0.3738141655921936, + "learning_rate": 0.0002, + "loss": 1.2541, + "step": 480 + }, + { + "epoch": 0.6464379947229552, + "grad_norm": 0.7212023138999939, + "learning_rate": 0.0002, + "loss": 1.1265, + "step": 490 + }, + { + "epoch": 0.6596306068601583, + "grad_norm": 0.2972351014614105, + "learning_rate": 0.0002, + "loss": 1.2786, + "step": 500 + }, + { + "epoch": 0.6728232189973615, + "grad_norm": 0.45293179154396057, + "learning_rate": 0.0002, + "loss": 1.1739, + "step": 510 + }, + { + "epoch": 0.6860158311345647, + "grad_norm": 0.4319860637187958, + "learning_rate": 0.0002, + "loss": 1.255, + "step": 520 + }, + { + "epoch": 0.6992084432717678, + "grad_norm": 0.3050215542316437, + "learning_rate": 0.0002, + "loss": 1.0959, + "step": 530 + }, + { + "epoch": 0.712401055408971, + "grad_norm": 0.3552611172199249, + "learning_rate": 0.0002, + "loss": 1.1608, + "step": 540 + }, + { + "epoch": 0.7255936675461742, + "grad_norm": 0.3631151020526886, + "learning_rate": 0.0002, + "loss": 1.2546, + "step": 550 + }, + { + "epoch": 0.7387862796833773, + "grad_norm": 0.28177931904792786, + "learning_rate": 0.0002, + "loss": 1.1371, + "step": 560 + }, + { + "epoch": 0.7519788918205804, + "grad_norm": 0.359764039516449, + "learning_rate": 0.0002, + "loss": 1.2081, + "step": 570 + }, + { + "epoch": 0.7651715039577837, + "grad_norm": 0.3970327377319336, + "learning_rate": 0.0002, + "loss": 1.1356, + "step": 580 + }, + { + "epoch": 0.7783641160949868, + "grad_norm": 0.3541001081466675, + "learning_rate": 0.0002, + "loss": 1.309, + "step": 590 + }, + { + "epoch": 0.7915567282321899, + "grad_norm": 0.3478573262691498, + "learning_rate": 0.0002, + "loss": 1.2061, + "step": 600 + }, + { + "epoch": 0.8047493403693932, + "grad_norm": 0.3900321424007416, + "learning_rate": 0.0002, + "loss": 1.1864, + "step": 610 + }, + { + "epoch": 0.8179419525065963, + "grad_norm": 0.3443238437175751, + "learning_rate": 0.0002, + "loss": 1.1358, + "step": 620 + }, + { + "epoch": 0.8311345646437994, + "grad_norm": 0.44238781929016113, + "learning_rate": 0.0002, + "loss": 1.3232, + "step": 630 + }, + { + "epoch": 0.8443271767810027, + "grad_norm": 0.36339467763900757, + "learning_rate": 0.0002, + "loss": 1.1247, + "step": 640 + }, + { + "epoch": 0.8575197889182058, + "grad_norm": 0.6243070363998413, + "learning_rate": 0.0002, + "loss": 1.1049, + "step": 650 + }, + { + "epoch": 0.8707124010554089, + "grad_norm": 0.3209173381328583, + "learning_rate": 0.0002, + "loss": 1.1943, + "step": 660 + }, + { + "epoch": 0.8839050131926122, + "grad_norm": 0.35017991065979004, + "learning_rate": 0.0002, + "loss": 1.1927, + "step": 670 + }, + { + "epoch": 0.8970976253298153, + "grad_norm": 0.3247159421443939, + "learning_rate": 0.0002, + "loss": 1.1603, + "step": 680 + }, + { + "epoch": 0.9102902374670184, + "grad_norm": 0.4091894030570984, + "learning_rate": 0.0002, + "loss": 1.2298, + "step": 690 + }, + { + "epoch": 0.9234828496042217, + "grad_norm": 0.3975585997104645, + "learning_rate": 0.0002, + "loss": 1.2756, + "step": 700 + }, + { + "epoch": 0.9366754617414248, + "grad_norm": 0.3666245937347412, + "learning_rate": 0.0002, + "loss": 1.281, + "step": 710 + }, + { + "epoch": 0.9498680738786279, + "grad_norm": 0.45216917991638184, + "learning_rate": 0.0002, + "loss": 1.2855, + "step": 720 + }, + { + "epoch": 0.9630606860158312, + "grad_norm": 0.36108118295669556, + "learning_rate": 0.0002, + "loss": 1.2217, + "step": 730 + }, + { + "epoch": 0.9762532981530343, + "grad_norm": 0.44550251960754395, + "learning_rate": 0.0002, + "loss": 1.17, + "step": 740 + }, + { + "epoch": 0.9894459102902374, + "grad_norm": 0.29801255464553833, + "learning_rate": 0.0002, + "loss": 1.0672, + "step": 750 + }, + { + "epoch": 1.0, + "eval_loss": 1.203244686126709, + "eval_runtime": 76.0457, + "eval_samples_per_second": 5.668, + "eval_steps_per_second": 0.71, + "step": 758 + } + ], + "logging_steps": 10, + "max_steps": 6064, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.5078542769258496e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..0c0289389bcb2236bf804b0a82c50e67c11a1379 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e04722e246164eebe6b4f70dd50fe946c936a2bd25edd75bcedcfd4c7ff9474e +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..0c0289389bcb2236bf804b0a82c50e67c11a1379 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e04722e246164eebe6b4f70dd50fe946c936a2bd25edd75bcedcfd4c7ff9474e +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/training_log.jsonl b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f13a8d4d46ee691b882c4fc40f061c5be91ac97e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/training_log.jsonl @@ -0,0 +1,8 @@ +{"epoch": 1.0, "step": 758, "epoch_duration": 2790.672851085663, "total_accumulated_duration": 2790.672851085663, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 9688.99365234375}, "avg_memory_reserved": {"GPU_0": 10406.0}, "peak_memory_reserved": {"GPU_0": 10406.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.9466, "grad_norm": 0.7545632123947144, "learning_rate": 0.0002, "epoch": 0.013192612137203167, "step": 10}, {"loss": 1.4909, "grad_norm": 0.5787661075592041, "learning_rate": 0.0002, "epoch": 0.026385224274406333, "step": 20}, {"loss": 1.3906, "grad_norm": 0.8616093993186951, "learning_rate": 0.0002, "epoch": 0.0395778364116095, "step": 30}, {"loss": 1.4116, "grad_norm": 0.42088547348976135, "learning_rate": 0.0002, "epoch": 0.052770448548812667, "step": 40}, {"loss": 1.3446, "grad_norm": 0.47704678773880005, "learning_rate": 0.0002, "epoch": 0.06596306068601583, "step": 50}, {"loss": 1.2476, "grad_norm": 0.5763994455337524, "learning_rate": 0.0002, "epoch": 0.079155672823219, "step": 60}, {"loss": 1.2268, "grad_norm": 0.4579846262931824, "learning_rate": 0.0002, "epoch": 0.09234828496042216, "step": 70}, {"loss": 1.415, "grad_norm": 0.46623846888542175, "learning_rate": 0.0002, "epoch": 0.10554089709762533, "step": 80}, {"loss": 1.2849, "grad_norm": 0.4206956624984741, "learning_rate": 0.0002, "epoch": 0.11873350923482849, "step": 90}, {"loss": 1.1608, "grad_norm": 0.41896629333496094, "learning_rate": 0.0002, "epoch": 0.13192612137203166, "step": 100}, {"loss": 1.2617, "grad_norm": 0.3459089398384094, "learning_rate": 0.0002, "epoch": 0.14511873350923482, "step": 110}, {"loss": 1.2858, "grad_norm": 0.4587327837944031, "learning_rate": 0.0002, "epoch": 0.158311345646438, "step": 120}, {"loss": 1.2241, "grad_norm": 0.433525413274765, "learning_rate": 0.0002, "epoch": 0.17150395778364116, "step": 130}, {"loss": 1.2269, "grad_norm": 0.39253175258636475, "learning_rate": 0.0002, "epoch": 0.18469656992084432, "step": 140}, {"loss": 1.2224, "grad_norm": 0.3602290749549866, "learning_rate": 0.0002, "epoch": 0.19788918205804748, "step": 150}, {"loss": 1.2834, "grad_norm": 0.41160839796066284, "learning_rate": 0.0002, "epoch": 0.21108179419525067, "step": 160}, {"loss": 1.1986, "grad_norm": 0.7213630080223083, "learning_rate": 0.0002, "epoch": 0.22427440633245382, "step": 170}, {"loss": 1.2215, "grad_norm": 0.39086055755615234, "learning_rate": 0.0002, "epoch": 0.23746701846965698, "step": 180}, {"loss": 1.3315, "grad_norm": 0.4465520977973938, "learning_rate": 0.0002, "epoch": 0.25065963060686014, "step": 190}, {"loss": 1.3798, "grad_norm": 1.814679741859436, "learning_rate": 0.0002, "epoch": 0.2638522427440633, "step": 200}, {"loss": 1.2259, "grad_norm": 0.5026423931121826, "learning_rate": 0.0002, "epoch": 0.2770448548812665, "step": 210}, {"loss": 1.3306, "grad_norm": 0.4156292974948883, "learning_rate": 0.0002, "epoch": 0.29023746701846964, "step": 220}, {"loss": 1.266, "grad_norm": 0.40813493728637695, "learning_rate": 0.0002, "epoch": 0.3034300791556728, "step": 230}, {"loss": 1.1533, "grad_norm": 0.3304787874221802, "learning_rate": 0.0002, "epoch": 0.316622691292876, "step": 240}, {"loss": 1.3154, "grad_norm": 0.46139976382255554, "learning_rate": 0.0002, "epoch": 0.32981530343007914, "step": 250}, {"loss": 1.1365, "grad_norm": 0.37518271803855896, "learning_rate": 0.0002, "epoch": 0.34300791556728233, "step": 260}, {"loss": 1.2004, "grad_norm": 0.35586467385292053, "learning_rate": 0.0002, "epoch": 0.3562005277044855, "step": 270}, {"loss": 1.2609, "grad_norm": 0.32441186904907227, "learning_rate": 0.0002, "epoch": 0.36939313984168864, "step": 280}, {"loss": 1.212, "grad_norm": 0.3198683261871338, "learning_rate": 0.0002, "epoch": 0.38258575197889183, "step": 290}, {"loss": 1.2465, "grad_norm": 0.33663108944892883, "learning_rate": 0.0002, "epoch": 0.39577836411609496, "step": 300}, {"loss": 1.1702, "grad_norm": 0.3711244761943817, "learning_rate": 0.0002, "epoch": 0.40897097625329815, "step": 310}, {"loss": 1.0871, "grad_norm": 0.3209651708602905, "learning_rate": 0.0002, "epoch": 0.42216358839050133, "step": 320}, {"loss": 1.2728, "grad_norm": 0.5152716040611267, "learning_rate": 0.0002, "epoch": 0.43535620052770446, "step": 330}, {"loss": 1.1833, "grad_norm": 0.5431376695632935, "learning_rate": 0.0002, "epoch": 0.44854881266490765, "step": 340}, {"loss": 1.1344, "grad_norm": 0.3069997727870941, "learning_rate": 0.0002, "epoch": 0.46174142480211083, "step": 350}, {"loss": 1.2742, "grad_norm": 0.34260064363479614, "learning_rate": 0.0002, "epoch": 0.47493403693931396, "step": 360}, {"loss": 1.3161, "grad_norm": 0.345653235912323, "learning_rate": 0.0002, "epoch": 0.48812664907651715, "step": 370}, {"loss": 1.1552, "grad_norm": 0.46222734451293945, "learning_rate": 0.0002, "epoch": 0.5013192612137203, "step": 380}, {"loss": 1.0616, "grad_norm": 0.27301734685897827, "learning_rate": 0.0002, "epoch": 0.5145118733509235, "step": 390}, {"loss": 1.1944, "grad_norm": 0.29048439860343933, "learning_rate": 0.0002, "epoch": 0.5277044854881267, "step": 400}, {"loss": 1.0956, "grad_norm": 0.32927802205085754, "learning_rate": 0.0002, "epoch": 0.5408970976253298, "step": 410}, {"loss": 1.2362, "grad_norm": 0.3336397409439087, "learning_rate": 0.0002, "epoch": 0.554089709762533, "step": 420}, {"loss": 1.1445, "grad_norm": 0.4007597267627716, "learning_rate": 0.0002, "epoch": 0.5672823218997362, "step": 430}, {"loss": 1.2731, "grad_norm": 0.36144956946372986, "learning_rate": 0.0002, "epoch": 0.5804749340369393, "step": 440}, {"loss": 1.1604, "grad_norm": 0.6331009864807129, "learning_rate": 0.0002, "epoch": 0.5936675461741425, "step": 450}, {"loss": 1.1692, "grad_norm": 0.41469088196754456, "learning_rate": 0.0002, "epoch": 0.6068601583113457, "step": 460}, {"loss": 1.2372, "grad_norm": 0.4388185143470764, "learning_rate": 0.0002, "epoch": 0.6200527704485488, "step": 470}, {"loss": 1.2541, "grad_norm": 0.3738141655921936, "learning_rate": 0.0002, "epoch": 0.633245382585752, "step": 480}, {"loss": 1.1265, "grad_norm": 0.7212023138999939, "learning_rate": 0.0002, "epoch": 0.6464379947229552, "step": 490}, {"loss": 1.2786, "grad_norm": 0.2972351014614105, "learning_rate": 0.0002, "epoch": 0.6596306068601583, "step": 500}, {"loss": 1.1739, "grad_norm": 0.45293179154396057, "learning_rate": 0.0002, "epoch": 0.6728232189973615, "step": 510}, {"loss": 1.255, "grad_norm": 0.4319860637187958, "learning_rate": 0.0002, "epoch": 0.6860158311345647, "step": 520}, {"loss": 1.0959, "grad_norm": 0.3050215542316437, "learning_rate": 0.0002, "epoch": 0.6992084432717678, "step": 530}, {"loss": 1.1608, "grad_norm": 0.3552611172199249, "learning_rate": 0.0002, "epoch": 0.712401055408971, "step": 540}, {"loss": 1.2546, "grad_norm": 0.3631151020526886, "learning_rate": 0.0002, "epoch": 0.7255936675461742, "step": 550}, {"loss": 1.1371, "grad_norm": 0.28177931904792786, "learning_rate": 0.0002, "epoch": 0.7387862796833773, "step": 560}, {"loss": 1.2081, "grad_norm": 0.359764039516449, "learning_rate": 0.0002, "epoch": 0.7519788918205804, "step": 570}, {"loss": 1.1356, "grad_norm": 0.3970327377319336, "learning_rate": 0.0002, "epoch": 0.7651715039577837, "step": 580}, {"loss": 1.309, "grad_norm": 0.3541001081466675, "learning_rate": 0.0002, "epoch": 0.7783641160949868, "step": 590}, {"loss": 1.2061, "grad_norm": 0.3478573262691498, "learning_rate": 0.0002, "epoch": 0.7915567282321899, "step": 600}, {"loss": 1.1864, "grad_norm": 0.3900321424007416, "learning_rate": 0.0002, "epoch": 0.8047493403693932, "step": 610}, {"loss": 1.1358, "grad_norm": 0.3443238437175751, "learning_rate": 0.0002, "epoch": 0.8179419525065963, "step": 620}, {"loss": 1.3232, "grad_norm": 0.44238781929016113, "learning_rate": 0.0002, "epoch": 0.8311345646437994, "step": 630}, {"loss": 1.1247, "grad_norm": 0.36339467763900757, "learning_rate": 0.0002, "epoch": 0.8443271767810027, "step": 640}, {"loss": 1.1049, "grad_norm": 0.6243070363998413, "learning_rate": 0.0002, "epoch": 0.8575197889182058, "step": 650}, {"loss": 1.1943, "grad_norm": 0.3209173381328583, "learning_rate": 0.0002, "epoch": 0.8707124010554089, "step": 660}, {"loss": 1.1927, "grad_norm": 0.35017991065979004, "learning_rate": 0.0002, "epoch": 0.8839050131926122, "step": 670}, {"loss": 1.1603, "grad_norm": 0.3247159421443939, "learning_rate": 0.0002, "epoch": 0.8970976253298153, "step": 680}, {"loss": 1.2298, "grad_norm": 0.4091894030570984, "learning_rate": 0.0002, "epoch": 0.9102902374670184, "step": 690}, {"loss": 1.2756, "grad_norm": 0.3975585997104645, "learning_rate": 0.0002, "epoch": 0.9234828496042217, "step": 700}, {"loss": 1.281, "grad_norm": 0.3666245937347412, "learning_rate": 0.0002, "epoch": 0.9366754617414248, "step": 710}, {"loss": 1.2855, "grad_norm": 0.45216917991638184, "learning_rate": 0.0002, "epoch": 0.9498680738786279, "step": 720}, {"loss": 1.2217, "grad_norm": 0.36108118295669556, "learning_rate": 0.0002, "epoch": 0.9630606860158312, "step": 730}, {"loss": 1.17, "grad_norm": 0.44550251960754395, "learning_rate": 0.0002, "epoch": 0.9762532981530343, "step": 740}, {"loss": 1.0672, "grad_norm": 0.29801255464553833, "learning_rate": 0.0002, "epoch": 0.9894459102902374, "step": 750}]} +{"epoch": 2.0, "step": 1516, "epoch_duration": 2075.237948179245, "total_accumulated_duration": 4865.910799264908, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-758", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.9466, "grad_norm": 0.7545632123947144, "learning_rate": 0.0002, "epoch": 0.013192612137203167, "step": 10}, {"loss": 1.4909, "grad_norm": 0.5787661075592041, "learning_rate": 0.0002, "epoch": 0.026385224274406333, "step": 20}, {"loss": 1.3906, "grad_norm": 0.8616093993186951, "learning_rate": 0.0002, "epoch": 0.0395778364116095, "step": 30}, {"loss": 1.4116, "grad_norm": 0.42088547348976135, "learning_rate": 0.0002, "epoch": 0.052770448548812667, "step": 40}, {"loss": 1.3446, "grad_norm": 0.47704678773880005, "learning_rate": 0.0002, "epoch": 0.06596306068601583, "step": 50}, {"loss": 1.2476, "grad_norm": 0.5763994455337524, "learning_rate": 0.0002, "epoch": 0.079155672823219, "step": 60}, {"loss": 1.2268, "grad_norm": 0.4579846262931824, "learning_rate": 0.0002, "epoch": 0.09234828496042216, "step": 70}, {"loss": 1.415, "grad_norm": 0.46623846888542175, "learning_rate": 0.0002, "epoch": 0.10554089709762533, "step": 80}, {"loss": 1.2849, "grad_norm": 0.4206956624984741, "learning_rate": 0.0002, "epoch": 0.11873350923482849, "step": 90}, {"loss": 1.1608, "grad_norm": 0.41896629333496094, "learning_rate": 0.0002, "epoch": 0.13192612137203166, "step": 100}, {"loss": 1.2617, "grad_norm": 0.3459089398384094, "learning_rate": 0.0002, "epoch": 0.14511873350923482, "step": 110}, {"loss": 1.2858, "grad_norm": 0.4587327837944031, "learning_rate": 0.0002, "epoch": 0.158311345646438, "step": 120}, {"loss": 1.2241, "grad_norm": 0.433525413274765, "learning_rate": 0.0002, "epoch": 0.17150395778364116, "step": 130}, {"loss": 1.2269, "grad_norm": 0.39253175258636475, "learning_rate": 0.0002, "epoch": 0.18469656992084432, "step": 140}, {"loss": 1.2224, "grad_norm": 0.3602290749549866, "learning_rate": 0.0002, "epoch": 0.19788918205804748, "step": 150}, {"loss": 1.2834, "grad_norm": 0.41160839796066284, "learning_rate": 0.0002, "epoch": 0.21108179419525067, "step": 160}, {"loss": 1.1986, "grad_norm": 0.7213630080223083, "learning_rate": 0.0002, "epoch": 0.22427440633245382, "step": 170}, {"loss": 1.2215, "grad_norm": 0.39086055755615234, "learning_rate": 0.0002, "epoch": 0.23746701846965698, "step": 180}, {"loss": 1.3315, "grad_norm": 0.4465520977973938, "learning_rate": 0.0002, "epoch": 0.25065963060686014, "step": 190}, {"loss": 1.3798, "grad_norm": 1.814679741859436, "learning_rate": 0.0002, "epoch": 0.2638522427440633, "step": 200}, {"loss": 1.2259, "grad_norm": 0.5026423931121826, "learning_rate": 0.0002, "epoch": 0.2770448548812665, "step": 210}, {"loss": 1.3306, "grad_norm": 0.4156292974948883, "learning_rate": 0.0002, "epoch": 0.29023746701846964, "step": 220}, {"loss": 1.266, "grad_norm": 0.40813493728637695, "learning_rate": 0.0002, "epoch": 0.3034300791556728, "step": 230}, {"loss": 1.1533, "grad_norm": 0.3304787874221802, "learning_rate": 0.0002, "epoch": 0.316622691292876, "step": 240}, {"loss": 1.3154, "grad_norm": 0.46139976382255554, "learning_rate": 0.0002, "epoch": 0.32981530343007914, "step": 250}, {"loss": 1.1365, "grad_norm": 0.37518271803855896, "learning_rate": 0.0002, "epoch": 0.34300791556728233, "step": 260}, {"loss": 1.2004, "grad_norm": 0.35586467385292053, "learning_rate": 0.0002, "epoch": 0.3562005277044855, "step": 270}, {"loss": 1.2609, "grad_norm": 0.32441186904907227, "learning_rate": 0.0002, "epoch": 0.36939313984168864, "step": 280}, {"loss": 1.212, "grad_norm": 0.3198683261871338, "learning_rate": 0.0002, "epoch": 0.38258575197889183, "step": 290}, {"loss": 1.2465, "grad_norm": 0.33663108944892883, "learning_rate": 0.0002, "epoch": 0.39577836411609496, "step": 300}, {"loss": 1.1702, "grad_norm": 0.3711244761943817, "learning_rate": 0.0002, "epoch": 0.40897097625329815, "step": 310}, {"loss": 1.0871, "grad_norm": 0.3209651708602905, "learning_rate": 0.0002, "epoch": 0.42216358839050133, "step": 320}, {"loss": 1.2728, "grad_norm": 0.5152716040611267, "learning_rate": 0.0002, "epoch": 0.43535620052770446, "step": 330}, {"loss": 1.1833, "grad_norm": 0.5431376695632935, "learning_rate": 0.0002, "epoch": 0.44854881266490765, "step": 340}, {"loss": 1.1344, "grad_norm": 0.3069997727870941, "learning_rate": 0.0002, "epoch": 0.46174142480211083, "step": 350}, {"loss": 1.2742, "grad_norm": 0.34260064363479614, "learning_rate": 0.0002, "epoch": 0.47493403693931396, "step": 360}, {"loss": 1.3161, "grad_norm": 0.345653235912323, "learning_rate": 0.0002, "epoch": 0.48812664907651715, "step": 370}, {"loss": 1.1552, "grad_norm": 0.46222734451293945, "learning_rate": 0.0002, "epoch": 0.5013192612137203, "step": 380}, {"loss": 1.0616, "grad_norm": 0.27301734685897827, "learning_rate": 0.0002, "epoch": 0.5145118733509235, "step": 390}, {"loss": 1.1944, "grad_norm": 0.29048439860343933, "learning_rate": 0.0002, "epoch": 0.5277044854881267, "step": 400}, {"loss": 1.0956, "grad_norm": 0.32927802205085754, "learning_rate": 0.0002, "epoch": 0.5408970976253298, "step": 410}, {"loss": 1.2362, "grad_norm": 0.3336397409439087, "learning_rate": 0.0002, "epoch": 0.554089709762533, "step": 420}, {"loss": 1.1445, "grad_norm": 0.4007597267627716, "learning_rate": 0.0002, "epoch": 0.5672823218997362, "step": 430}, {"loss": 1.2731, "grad_norm": 0.36144956946372986, "learning_rate": 0.0002, "epoch": 0.5804749340369393, "step": 440}, {"loss": 1.1604, "grad_norm": 0.6331009864807129, "learning_rate": 0.0002, "epoch": 0.5936675461741425, "step": 450}, {"loss": 1.1692, "grad_norm": 0.41469088196754456, "learning_rate": 0.0002, "epoch": 0.6068601583113457, "step": 460}, {"loss": 1.2372, "grad_norm": 0.4388185143470764, "learning_rate": 0.0002, "epoch": 0.6200527704485488, "step": 470}, {"loss": 1.2541, "grad_norm": 0.3738141655921936, "learning_rate": 0.0002, "epoch": 0.633245382585752, "step": 480}, {"loss": 1.1265, "grad_norm": 0.7212023138999939, "learning_rate": 0.0002, "epoch": 0.6464379947229552, "step": 490}, {"loss": 1.2786, "grad_norm": 0.2972351014614105, "learning_rate": 0.0002, "epoch": 0.6596306068601583, "step": 500}, {"loss": 1.1739, "grad_norm": 0.45293179154396057, "learning_rate": 0.0002, "epoch": 0.6728232189973615, "step": 510}, {"loss": 1.255, "grad_norm": 0.4319860637187958, "learning_rate": 0.0002, "epoch": 0.6860158311345647, "step": 520}, {"loss": 1.0959, "grad_norm": 0.3050215542316437, "learning_rate": 0.0002, "epoch": 0.6992084432717678, "step": 530}, {"loss": 1.1608, "grad_norm": 0.3552611172199249, "learning_rate": 0.0002, "epoch": 0.712401055408971, "step": 540}, {"loss": 1.2546, "grad_norm": 0.3631151020526886, "learning_rate": 0.0002, "epoch": 0.7255936675461742, "step": 550}, {"loss": 1.1371, "grad_norm": 0.28177931904792786, "learning_rate": 0.0002, "epoch": 0.7387862796833773, "step": 560}, {"loss": 1.2081, "grad_norm": 0.359764039516449, "learning_rate": 0.0002, "epoch": 0.7519788918205804, "step": 570}, {"loss": 1.1356, "grad_norm": 0.3970327377319336, "learning_rate": 0.0002, "epoch": 0.7651715039577837, "step": 580}, {"loss": 1.309, "grad_norm": 0.3541001081466675, "learning_rate": 0.0002, "epoch": 0.7783641160949868, "step": 590}, {"loss": 1.2061, "grad_norm": 0.3478573262691498, "learning_rate": 0.0002, "epoch": 0.7915567282321899, "step": 600}, {"loss": 1.1864, "grad_norm": 0.3900321424007416, "learning_rate": 0.0002, "epoch": 0.8047493403693932, "step": 610}, {"loss": 1.1358, "grad_norm": 0.3443238437175751, "learning_rate": 0.0002, "epoch": 0.8179419525065963, "step": 620}, {"loss": 1.3232, "grad_norm": 0.44238781929016113, "learning_rate": 0.0002, "epoch": 0.8311345646437994, "step": 630}, {"loss": 1.1247, "grad_norm": 0.36339467763900757, "learning_rate": 0.0002, "epoch": 0.8443271767810027, "step": 640}, {"loss": 1.1049, "grad_norm": 0.6243070363998413, "learning_rate": 0.0002, "epoch": 0.8575197889182058, "step": 650}, {"loss": 1.1943, "grad_norm": 0.3209173381328583, "learning_rate": 0.0002, "epoch": 0.8707124010554089, "step": 660}, {"loss": 1.1927, "grad_norm": 0.35017991065979004, "learning_rate": 0.0002, "epoch": 0.8839050131926122, "step": 670}, {"loss": 1.1603, "grad_norm": 0.3247159421443939, "learning_rate": 0.0002, "epoch": 0.8970976253298153, "step": 680}, {"loss": 1.2298, "grad_norm": 0.4091894030570984, "learning_rate": 0.0002, "epoch": 0.9102902374670184, "step": 690}, {"loss": 1.2756, "grad_norm": 0.3975585997104645, "learning_rate": 0.0002, "epoch": 0.9234828496042217, "step": 700}, {"loss": 1.281, "grad_norm": 0.3666245937347412, "learning_rate": 0.0002, "epoch": 0.9366754617414248, "step": 710}, {"loss": 1.2855, "grad_norm": 0.45216917991638184, "learning_rate": 0.0002, "epoch": 0.9498680738786279, "step": 720}, {"loss": 1.2217, "grad_norm": 0.36108118295669556, "learning_rate": 0.0002, "epoch": 0.9630606860158312, "step": 730}, {"loss": 1.17, "grad_norm": 0.44550251960754395, "learning_rate": 0.0002, "epoch": 0.9762532981530343, "step": 740}, {"loss": 1.0672, "grad_norm": 0.29801255464553833, "learning_rate": 0.0002, "epoch": 0.9894459102902374, "step": 750}, {"eval_loss": 1.203244686126709, "eval_runtime": 76.0457, "eval_samples_per_second": 5.668, "eval_steps_per_second": 0.71, "epoch": 1.0, "step": 758}, {"loss": 1.0748, "grad_norm": 0.4096551239490509, "learning_rate": 0.0002, "epoch": 1.0026385224274406, "step": 760}, {"loss": 1.1537, "grad_norm": 0.2649582326412201, "learning_rate": 0.0002, "epoch": 1.0158311345646438, "step": 770}, {"loss": 1.1676, "grad_norm": 0.3100722134113312, "learning_rate": 0.0002, "epoch": 1.029023746701847, "step": 780}, {"loss": 1.1736, "grad_norm": 0.3911755383014679, "learning_rate": 0.0002, "epoch": 1.04221635883905, "step": 790}, {"loss": 1.0354, "grad_norm": 0.4600953757762909, "learning_rate": 0.0002, "epoch": 1.0554089709762533, "step": 800}, {"loss": 1.2869, "grad_norm": 0.28671619296073914, "learning_rate": 0.0002, "epoch": 1.0686015831134565, "step": 810}, {"loss": 1.1441, "grad_norm": 0.47282642126083374, "learning_rate": 0.0002, "epoch": 1.0817941952506596, "step": 820}, {"loss": 1.1368, "grad_norm": 0.690073549747467, "learning_rate": 0.0002, "epoch": 1.0949868073878628, "step": 830}, {"loss": 0.9944, "grad_norm": 0.7317902445793152, "learning_rate": 0.0002, "epoch": 1.108179419525066, "step": 840}, {"loss": 1.108, "grad_norm": 0.44215938448905945, "learning_rate": 0.0002, "epoch": 1.121372031662269, "step": 850}, {"loss": 1.0558, "grad_norm": 0.33875149488449097, "learning_rate": 0.0002, "epoch": 1.1345646437994723, "step": 860}, {"loss": 1.0471, "grad_norm": 0.3700002431869507, "learning_rate": 0.0002, "epoch": 1.1477572559366755, "step": 870}, {"loss": 1.0121, "grad_norm": 0.41173291206359863, "learning_rate": 0.0002, "epoch": 1.1609498680738786, "step": 880}, {"loss": 1.0312, "grad_norm": 0.5253589749336243, "learning_rate": 0.0002, "epoch": 1.1741424802110818, "step": 890}, {"loss": 1.2238, "grad_norm": 0.3912237286567688, "learning_rate": 0.0002, "epoch": 1.187335092348285, "step": 900}, {"loss": 0.9434, "grad_norm": 0.40990331768989563, "learning_rate": 0.0002, "epoch": 1.200527704485488, "step": 910}, {"loss": 1.0493, "grad_norm": 0.40377968549728394, "learning_rate": 0.0002, "epoch": 1.2137203166226913, "step": 920}, {"loss": 1.1496, "grad_norm": 0.4605846405029297, "learning_rate": 0.0002, "epoch": 1.2269129287598945, "step": 930}, {"loss": 1.1173, "grad_norm": 0.31564897298812866, "learning_rate": 0.0002, "epoch": 1.2401055408970976, "step": 940}, {"loss": 1.0547, "grad_norm": 0.39808550477027893, "learning_rate": 0.0002, "epoch": 1.2532981530343008, "step": 950}, {"loss": 1.1367, "grad_norm": 0.3762115240097046, "learning_rate": 0.0002, "epoch": 1.266490765171504, "step": 960}, {"loss": 1.1596, "grad_norm": 0.4174984097480774, "learning_rate": 0.0002, "epoch": 1.279683377308707, "step": 970}, {"loss": 1.1327, "grad_norm": 0.5263054966926575, "learning_rate": 0.0002, "epoch": 1.2928759894459103, "step": 980}, {"loss": 1.0339, "grad_norm": 0.41673699021339417, "learning_rate": 0.0002, "epoch": 1.3060686015831133, "step": 990}, {"loss": 1.1198, "grad_norm": 0.9613684415817261, "learning_rate": 0.0002, "epoch": 1.3192612137203166, "step": 1000}, {"loss": 1.0444, "grad_norm": 0.3690216839313507, "learning_rate": 0.0002, "epoch": 1.3324538258575198, "step": 1010}, {"loss": 1.0473, "grad_norm": 0.521821141242981, "learning_rate": 0.0002, "epoch": 1.345646437994723, "step": 1020}, {"loss": 1.1065, "grad_norm": 0.3353094160556793, "learning_rate": 0.0002, "epoch": 1.358839050131926, "step": 1030}, {"loss": 1.1286, "grad_norm": 0.3843843936920166, "learning_rate": 0.0002, "epoch": 1.3720316622691293, "step": 1040}, {"loss": 1.1369, "grad_norm": 0.372514545917511, "learning_rate": 0.0002, "epoch": 1.3852242744063323, "step": 1050}, {"loss": 1.0041, "grad_norm": 0.34537771344184875, "learning_rate": 0.0002, "epoch": 1.3984168865435356, "step": 1060}, {"loss": 1.0251, "grad_norm": 0.45349085330963135, "learning_rate": 0.0002, "epoch": 1.4116094986807388, "step": 1070}, {"loss": 1.1873, "grad_norm": 0.5120177268981934, "learning_rate": 0.0002, "epoch": 1.424802110817942, "step": 1080}, {"loss": 1.0872, "grad_norm": 0.42800238728523254, "learning_rate": 0.0002, "epoch": 1.437994722955145, "step": 1090}, {"loss": 1.0734, "grad_norm": 0.343832790851593, "learning_rate": 0.0002, "epoch": 1.4511873350923483, "step": 1100}, {"loss": 1.1286, "grad_norm": 0.3829841911792755, "learning_rate": 0.0002, "epoch": 1.4643799472295513, "step": 1110}, {"loss": 1.1268, "grad_norm": 0.4289931058883667, "learning_rate": 0.0002, "epoch": 1.4775725593667546, "step": 1120}, {"loss": 1.0676, "grad_norm": 0.42750850319862366, "learning_rate": 0.0002, "epoch": 1.4907651715039578, "step": 1130}, {"loss": 1.072, "grad_norm": 0.34328413009643555, "learning_rate": 0.0002, "epoch": 1.503957783641161, "step": 1140}, {"loss": 1.0863, "grad_norm": 0.349096417427063, "learning_rate": 0.0002, "epoch": 1.517150395778364, "step": 1150}, {"loss": 1.1462, "grad_norm": 0.7700717449188232, "learning_rate": 0.0002, "epoch": 1.5303430079155673, "step": 1160}, {"loss": 0.9528, "grad_norm": 0.39294949173927307, "learning_rate": 0.0002, "epoch": 1.5435356200527703, "step": 1170}, {"loss": 1.1603, "grad_norm": 0.36173608899116516, "learning_rate": 0.0002, "epoch": 1.5567282321899736, "step": 1180}, {"loss": 1.1508, "grad_norm": 0.6034277677536011, "learning_rate": 0.0002, "epoch": 1.5699208443271768, "step": 1190}, {"loss": 1.0105, "grad_norm": 0.36694103479385376, "learning_rate": 0.0002, "epoch": 1.58311345646438, "step": 1200}, {"loss": 1.1479, "grad_norm": 0.4727209508419037, "learning_rate": 0.0002, "epoch": 1.596306068601583, "step": 1210}, {"loss": 1.0689, "grad_norm": 0.6482883095741272, "learning_rate": 0.0002, "epoch": 1.6094986807387863, "step": 1220}, {"loss": 1.1405, "grad_norm": 0.5238035917282104, "learning_rate": 0.0002, "epoch": 1.6226912928759893, "step": 1230}, {"loss": 1.0596, "grad_norm": 0.4812222421169281, "learning_rate": 0.0002, "epoch": 1.6358839050131926, "step": 1240}, {"loss": 1.2729, "grad_norm": 0.7131702303886414, "learning_rate": 0.0002, "epoch": 1.6490765171503958, "step": 1250}, {"loss": 0.9832, "grad_norm": 0.3803327977657318, "learning_rate": 0.0002, "epoch": 1.662269129287599, "step": 1260}, {"loss": 1.1433, "grad_norm": 0.3745088577270508, "learning_rate": 0.0002, "epoch": 1.675461741424802, "step": 1270}, {"loss": 1.1018, "grad_norm": 0.4427378475666046, "learning_rate": 0.0002, "epoch": 1.6886543535620053, "step": 1280}, {"loss": 1.0619, "grad_norm": 0.797478973865509, "learning_rate": 0.0002, "epoch": 1.7018469656992083, "step": 1290}, {"loss": 1.0853, "grad_norm": 0.503620982170105, "learning_rate": 0.0002, "epoch": 1.7150395778364116, "step": 1300}, {"loss": 1.1324, "grad_norm": 0.4132426381111145, "learning_rate": 0.0002, "epoch": 1.7282321899736148, "step": 1310}, {"loss": 1.172, "grad_norm": 0.41811656951904297, "learning_rate": 0.0002, "epoch": 1.741424802110818, "step": 1320}, {"loss": 1.0903, "grad_norm": 0.40647849440574646, "learning_rate": 0.0002, "epoch": 1.754617414248021, "step": 1330}, {"loss": 1.1316, "grad_norm": 0.42138347029685974, "learning_rate": 0.0002, "epoch": 1.767810026385224, "step": 1340}, {"loss": 0.9955, "grad_norm": 0.46523579955101013, "learning_rate": 0.0002, "epoch": 1.7810026385224274, "step": 1350}, {"loss": 1.0307, "grad_norm": 0.39760419726371765, "learning_rate": 0.0002, "epoch": 1.7941952506596306, "step": 1360}, {"loss": 1.1218, "grad_norm": 0.37993717193603516, "learning_rate": 0.0002, "epoch": 1.8073878627968338, "step": 1370}, {"loss": 1.0921, "grad_norm": 0.5404181480407715, "learning_rate": 0.0002, "epoch": 1.820580474934037, "step": 1380}, {"loss": 1.0903, "grad_norm": 0.4385245740413666, "learning_rate": 0.0002, "epoch": 1.83377308707124, "step": 1390}, {"loss": 1.1175, "grad_norm": 0.529797375202179, "learning_rate": 0.0002, "epoch": 1.8469656992084431, "step": 1400}, {"loss": 1.0577, "grad_norm": 0.481567919254303, "learning_rate": 0.0002, "epoch": 1.8601583113456464, "step": 1410}, {"loss": 1.0581, "grad_norm": 0.34787362813949585, "learning_rate": 0.0002, "epoch": 1.8733509234828496, "step": 1420}, {"loss": 1.0536, "grad_norm": 0.6402362585067749, "learning_rate": 0.0002, "epoch": 1.8865435356200528, "step": 1430}, {"loss": 1.0787, "grad_norm": 0.3461322784423828, "learning_rate": 0.0002, "epoch": 1.899736147757256, "step": 1440}, {"loss": 1.0925, "grad_norm": 0.44005653262138367, "learning_rate": 0.0002, "epoch": 1.912928759894459, "step": 1450}, {"loss": 1.0414, "grad_norm": 0.4064280688762665, "learning_rate": 0.0002, "epoch": 1.9261213720316621, "step": 1460}, {"loss": 1.0608, "grad_norm": 0.5236523151397705, "learning_rate": 0.0002, "epoch": 1.9393139841688654, "step": 1470}, {"loss": 1.0572, "grad_norm": 0.41030219197273254, "learning_rate": 0.0002, "epoch": 1.9525065963060686, "step": 1480}, {"loss": 1.1204, "grad_norm": 0.39805835485458374, "learning_rate": 0.0002, "epoch": 1.9656992084432718, "step": 1490}, {"loss": 1.0364, "grad_norm": 0.42974501848220825, "learning_rate": 0.0002, "epoch": 1.978891820580475, "step": 1500}, {"loss": 1.1169, "grad_norm": 0.4688243865966797, "learning_rate": 0.0002, "epoch": 1.992084432717678, "step": 1510}]} +{"epoch": 3.0, "step": 2274, "epoch_duration": 2076.580257177353, "total_accumulated_duration": 6942.491056442261, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.9466, "grad_norm": 0.7545632123947144, "learning_rate": 0.0002, "epoch": 0.013192612137203167, "step": 10}, {"loss": 1.4909, "grad_norm": 0.5787661075592041, "learning_rate": 0.0002, "epoch": 0.026385224274406333, "step": 20}, {"loss": 1.3906, "grad_norm": 0.8616093993186951, "learning_rate": 0.0002, "epoch": 0.0395778364116095, "step": 30}, {"loss": 1.4116, "grad_norm": 0.42088547348976135, "learning_rate": 0.0002, "epoch": 0.052770448548812667, "step": 40}, {"loss": 1.3446, "grad_norm": 0.47704678773880005, "learning_rate": 0.0002, "epoch": 0.06596306068601583, "step": 50}, {"loss": 1.2476, "grad_norm": 0.5763994455337524, "learning_rate": 0.0002, "epoch": 0.079155672823219, "step": 60}, {"loss": 1.2268, "grad_norm": 0.4579846262931824, "learning_rate": 0.0002, "epoch": 0.09234828496042216, "step": 70}, {"loss": 1.415, "grad_norm": 0.46623846888542175, "learning_rate": 0.0002, "epoch": 0.10554089709762533, "step": 80}, {"loss": 1.2849, "grad_norm": 0.4206956624984741, "learning_rate": 0.0002, "epoch": 0.11873350923482849, "step": 90}, {"loss": 1.1608, "grad_norm": 0.41896629333496094, "learning_rate": 0.0002, "epoch": 0.13192612137203166, "step": 100}, {"loss": 1.2617, "grad_norm": 0.3459089398384094, "learning_rate": 0.0002, "epoch": 0.14511873350923482, "step": 110}, {"loss": 1.2858, "grad_norm": 0.4587327837944031, "learning_rate": 0.0002, "epoch": 0.158311345646438, "step": 120}, {"loss": 1.2241, "grad_norm": 0.433525413274765, "learning_rate": 0.0002, "epoch": 0.17150395778364116, "step": 130}, {"loss": 1.2269, "grad_norm": 0.39253175258636475, "learning_rate": 0.0002, "epoch": 0.18469656992084432, "step": 140}, {"loss": 1.2224, "grad_norm": 0.3602290749549866, "learning_rate": 0.0002, "epoch": 0.19788918205804748, "step": 150}, {"loss": 1.2834, "grad_norm": 0.41160839796066284, "learning_rate": 0.0002, "epoch": 0.21108179419525067, "step": 160}, {"loss": 1.1986, "grad_norm": 0.7213630080223083, "learning_rate": 0.0002, "epoch": 0.22427440633245382, "step": 170}, {"loss": 1.2215, "grad_norm": 0.39086055755615234, "learning_rate": 0.0002, "epoch": 0.23746701846965698, "step": 180}, {"loss": 1.3315, "grad_norm": 0.4465520977973938, "learning_rate": 0.0002, "epoch": 0.25065963060686014, "step": 190}, {"loss": 1.3798, "grad_norm": 1.814679741859436, "learning_rate": 0.0002, "epoch": 0.2638522427440633, "step": 200}, {"loss": 1.2259, "grad_norm": 0.5026423931121826, "learning_rate": 0.0002, "epoch": 0.2770448548812665, "step": 210}, {"loss": 1.3306, "grad_norm": 0.4156292974948883, "learning_rate": 0.0002, "epoch": 0.29023746701846964, "step": 220}, {"loss": 1.266, "grad_norm": 0.40813493728637695, "learning_rate": 0.0002, "epoch": 0.3034300791556728, "step": 230}, {"loss": 1.1533, "grad_norm": 0.3304787874221802, "learning_rate": 0.0002, "epoch": 0.316622691292876, "step": 240}, {"loss": 1.3154, "grad_norm": 0.46139976382255554, "learning_rate": 0.0002, "epoch": 0.32981530343007914, "step": 250}, {"loss": 1.1365, "grad_norm": 0.37518271803855896, "learning_rate": 0.0002, "epoch": 0.34300791556728233, "step": 260}, {"loss": 1.2004, "grad_norm": 0.35586467385292053, "learning_rate": 0.0002, "epoch": 0.3562005277044855, "step": 270}, {"loss": 1.2609, "grad_norm": 0.32441186904907227, "learning_rate": 0.0002, "epoch": 0.36939313984168864, "step": 280}, {"loss": 1.212, "grad_norm": 0.3198683261871338, "learning_rate": 0.0002, "epoch": 0.38258575197889183, "step": 290}, {"loss": 1.2465, "grad_norm": 0.33663108944892883, "learning_rate": 0.0002, "epoch": 0.39577836411609496, "step": 300}, {"loss": 1.1702, "grad_norm": 0.3711244761943817, "learning_rate": 0.0002, "epoch": 0.40897097625329815, "step": 310}, {"loss": 1.0871, "grad_norm": 0.3209651708602905, "learning_rate": 0.0002, "epoch": 0.42216358839050133, "step": 320}, {"loss": 1.2728, "grad_norm": 0.5152716040611267, "learning_rate": 0.0002, "epoch": 0.43535620052770446, "step": 330}, {"loss": 1.1833, "grad_norm": 0.5431376695632935, "learning_rate": 0.0002, "epoch": 0.44854881266490765, "step": 340}, {"loss": 1.1344, "grad_norm": 0.3069997727870941, "learning_rate": 0.0002, "epoch": 0.46174142480211083, "step": 350}, {"loss": 1.2742, "grad_norm": 0.34260064363479614, "learning_rate": 0.0002, "epoch": 0.47493403693931396, "step": 360}, {"loss": 1.3161, "grad_norm": 0.345653235912323, "learning_rate": 0.0002, "epoch": 0.48812664907651715, "step": 370}, {"loss": 1.1552, "grad_norm": 0.46222734451293945, "learning_rate": 0.0002, "epoch": 0.5013192612137203, "step": 380}, {"loss": 1.0616, "grad_norm": 0.27301734685897827, "learning_rate": 0.0002, "epoch": 0.5145118733509235, "step": 390}, {"loss": 1.1944, "grad_norm": 0.29048439860343933, "learning_rate": 0.0002, "epoch": 0.5277044854881267, "step": 400}, {"loss": 1.0956, "grad_norm": 0.32927802205085754, "learning_rate": 0.0002, "epoch": 0.5408970976253298, "step": 410}, {"loss": 1.2362, "grad_norm": 0.3336397409439087, "learning_rate": 0.0002, "epoch": 0.554089709762533, "step": 420}, {"loss": 1.1445, "grad_norm": 0.4007597267627716, "learning_rate": 0.0002, "epoch": 0.5672823218997362, "step": 430}, {"loss": 1.2731, "grad_norm": 0.36144956946372986, "learning_rate": 0.0002, "epoch": 0.5804749340369393, "step": 440}, {"loss": 1.1604, "grad_norm": 0.6331009864807129, "learning_rate": 0.0002, "epoch": 0.5936675461741425, "step": 450}, {"loss": 1.1692, "grad_norm": 0.41469088196754456, "learning_rate": 0.0002, "epoch": 0.6068601583113457, "step": 460}, {"loss": 1.2372, "grad_norm": 0.4388185143470764, "learning_rate": 0.0002, "epoch": 0.6200527704485488, "step": 470}, {"loss": 1.2541, "grad_norm": 0.3738141655921936, "learning_rate": 0.0002, "epoch": 0.633245382585752, "step": 480}, {"loss": 1.1265, "grad_norm": 0.7212023138999939, "learning_rate": 0.0002, "epoch": 0.6464379947229552, "step": 490}, {"loss": 1.2786, "grad_norm": 0.2972351014614105, "learning_rate": 0.0002, "epoch": 0.6596306068601583, "step": 500}, {"loss": 1.1739, "grad_norm": 0.45293179154396057, "learning_rate": 0.0002, "epoch": 0.6728232189973615, "step": 510}, {"loss": 1.255, "grad_norm": 0.4319860637187958, "learning_rate": 0.0002, "epoch": 0.6860158311345647, "step": 520}, {"loss": 1.0959, "grad_norm": 0.3050215542316437, "learning_rate": 0.0002, "epoch": 0.6992084432717678, "step": 530}, {"loss": 1.1608, "grad_norm": 0.3552611172199249, "learning_rate": 0.0002, "epoch": 0.712401055408971, "step": 540}, {"loss": 1.2546, "grad_norm": 0.3631151020526886, "learning_rate": 0.0002, "epoch": 0.7255936675461742, "step": 550}, {"loss": 1.1371, "grad_norm": 0.28177931904792786, "learning_rate": 0.0002, "epoch": 0.7387862796833773, "step": 560}, {"loss": 1.2081, "grad_norm": 0.359764039516449, "learning_rate": 0.0002, "epoch": 0.7519788918205804, "step": 570}, {"loss": 1.1356, "grad_norm": 0.3970327377319336, "learning_rate": 0.0002, "epoch": 0.7651715039577837, "step": 580}, {"loss": 1.309, "grad_norm": 0.3541001081466675, "learning_rate": 0.0002, "epoch": 0.7783641160949868, "step": 590}, {"loss": 1.2061, "grad_norm": 0.3478573262691498, "learning_rate": 0.0002, "epoch": 0.7915567282321899, "step": 600}, {"loss": 1.1864, "grad_norm": 0.3900321424007416, "learning_rate": 0.0002, "epoch": 0.8047493403693932, "step": 610}, {"loss": 1.1358, "grad_norm": 0.3443238437175751, "learning_rate": 0.0002, "epoch": 0.8179419525065963, "step": 620}, {"loss": 1.3232, "grad_norm": 0.44238781929016113, "learning_rate": 0.0002, "epoch": 0.8311345646437994, "step": 630}, {"loss": 1.1247, "grad_norm": 0.36339467763900757, "learning_rate": 0.0002, "epoch": 0.8443271767810027, "step": 640}, {"loss": 1.1049, "grad_norm": 0.6243070363998413, "learning_rate": 0.0002, "epoch": 0.8575197889182058, "step": 650}, {"loss": 1.1943, "grad_norm": 0.3209173381328583, "learning_rate": 0.0002, "epoch": 0.8707124010554089, "step": 660}, {"loss": 1.1927, "grad_norm": 0.35017991065979004, "learning_rate": 0.0002, "epoch": 0.8839050131926122, "step": 670}, {"loss": 1.1603, "grad_norm": 0.3247159421443939, "learning_rate": 0.0002, "epoch": 0.8970976253298153, "step": 680}, {"loss": 1.2298, "grad_norm": 0.4091894030570984, "learning_rate": 0.0002, "epoch": 0.9102902374670184, "step": 690}, {"loss": 1.2756, "grad_norm": 0.3975585997104645, "learning_rate": 0.0002, "epoch": 0.9234828496042217, "step": 700}, {"loss": 1.281, "grad_norm": 0.3666245937347412, "learning_rate": 0.0002, "epoch": 0.9366754617414248, "step": 710}, {"loss": 1.2855, "grad_norm": 0.45216917991638184, "learning_rate": 0.0002, "epoch": 0.9498680738786279, "step": 720}, {"loss": 1.2217, "grad_norm": 0.36108118295669556, "learning_rate": 0.0002, "epoch": 0.9630606860158312, "step": 730}, {"loss": 1.17, "grad_norm": 0.44550251960754395, "learning_rate": 0.0002, "epoch": 0.9762532981530343, "step": 740}, {"loss": 1.0672, "grad_norm": 0.29801255464553833, "learning_rate": 0.0002, "epoch": 0.9894459102902374, "step": 750}, {"eval_loss": 1.203244686126709, "eval_runtime": 76.0457, "eval_samples_per_second": 5.668, "eval_steps_per_second": 0.71, "epoch": 1.0, "step": 758}, {"loss": 1.0748, "grad_norm": 0.4096551239490509, "learning_rate": 0.0002, "epoch": 1.0026385224274406, "step": 760}, {"loss": 1.1537, "grad_norm": 0.2649582326412201, "learning_rate": 0.0002, "epoch": 1.0158311345646438, "step": 770}, {"loss": 1.1676, "grad_norm": 0.3100722134113312, "learning_rate": 0.0002, "epoch": 1.029023746701847, "step": 780}, {"loss": 1.1736, "grad_norm": 0.3911755383014679, "learning_rate": 0.0002, "epoch": 1.04221635883905, "step": 790}, {"loss": 1.0354, "grad_norm": 0.4600953757762909, "learning_rate": 0.0002, "epoch": 1.0554089709762533, "step": 800}, {"loss": 1.2869, "grad_norm": 0.28671619296073914, "learning_rate": 0.0002, "epoch": 1.0686015831134565, "step": 810}, {"loss": 1.1441, "grad_norm": 0.47282642126083374, "learning_rate": 0.0002, "epoch": 1.0817941952506596, "step": 820}, {"loss": 1.1368, "grad_norm": 0.690073549747467, "learning_rate": 0.0002, "epoch": 1.0949868073878628, "step": 830}, {"loss": 0.9944, "grad_norm": 0.7317902445793152, "learning_rate": 0.0002, "epoch": 1.108179419525066, "step": 840}, {"loss": 1.108, "grad_norm": 0.44215938448905945, "learning_rate": 0.0002, "epoch": 1.121372031662269, "step": 850}, {"loss": 1.0558, "grad_norm": 0.33875149488449097, "learning_rate": 0.0002, "epoch": 1.1345646437994723, "step": 860}, {"loss": 1.0471, "grad_norm": 0.3700002431869507, "learning_rate": 0.0002, "epoch": 1.1477572559366755, "step": 870}, {"loss": 1.0121, "grad_norm": 0.41173291206359863, "learning_rate": 0.0002, "epoch": 1.1609498680738786, "step": 880}, {"loss": 1.0312, "grad_norm": 0.5253589749336243, "learning_rate": 0.0002, "epoch": 1.1741424802110818, "step": 890}, {"loss": 1.2238, "grad_norm": 0.3912237286567688, "learning_rate": 0.0002, "epoch": 1.187335092348285, "step": 900}, {"loss": 0.9434, "grad_norm": 0.40990331768989563, "learning_rate": 0.0002, "epoch": 1.200527704485488, "step": 910}, {"loss": 1.0493, "grad_norm": 0.40377968549728394, "learning_rate": 0.0002, "epoch": 1.2137203166226913, "step": 920}, {"loss": 1.1496, "grad_norm": 0.4605846405029297, "learning_rate": 0.0002, "epoch": 1.2269129287598945, "step": 930}, {"loss": 1.1173, "grad_norm": 0.31564897298812866, "learning_rate": 0.0002, "epoch": 1.2401055408970976, "step": 940}, {"loss": 1.0547, "grad_norm": 0.39808550477027893, "learning_rate": 0.0002, "epoch": 1.2532981530343008, "step": 950}, {"loss": 1.1367, "grad_norm": 0.3762115240097046, "learning_rate": 0.0002, "epoch": 1.266490765171504, "step": 960}, {"loss": 1.1596, "grad_norm": 0.4174984097480774, "learning_rate": 0.0002, "epoch": 1.279683377308707, "step": 970}, {"loss": 1.1327, "grad_norm": 0.5263054966926575, "learning_rate": 0.0002, "epoch": 1.2928759894459103, "step": 980}, {"loss": 1.0339, "grad_norm": 0.41673699021339417, "learning_rate": 0.0002, "epoch": 1.3060686015831133, "step": 990}, {"loss": 1.1198, "grad_norm": 0.9613684415817261, "learning_rate": 0.0002, "epoch": 1.3192612137203166, "step": 1000}, {"loss": 1.0444, "grad_norm": 0.3690216839313507, "learning_rate": 0.0002, "epoch": 1.3324538258575198, "step": 1010}, {"loss": 1.0473, "grad_norm": 0.521821141242981, "learning_rate": 0.0002, "epoch": 1.345646437994723, "step": 1020}, {"loss": 1.1065, "grad_norm": 0.3353094160556793, "learning_rate": 0.0002, "epoch": 1.358839050131926, "step": 1030}, {"loss": 1.1286, "grad_norm": 0.3843843936920166, "learning_rate": 0.0002, "epoch": 1.3720316622691293, "step": 1040}, {"loss": 1.1369, "grad_norm": 0.372514545917511, "learning_rate": 0.0002, "epoch": 1.3852242744063323, "step": 1050}, {"loss": 1.0041, "grad_norm": 0.34537771344184875, "learning_rate": 0.0002, "epoch": 1.3984168865435356, "step": 1060}, {"loss": 1.0251, "grad_norm": 0.45349085330963135, "learning_rate": 0.0002, "epoch": 1.4116094986807388, "step": 1070}, {"loss": 1.1873, "grad_norm": 0.5120177268981934, "learning_rate": 0.0002, "epoch": 1.424802110817942, "step": 1080}, {"loss": 1.0872, "grad_norm": 0.42800238728523254, "learning_rate": 0.0002, "epoch": 1.437994722955145, "step": 1090}, {"loss": 1.0734, "grad_norm": 0.343832790851593, "learning_rate": 0.0002, "epoch": 1.4511873350923483, "step": 1100}, {"loss": 1.1286, "grad_norm": 0.3829841911792755, "learning_rate": 0.0002, "epoch": 1.4643799472295513, "step": 1110}, {"loss": 1.1268, "grad_norm": 0.4289931058883667, "learning_rate": 0.0002, "epoch": 1.4775725593667546, "step": 1120}, {"loss": 1.0676, "grad_norm": 0.42750850319862366, "learning_rate": 0.0002, "epoch": 1.4907651715039578, "step": 1130}, {"loss": 1.072, "grad_norm": 0.34328413009643555, "learning_rate": 0.0002, "epoch": 1.503957783641161, "step": 1140}, {"loss": 1.0863, "grad_norm": 0.349096417427063, "learning_rate": 0.0002, "epoch": 1.517150395778364, "step": 1150}, {"loss": 1.1462, "grad_norm": 0.7700717449188232, "learning_rate": 0.0002, "epoch": 1.5303430079155673, "step": 1160}, {"loss": 0.9528, "grad_norm": 0.39294949173927307, "learning_rate": 0.0002, "epoch": 1.5435356200527703, "step": 1170}, {"loss": 1.1603, "grad_norm": 0.36173608899116516, "learning_rate": 0.0002, "epoch": 1.5567282321899736, "step": 1180}, {"loss": 1.1508, "grad_norm": 0.6034277677536011, "learning_rate": 0.0002, "epoch": 1.5699208443271768, "step": 1190}, {"loss": 1.0105, "grad_norm": 0.36694103479385376, "learning_rate": 0.0002, "epoch": 1.58311345646438, "step": 1200}, {"loss": 1.1479, "grad_norm": 0.4727209508419037, "learning_rate": 0.0002, "epoch": 1.596306068601583, "step": 1210}, {"loss": 1.0689, "grad_norm": 0.6482883095741272, "learning_rate": 0.0002, "epoch": 1.6094986807387863, "step": 1220}, {"loss": 1.1405, "grad_norm": 0.5238035917282104, "learning_rate": 0.0002, "epoch": 1.6226912928759893, "step": 1230}, {"loss": 1.0596, "grad_norm": 0.4812222421169281, "learning_rate": 0.0002, "epoch": 1.6358839050131926, "step": 1240}, {"loss": 1.2729, "grad_norm": 0.7131702303886414, "learning_rate": 0.0002, "epoch": 1.6490765171503958, "step": 1250}, {"loss": 0.9832, "grad_norm": 0.3803327977657318, "learning_rate": 0.0002, "epoch": 1.662269129287599, "step": 1260}, {"loss": 1.1433, "grad_norm": 0.3745088577270508, "learning_rate": 0.0002, "epoch": 1.675461741424802, "step": 1270}, {"loss": 1.1018, "grad_norm": 0.4427378475666046, "learning_rate": 0.0002, "epoch": 1.6886543535620053, "step": 1280}, {"loss": 1.0619, "grad_norm": 0.797478973865509, "learning_rate": 0.0002, "epoch": 1.7018469656992083, "step": 1290}, {"loss": 1.0853, "grad_norm": 0.503620982170105, "learning_rate": 0.0002, "epoch": 1.7150395778364116, "step": 1300}, {"loss": 1.1324, "grad_norm": 0.4132426381111145, "learning_rate": 0.0002, "epoch": 1.7282321899736148, "step": 1310}, {"loss": 1.172, "grad_norm": 0.41811656951904297, "learning_rate": 0.0002, "epoch": 1.741424802110818, "step": 1320}, {"loss": 1.0903, "grad_norm": 0.40647849440574646, "learning_rate": 0.0002, "epoch": 1.754617414248021, "step": 1330}, {"loss": 1.1316, "grad_norm": 0.42138347029685974, "learning_rate": 0.0002, "epoch": 1.767810026385224, "step": 1340}, {"loss": 0.9955, "grad_norm": 0.46523579955101013, "learning_rate": 0.0002, "epoch": 1.7810026385224274, "step": 1350}, {"loss": 1.0307, "grad_norm": 0.39760419726371765, "learning_rate": 0.0002, "epoch": 1.7941952506596306, "step": 1360}, {"loss": 1.1218, "grad_norm": 0.37993717193603516, "learning_rate": 0.0002, "epoch": 1.8073878627968338, "step": 1370}, {"loss": 1.0921, "grad_norm": 0.5404181480407715, "learning_rate": 0.0002, "epoch": 1.820580474934037, "step": 1380}, {"loss": 1.0903, "grad_norm": 0.4385245740413666, "learning_rate": 0.0002, "epoch": 1.83377308707124, "step": 1390}, {"loss": 1.1175, "grad_norm": 0.529797375202179, "learning_rate": 0.0002, "epoch": 1.8469656992084431, "step": 1400}, {"loss": 1.0577, "grad_norm": 0.481567919254303, "learning_rate": 0.0002, "epoch": 1.8601583113456464, "step": 1410}, {"loss": 1.0581, "grad_norm": 0.34787362813949585, "learning_rate": 0.0002, "epoch": 1.8733509234828496, "step": 1420}, {"loss": 1.0536, "grad_norm": 0.6402362585067749, "learning_rate": 0.0002, "epoch": 1.8865435356200528, "step": 1430}, {"loss": 1.0787, "grad_norm": 0.3461322784423828, "learning_rate": 0.0002, "epoch": 1.899736147757256, "step": 1440}, {"loss": 1.0925, "grad_norm": 0.44005653262138367, "learning_rate": 0.0002, "epoch": 1.912928759894459, "step": 1450}, {"loss": 1.0414, "grad_norm": 0.4064280688762665, "learning_rate": 0.0002, "epoch": 1.9261213720316621, "step": 1460}, {"loss": 1.0608, "grad_norm": 0.5236523151397705, "learning_rate": 0.0002, "epoch": 1.9393139841688654, "step": 1470}, {"loss": 1.0572, "grad_norm": 0.41030219197273254, "learning_rate": 0.0002, "epoch": 1.9525065963060686, "step": 1480}, {"loss": 1.1204, "grad_norm": 0.39805835485458374, "learning_rate": 0.0002, "epoch": 1.9656992084432718, "step": 1490}, {"loss": 1.0364, "grad_norm": 0.42974501848220825, "learning_rate": 0.0002, "epoch": 1.978891820580475, "step": 1500}, {"loss": 1.1169, "grad_norm": 0.4688243865966797, "learning_rate": 0.0002, "epoch": 1.992084432717678, "step": 1510}, {"eval_loss": 1.1874967813491821, "eval_runtime": 71.9523, "eval_samples_per_second": 5.99, "eval_steps_per_second": 0.75, "epoch": 2.0, "step": 1516}, {"loss": 1.0401, "grad_norm": 0.4121631383895874, "learning_rate": 0.0002, "epoch": 2.005277044854881, "step": 1520}, {"loss": 0.9384, "grad_norm": 0.4844197928905487, "learning_rate": 0.0002, "epoch": 2.0184696569920844, "step": 1530}, {"loss": 0.8686, "grad_norm": 0.45408546924591064, "learning_rate": 0.0002, "epoch": 2.0316622691292876, "step": 1540}, {"loss": 1.0399, "grad_norm": 0.48662951588630676, "learning_rate": 0.0002, "epoch": 2.044854881266491, "step": 1550}, {"loss": 0.936, "grad_norm": 0.7195899486541748, "learning_rate": 0.0002, "epoch": 2.058047493403694, "step": 1560}, {"loss": 0.9486, "grad_norm": 0.5071077346801758, "learning_rate": 0.0002, "epoch": 2.0712401055408973, "step": 1570}, {"loss": 1.0055, "grad_norm": 0.7473958730697632, "learning_rate": 0.0002, "epoch": 2.0844327176781, "step": 1580}, {"loss": 0.8309, "grad_norm": 0.5509232878684998, "learning_rate": 0.0002, "epoch": 2.0976253298153034, "step": 1590}, {"loss": 0.9181, "grad_norm": 0.5108042359352112, "learning_rate": 0.0002, "epoch": 2.1108179419525066, "step": 1600}, {"loss": 0.9499, "grad_norm": 0.42331448197364807, "learning_rate": 0.0002, "epoch": 2.12401055408971, "step": 1610}, {"loss": 0.9359, "grad_norm": 0.46621623635292053, "learning_rate": 0.0002, "epoch": 2.137203166226913, "step": 1620}, {"loss": 0.9065, "grad_norm": 0.43802836537361145, "learning_rate": 0.0002, "epoch": 2.150395778364116, "step": 1630}, {"loss": 0.9375, "grad_norm": 0.49908021092414856, "learning_rate": 0.0002, "epoch": 2.163588390501319, "step": 1640}, {"loss": 1.0389, "grad_norm": 0.4195636808872223, "learning_rate": 0.0002, "epoch": 2.1767810026385224, "step": 1650}, {"loss": 0.8501, "grad_norm": 0.49515822529792786, "learning_rate": 0.0002, "epoch": 2.1899736147757256, "step": 1660}, {"loss": 1.0557, "grad_norm": 0.4607589542865753, "learning_rate": 0.0002, "epoch": 2.203166226912929, "step": 1670}, {"loss": 1.0206, "grad_norm": 0.4489196836948395, "learning_rate": 0.0002, "epoch": 2.216358839050132, "step": 1680}, {"loss": 0.9657, "grad_norm": 0.49300864338874817, "learning_rate": 0.0002, "epoch": 2.229551451187335, "step": 1690}, {"loss": 0.8371, "grad_norm": 0.6624954342842102, "learning_rate": 0.0002, "epoch": 2.242744063324538, "step": 1700}, {"loss": 0.8555, "grad_norm": 0.8391500115394592, "learning_rate": 0.0002, "epoch": 2.2559366754617414, "step": 1710}, {"loss": 1.0113, "grad_norm": 0.5193073749542236, "learning_rate": 0.0002, "epoch": 2.2691292875989446, "step": 1720}, {"loss": 0.9979, "grad_norm": 0.6180613040924072, "learning_rate": 0.0002, "epoch": 2.282321899736148, "step": 1730}, {"loss": 0.9579, "grad_norm": 0.591191291809082, "learning_rate": 0.0002, "epoch": 2.295514511873351, "step": 1740}, {"loss": 0.8879, "grad_norm": 0.546897828578949, "learning_rate": 0.0002, "epoch": 2.308707124010554, "step": 1750}, {"loss": 0.9321, "grad_norm": 0.5470401644706726, "learning_rate": 0.0002, "epoch": 2.321899736147757, "step": 1760}, {"loss": 0.9104, "grad_norm": 0.4590282738208771, "learning_rate": 0.0002, "epoch": 2.3350923482849604, "step": 1770}, {"loss": 1.0384, "grad_norm": 0.622164785861969, "learning_rate": 0.0002, "epoch": 2.3482849604221636, "step": 1780}, {"loss": 1.0297, "grad_norm": 0.5753812193870544, "learning_rate": 0.0002, "epoch": 2.361477572559367, "step": 1790}, {"loss": 0.9569, "grad_norm": 0.47958624362945557, "learning_rate": 0.0002, "epoch": 2.37467018469657, "step": 1800}, {"loss": 1.0519, "grad_norm": 0.48042672872543335, "learning_rate": 0.0002, "epoch": 2.387862796833773, "step": 1810}, {"loss": 0.9042, "grad_norm": 0.44586366415023804, "learning_rate": 0.0002, "epoch": 2.401055408970976, "step": 1820}, {"loss": 0.9783, "grad_norm": 0.7239416837692261, "learning_rate": 0.0002, "epoch": 2.4142480211081794, "step": 1830}, {"loss": 0.8818, "grad_norm": 0.5515341758728027, "learning_rate": 0.0002, "epoch": 2.4274406332453826, "step": 1840}, {"loss": 0.9503, "grad_norm": 0.6280064582824707, "learning_rate": 0.0002, "epoch": 2.440633245382586, "step": 1850}, {"loss": 0.8943, "grad_norm": 0.4832057058811188, "learning_rate": 0.0002, "epoch": 2.453825857519789, "step": 1860}, {"loss": 0.8744, "grad_norm": 0.5789321064949036, "learning_rate": 0.0002, "epoch": 2.467018469656992, "step": 1870}, {"loss": 0.9332, "grad_norm": 0.48491886258125305, "learning_rate": 0.0002, "epoch": 2.480211081794195, "step": 1880}, {"loss": 0.943, "grad_norm": 0.532365620136261, "learning_rate": 0.0002, "epoch": 2.4934036939313984, "step": 1890}, {"loss": 0.9734, "grad_norm": 0.7087852954864502, "learning_rate": 0.0002, "epoch": 2.5065963060686016, "step": 1900}, {"loss": 0.9767, "grad_norm": 0.48157402873039246, "learning_rate": 0.0002, "epoch": 2.519788918205805, "step": 1910}, {"loss": 0.9851, "grad_norm": 0.5886041522026062, "learning_rate": 0.0002, "epoch": 2.532981530343008, "step": 1920}, {"loss": 1.0144, "grad_norm": 0.6332622766494751, "learning_rate": 0.0002, "epoch": 2.5461741424802113, "step": 1930}, {"loss": 0.9516, "grad_norm": 0.5463117957115173, "learning_rate": 0.0002, "epoch": 2.559366754617414, "step": 1940}, {"loss": 0.9373, "grad_norm": 0.5432228446006775, "learning_rate": 0.0002, "epoch": 2.5725593667546174, "step": 1950}, {"loss": 0.8974, "grad_norm": 0.5929186940193176, "learning_rate": 0.0002, "epoch": 2.5857519788918206, "step": 1960}, {"loss": 1.0062, "grad_norm": 0.5120641589164734, "learning_rate": 0.0002, "epoch": 2.598944591029024, "step": 1970}, {"loss": 0.9143, "grad_norm": 0.5372339487075806, "learning_rate": 0.0002, "epoch": 2.6121372031662267, "step": 1980}, {"loss": 1.0786, "grad_norm": 0.5519838929176331, "learning_rate": 0.0002, "epoch": 2.62532981530343, "step": 1990}, {"loss": 1.021, "grad_norm": 0.7304037809371948, "learning_rate": 0.0002, "epoch": 2.638522427440633, "step": 2000}, {"loss": 0.8708, "grad_norm": 0.6182340979576111, "learning_rate": 0.0002, "epoch": 2.6517150395778364, "step": 2010}, {"loss": 1.0501, "grad_norm": 0.4874444305896759, "learning_rate": 0.0002, "epoch": 2.6649076517150396, "step": 2020}, {"loss": 0.9612, "grad_norm": 0.5850239396095276, "learning_rate": 0.0002, "epoch": 2.678100263852243, "step": 2030}, {"loss": 0.9808, "grad_norm": 0.6495311856269836, "learning_rate": 0.0002, "epoch": 2.691292875989446, "step": 2040}, {"loss": 1.0187, "grad_norm": 1.002830147743225, "learning_rate": 0.0002, "epoch": 2.7044854881266494, "step": 2050}, {"loss": 1.0386, "grad_norm": 0.49076753854751587, "learning_rate": 0.0002, "epoch": 2.717678100263852, "step": 2060}, {"loss": 0.9416, "grad_norm": 0.4736326336860657, "learning_rate": 0.0002, "epoch": 2.7308707124010554, "step": 2070}, {"loss": 0.8424, "grad_norm": 0.5527601838111877, "learning_rate": 0.0002, "epoch": 2.7440633245382586, "step": 2080}, {"loss": 0.9149, "grad_norm": 0.7295718193054199, "learning_rate": 0.0002, "epoch": 2.757255936675462, "step": 2090}, {"loss": 0.9032, "grad_norm": 0.5437536835670471, "learning_rate": 0.0002, "epoch": 2.7704485488126647, "step": 2100}, {"loss": 0.8254, "grad_norm": 0.5997128486633301, "learning_rate": 0.0002, "epoch": 2.783641160949868, "step": 2110}, {"loss": 0.976, "grad_norm": 0.6498191356658936, "learning_rate": 0.0002, "epoch": 2.796833773087071, "step": 2120}, {"loss": 0.9543, "grad_norm": 0.5237268805503845, "learning_rate": 0.0002, "epoch": 2.8100263852242744, "step": 2130}, {"loss": 0.9302, "grad_norm": 0.6033027172088623, "learning_rate": 0.0002, "epoch": 2.8232189973614776, "step": 2140}, {"loss": 0.9625, "grad_norm": 0.6077138781547546, "learning_rate": 0.0002, "epoch": 2.836411609498681, "step": 2150}, {"loss": 0.9347, "grad_norm": 0.4127797484397888, "learning_rate": 0.0002, "epoch": 2.849604221635884, "step": 2160}, {"loss": 1.0459, "grad_norm": 0.8448635339736938, "learning_rate": 0.0002, "epoch": 2.862796833773087, "step": 2170}, {"loss": 0.8185, "grad_norm": 0.5669729113578796, "learning_rate": 0.0002, "epoch": 2.87598944591029, "step": 2180}, {"loss": 0.8555, "grad_norm": 0.510231077671051, "learning_rate": 0.0002, "epoch": 2.8891820580474934, "step": 2190}, {"loss": 0.9267, "grad_norm": 0.8072245121002197, "learning_rate": 0.0002, "epoch": 2.9023746701846966, "step": 2200}, {"loss": 0.9685, "grad_norm": 0.6055923104286194, "learning_rate": 0.0002, "epoch": 2.9155672823219, "step": 2210}, {"loss": 0.9157, "grad_norm": 0.7384416460990906, "learning_rate": 0.0002, "epoch": 2.9287598944591027, "step": 2220}, {"loss": 0.9712, "grad_norm": 0.4922751784324646, "learning_rate": 0.0002, "epoch": 2.941952506596306, "step": 2230}, {"loss": 0.9487, "grad_norm": 0.6039906740188599, "learning_rate": 0.0002, "epoch": 2.955145118733509, "step": 2240}, {"loss": 0.9776, "grad_norm": 0.4751701354980469, "learning_rate": 0.0002, "epoch": 2.9683377308707124, "step": 2250}, {"loss": 1.0619, "grad_norm": 0.5698353052139282, "learning_rate": 0.0002, "epoch": 2.9815303430079156, "step": 2260}, {"loss": 1.1184, "grad_norm": 0.893563449382782, "learning_rate": 0.0002, "epoch": 2.994722955145119, "step": 2270}]} +{"epoch": 4.0, "step": 3032, "epoch_duration": 2069.750212907791, "total_accumulated_duration": 9012.241269350052, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.9466, "grad_norm": 0.7545632123947144, "learning_rate": 0.0002, "epoch": 0.013192612137203167, "step": 10}, {"loss": 1.4909, "grad_norm": 0.5787661075592041, "learning_rate": 0.0002, "epoch": 0.026385224274406333, "step": 20}, {"loss": 1.3906, "grad_norm": 0.8616093993186951, "learning_rate": 0.0002, "epoch": 0.0395778364116095, "step": 30}, {"loss": 1.4116, "grad_norm": 0.42088547348976135, "learning_rate": 0.0002, "epoch": 0.052770448548812667, "step": 40}, {"loss": 1.3446, "grad_norm": 0.47704678773880005, "learning_rate": 0.0002, "epoch": 0.06596306068601583, "step": 50}, {"loss": 1.2476, "grad_norm": 0.5763994455337524, "learning_rate": 0.0002, "epoch": 0.079155672823219, "step": 60}, {"loss": 1.2268, "grad_norm": 0.4579846262931824, "learning_rate": 0.0002, "epoch": 0.09234828496042216, "step": 70}, {"loss": 1.415, "grad_norm": 0.46623846888542175, "learning_rate": 0.0002, "epoch": 0.10554089709762533, "step": 80}, {"loss": 1.2849, "grad_norm": 0.4206956624984741, "learning_rate": 0.0002, "epoch": 0.11873350923482849, "step": 90}, {"loss": 1.1608, "grad_norm": 0.41896629333496094, "learning_rate": 0.0002, "epoch": 0.13192612137203166, "step": 100}, {"loss": 1.2617, "grad_norm": 0.3459089398384094, "learning_rate": 0.0002, "epoch": 0.14511873350923482, "step": 110}, {"loss": 1.2858, "grad_norm": 0.4587327837944031, "learning_rate": 0.0002, "epoch": 0.158311345646438, "step": 120}, {"loss": 1.2241, "grad_norm": 0.433525413274765, "learning_rate": 0.0002, "epoch": 0.17150395778364116, "step": 130}, {"loss": 1.2269, "grad_norm": 0.39253175258636475, "learning_rate": 0.0002, "epoch": 0.18469656992084432, "step": 140}, {"loss": 1.2224, "grad_norm": 0.3602290749549866, "learning_rate": 0.0002, "epoch": 0.19788918205804748, "step": 150}, {"loss": 1.2834, "grad_norm": 0.41160839796066284, "learning_rate": 0.0002, "epoch": 0.21108179419525067, "step": 160}, {"loss": 1.1986, "grad_norm": 0.7213630080223083, "learning_rate": 0.0002, "epoch": 0.22427440633245382, "step": 170}, {"loss": 1.2215, "grad_norm": 0.39086055755615234, "learning_rate": 0.0002, "epoch": 0.23746701846965698, "step": 180}, {"loss": 1.3315, "grad_norm": 0.4465520977973938, "learning_rate": 0.0002, "epoch": 0.25065963060686014, "step": 190}, {"loss": 1.3798, "grad_norm": 1.814679741859436, "learning_rate": 0.0002, "epoch": 0.2638522427440633, "step": 200}, {"loss": 1.2259, "grad_norm": 0.5026423931121826, "learning_rate": 0.0002, "epoch": 0.2770448548812665, "step": 210}, {"loss": 1.3306, "grad_norm": 0.4156292974948883, "learning_rate": 0.0002, "epoch": 0.29023746701846964, "step": 220}, {"loss": 1.266, "grad_norm": 0.40813493728637695, "learning_rate": 0.0002, "epoch": 0.3034300791556728, "step": 230}, {"loss": 1.1533, "grad_norm": 0.3304787874221802, "learning_rate": 0.0002, "epoch": 0.316622691292876, "step": 240}, {"loss": 1.3154, "grad_norm": 0.46139976382255554, "learning_rate": 0.0002, "epoch": 0.32981530343007914, "step": 250}, {"loss": 1.1365, "grad_norm": 0.37518271803855896, "learning_rate": 0.0002, "epoch": 0.34300791556728233, "step": 260}, {"loss": 1.2004, "grad_norm": 0.35586467385292053, "learning_rate": 0.0002, "epoch": 0.3562005277044855, "step": 270}, {"loss": 1.2609, "grad_norm": 0.32441186904907227, "learning_rate": 0.0002, "epoch": 0.36939313984168864, "step": 280}, {"loss": 1.212, "grad_norm": 0.3198683261871338, "learning_rate": 0.0002, "epoch": 0.38258575197889183, "step": 290}, {"loss": 1.2465, "grad_norm": 0.33663108944892883, "learning_rate": 0.0002, "epoch": 0.39577836411609496, "step": 300}, {"loss": 1.1702, "grad_norm": 0.3711244761943817, "learning_rate": 0.0002, "epoch": 0.40897097625329815, "step": 310}, {"loss": 1.0871, "grad_norm": 0.3209651708602905, "learning_rate": 0.0002, "epoch": 0.42216358839050133, "step": 320}, {"loss": 1.2728, "grad_norm": 0.5152716040611267, "learning_rate": 0.0002, "epoch": 0.43535620052770446, "step": 330}, {"loss": 1.1833, "grad_norm": 0.5431376695632935, "learning_rate": 0.0002, "epoch": 0.44854881266490765, "step": 340}, {"loss": 1.1344, "grad_norm": 0.3069997727870941, "learning_rate": 0.0002, "epoch": 0.46174142480211083, "step": 350}, {"loss": 1.2742, "grad_norm": 0.34260064363479614, "learning_rate": 0.0002, "epoch": 0.47493403693931396, "step": 360}, {"loss": 1.3161, "grad_norm": 0.345653235912323, "learning_rate": 0.0002, "epoch": 0.48812664907651715, "step": 370}, {"loss": 1.1552, "grad_norm": 0.46222734451293945, "learning_rate": 0.0002, "epoch": 0.5013192612137203, "step": 380}, {"loss": 1.0616, "grad_norm": 0.27301734685897827, "learning_rate": 0.0002, "epoch": 0.5145118733509235, "step": 390}, {"loss": 1.1944, "grad_norm": 0.29048439860343933, "learning_rate": 0.0002, "epoch": 0.5277044854881267, "step": 400}, {"loss": 1.0956, "grad_norm": 0.32927802205085754, "learning_rate": 0.0002, "epoch": 0.5408970976253298, "step": 410}, {"loss": 1.2362, "grad_norm": 0.3336397409439087, "learning_rate": 0.0002, "epoch": 0.554089709762533, "step": 420}, {"loss": 1.1445, "grad_norm": 0.4007597267627716, "learning_rate": 0.0002, "epoch": 0.5672823218997362, "step": 430}, {"loss": 1.2731, "grad_norm": 0.36144956946372986, "learning_rate": 0.0002, "epoch": 0.5804749340369393, "step": 440}, {"loss": 1.1604, "grad_norm": 0.6331009864807129, "learning_rate": 0.0002, "epoch": 0.5936675461741425, "step": 450}, {"loss": 1.1692, "grad_norm": 0.41469088196754456, "learning_rate": 0.0002, "epoch": 0.6068601583113457, "step": 460}, {"loss": 1.2372, "grad_norm": 0.4388185143470764, "learning_rate": 0.0002, "epoch": 0.6200527704485488, "step": 470}, {"loss": 1.2541, "grad_norm": 0.3738141655921936, "learning_rate": 0.0002, "epoch": 0.633245382585752, "step": 480}, {"loss": 1.1265, "grad_norm": 0.7212023138999939, "learning_rate": 0.0002, "epoch": 0.6464379947229552, "step": 490}, {"loss": 1.2786, "grad_norm": 0.2972351014614105, "learning_rate": 0.0002, "epoch": 0.6596306068601583, "step": 500}, {"loss": 1.1739, "grad_norm": 0.45293179154396057, "learning_rate": 0.0002, "epoch": 0.6728232189973615, "step": 510}, {"loss": 1.255, "grad_norm": 0.4319860637187958, "learning_rate": 0.0002, "epoch": 0.6860158311345647, "step": 520}, {"loss": 1.0959, "grad_norm": 0.3050215542316437, "learning_rate": 0.0002, "epoch": 0.6992084432717678, "step": 530}, {"loss": 1.1608, "grad_norm": 0.3552611172199249, "learning_rate": 0.0002, "epoch": 0.712401055408971, "step": 540}, {"loss": 1.2546, "grad_norm": 0.3631151020526886, "learning_rate": 0.0002, "epoch": 0.7255936675461742, "step": 550}, {"loss": 1.1371, "grad_norm": 0.28177931904792786, "learning_rate": 0.0002, "epoch": 0.7387862796833773, "step": 560}, {"loss": 1.2081, "grad_norm": 0.359764039516449, "learning_rate": 0.0002, "epoch": 0.7519788918205804, "step": 570}, {"loss": 1.1356, "grad_norm": 0.3970327377319336, "learning_rate": 0.0002, "epoch": 0.7651715039577837, "step": 580}, {"loss": 1.309, "grad_norm": 0.3541001081466675, "learning_rate": 0.0002, "epoch": 0.7783641160949868, "step": 590}, {"loss": 1.2061, "grad_norm": 0.3478573262691498, "learning_rate": 0.0002, "epoch": 0.7915567282321899, "step": 600}, {"loss": 1.1864, "grad_norm": 0.3900321424007416, "learning_rate": 0.0002, "epoch": 0.8047493403693932, "step": 610}, {"loss": 1.1358, "grad_norm": 0.3443238437175751, "learning_rate": 0.0002, "epoch": 0.8179419525065963, "step": 620}, {"loss": 1.3232, "grad_norm": 0.44238781929016113, "learning_rate": 0.0002, "epoch": 0.8311345646437994, "step": 630}, {"loss": 1.1247, "grad_norm": 0.36339467763900757, "learning_rate": 0.0002, "epoch": 0.8443271767810027, "step": 640}, {"loss": 1.1049, "grad_norm": 0.6243070363998413, "learning_rate": 0.0002, "epoch": 0.8575197889182058, "step": 650}, {"loss": 1.1943, "grad_norm": 0.3209173381328583, "learning_rate": 0.0002, "epoch": 0.8707124010554089, "step": 660}, {"loss": 1.1927, "grad_norm": 0.35017991065979004, "learning_rate": 0.0002, "epoch": 0.8839050131926122, "step": 670}, {"loss": 1.1603, "grad_norm": 0.3247159421443939, "learning_rate": 0.0002, "epoch": 0.8970976253298153, "step": 680}, {"loss": 1.2298, "grad_norm": 0.4091894030570984, "learning_rate": 0.0002, "epoch": 0.9102902374670184, "step": 690}, {"loss": 1.2756, "grad_norm": 0.3975585997104645, "learning_rate": 0.0002, "epoch": 0.9234828496042217, "step": 700}, {"loss": 1.281, "grad_norm": 0.3666245937347412, "learning_rate": 0.0002, "epoch": 0.9366754617414248, "step": 710}, {"loss": 1.2855, "grad_norm": 0.45216917991638184, "learning_rate": 0.0002, "epoch": 0.9498680738786279, "step": 720}, {"loss": 1.2217, "grad_norm": 0.36108118295669556, "learning_rate": 0.0002, "epoch": 0.9630606860158312, "step": 730}, {"loss": 1.17, "grad_norm": 0.44550251960754395, "learning_rate": 0.0002, "epoch": 0.9762532981530343, "step": 740}, {"loss": 1.0672, "grad_norm": 0.29801255464553833, "learning_rate": 0.0002, "epoch": 0.9894459102902374, "step": 750}, {"eval_loss": 1.203244686126709, "eval_runtime": 76.0457, "eval_samples_per_second": 5.668, "eval_steps_per_second": 0.71, "epoch": 1.0, "step": 758}, {"loss": 1.0748, "grad_norm": 0.4096551239490509, "learning_rate": 0.0002, "epoch": 1.0026385224274406, "step": 760}, {"loss": 1.1537, "grad_norm": 0.2649582326412201, "learning_rate": 0.0002, "epoch": 1.0158311345646438, "step": 770}, {"loss": 1.1676, "grad_norm": 0.3100722134113312, "learning_rate": 0.0002, "epoch": 1.029023746701847, "step": 780}, {"loss": 1.1736, "grad_norm": 0.3911755383014679, "learning_rate": 0.0002, "epoch": 1.04221635883905, "step": 790}, {"loss": 1.0354, "grad_norm": 0.4600953757762909, "learning_rate": 0.0002, "epoch": 1.0554089709762533, "step": 800}, {"loss": 1.2869, "grad_norm": 0.28671619296073914, "learning_rate": 0.0002, "epoch": 1.0686015831134565, "step": 810}, {"loss": 1.1441, "grad_norm": 0.47282642126083374, "learning_rate": 0.0002, "epoch": 1.0817941952506596, "step": 820}, {"loss": 1.1368, "grad_norm": 0.690073549747467, "learning_rate": 0.0002, "epoch": 1.0949868073878628, "step": 830}, {"loss": 0.9944, "grad_norm": 0.7317902445793152, "learning_rate": 0.0002, "epoch": 1.108179419525066, "step": 840}, {"loss": 1.108, "grad_norm": 0.44215938448905945, "learning_rate": 0.0002, "epoch": 1.121372031662269, "step": 850}, {"loss": 1.0558, "grad_norm": 0.33875149488449097, "learning_rate": 0.0002, "epoch": 1.1345646437994723, "step": 860}, {"loss": 1.0471, "grad_norm": 0.3700002431869507, "learning_rate": 0.0002, "epoch": 1.1477572559366755, "step": 870}, {"loss": 1.0121, "grad_norm": 0.41173291206359863, "learning_rate": 0.0002, "epoch": 1.1609498680738786, "step": 880}, {"loss": 1.0312, "grad_norm": 0.5253589749336243, "learning_rate": 0.0002, "epoch": 1.1741424802110818, "step": 890}, {"loss": 1.2238, "grad_norm": 0.3912237286567688, "learning_rate": 0.0002, "epoch": 1.187335092348285, "step": 900}, {"loss": 0.9434, "grad_norm": 0.40990331768989563, "learning_rate": 0.0002, "epoch": 1.200527704485488, "step": 910}, {"loss": 1.0493, "grad_norm": 0.40377968549728394, "learning_rate": 0.0002, "epoch": 1.2137203166226913, "step": 920}, {"loss": 1.1496, "grad_norm": 0.4605846405029297, "learning_rate": 0.0002, "epoch": 1.2269129287598945, "step": 930}, {"loss": 1.1173, "grad_norm": 0.31564897298812866, "learning_rate": 0.0002, "epoch": 1.2401055408970976, "step": 940}, {"loss": 1.0547, "grad_norm": 0.39808550477027893, "learning_rate": 0.0002, "epoch": 1.2532981530343008, "step": 950}, {"loss": 1.1367, "grad_norm": 0.3762115240097046, "learning_rate": 0.0002, "epoch": 1.266490765171504, "step": 960}, {"loss": 1.1596, "grad_norm": 0.4174984097480774, "learning_rate": 0.0002, "epoch": 1.279683377308707, "step": 970}, {"loss": 1.1327, "grad_norm": 0.5263054966926575, "learning_rate": 0.0002, "epoch": 1.2928759894459103, "step": 980}, {"loss": 1.0339, "grad_norm": 0.41673699021339417, "learning_rate": 0.0002, "epoch": 1.3060686015831133, "step": 990}, {"loss": 1.1198, "grad_norm": 0.9613684415817261, "learning_rate": 0.0002, "epoch": 1.3192612137203166, "step": 1000}, {"loss": 1.0444, "grad_norm": 0.3690216839313507, "learning_rate": 0.0002, "epoch": 1.3324538258575198, "step": 1010}, {"loss": 1.0473, "grad_norm": 0.521821141242981, "learning_rate": 0.0002, "epoch": 1.345646437994723, "step": 1020}, {"loss": 1.1065, "grad_norm": 0.3353094160556793, "learning_rate": 0.0002, "epoch": 1.358839050131926, "step": 1030}, {"loss": 1.1286, "grad_norm": 0.3843843936920166, "learning_rate": 0.0002, "epoch": 1.3720316622691293, "step": 1040}, {"loss": 1.1369, "grad_norm": 0.372514545917511, "learning_rate": 0.0002, "epoch": 1.3852242744063323, "step": 1050}, {"loss": 1.0041, "grad_norm": 0.34537771344184875, "learning_rate": 0.0002, "epoch": 1.3984168865435356, "step": 1060}, {"loss": 1.0251, "grad_norm": 0.45349085330963135, "learning_rate": 0.0002, "epoch": 1.4116094986807388, "step": 1070}, {"loss": 1.1873, "grad_norm": 0.5120177268981934, "learning_rate": 0.0002, "epoch": 1.424802110817942, "step": 1080}, {"loss": 1.0872, "grad_norm": 0.42800238728523254, "learning_rate": 0.0002, "epoch": 1.437994722955145, "step": 1090}, {"loss": 1.0734, "grad_norm": 0.343832790851593, "learning_rate": 0.0002, "epoch": 1.4511873350923483, "step": 1100}, {"loss": 1.1286, "grad_norm": 0.3829841911792755, "learning_rate": 0.0002, "epoch": 1.4643799472295513, "step": 1110}, {"loss": 1.1268, "grad_norm": 0.4289931058883667, "learning_rate": 0.0002, "epoch": 1.4775725593667546, "step": 1120}, {"loss": 1.0676, "grad_norm": 0.42750850319862366, "learning_rate": 0.0002, "epoch": 1.4907651715039578, "step": 1130}, {"loss": 1.072, "grad_norm": 0.34328413009643555, "learning_rate": 0.0002, "epoch": 1.503957783641161, "step": 1140}, {"loss": 1.0863, "grad_norm": 0.349096417427063, "learning_rate": 0.0002, "epoch": 1.517150395778364, "step": 1150}, {"loss": 1.1462, "grad_norm": 0.7700717449188232, "learning_rate": 0.0002, "epoch": 1.5303430079155673, "step": 1160}, {"loss": 0.9528, "grad_norm": 0.39294949173927307, "learning_rate": 0.0002, "epoch": 1.5435356200527703, "step": 1170}, {"loss": 1.1603, "grad_norm": 0.36173608899116516, "learning_rate": 0.0002, "epoch": 1.5567282321899736, "step": 1180}, {"loss": 1.1508, "grad_norm": 0.6034277677536011, "learning_rate": 0.0002, "epoch": 1.5699208443271768, "step": 1190}, {"loss": 1.0105, "grad_norm": 0.36694103479385376, "learning_rate": 0.0002, "epoch": 1.58311345646438, "step": 1200}, {"loss": 1.1479, "grad_norm": 0.4727209508419037, "learning_rate": 0.0002, "epoch": 1.596306068601583, "step": 1210}, {"loss": 1.0689, "grad_norm": 0.6482883095741272, "learning_rate": 0.0002, "epoch": 1.6094986807387863, "step": 1220}, {"loss": 1.1405, "grad_norm": 0.5238035917282104, "learning_rate": 0.0002, "epoch": 1.6226912928759893, "step": 1230}, {"loss": 1.0596, "grad_norm": 0.4812222421169281, "learning_rate": 0.0002, "epoch": 1.6358839050131926, "step": 1240}, {"loss": 1.2729, "grad_norm": 0.7131702303886414, "learning_rate": 0.0002, "epoch": 1.6490765171503958, "step": 1250}, {"loss": 0.9832, "grad_norm": 0.3803327977657318, "learning_rate": 0.0002, "epoch": 1.662269129287599, "step": 1260}, {"loss": 1.1433, "grad_norm": 0.3745088577270508, "learning_rate": 0.0002, "epoch": 1.675461741424802, "step": 1270}, {"loss": 1.1018, "grad_norm": 0.4427378475666046, "learning_rate": 0.0002, "epoch": 1.6886543535620053, "step": 1280}, {"loss": 1.0619, "grad_norm": 0.797478973865509, "learning_rate": 0.0002, "epoch": 1.7018469656992083, "step": 1290}, {"loss": 1.0853, "grad_norm": 0.503620982170105, "learning_rate": 0.0002, "epoch": 1.7150395778364116, "step": 1300}, {"loss": 1.1324, "grad_norm": 0.4132426381111145, "learning_rate": 0.0002, "epoch": 1.7282321899736148, "step": 1310}, {"loss": 1.172, "grad_norm": 0.41811656951904297, "learning_rate": 0.0002, "epoch": 1.741424802110818, "step": 1320}, {"loss": 1.0903, "grad_norm": 0.40647849440574646, "learning_rate": 0.0002, "epoch": 1.754617414248021, "step": 1330}, {"loss": 1.1316, "grad_norm": 0.42138347029685974, "learning_rate": 0.0002, "epoch": 1.767810026385224, "step": 1340}, {"loss": 0.9955, "grad_norm": 0.46523579955101013, "learning_rate": 0.0002, "epoch": 1.7810026385224274, "step": 1350}, {"loss": 1.0307, "grad_norm": 0.39760419726371765, "learning_rate": 0.0002, "epoch": 1.7941952506596306, "step": 1360}, {"loss": 1.1218, "grad_norm": 0.37993717193603516, "learning_rate": 0.0002, "epoch": 1.8073878627968338, "step": 1370}, {"loss": 1.0921, "grad_norm": 0.5404181480407715, "learning_rate": 0.0002, "epoch": 1.820580474934037, "step": 1380}, {"loss": 1.0903, "grad_norm": 0.4385245740413666, "learning_rate": 0.0002, "epoch": 1.83377308707124, "step": 1390}, {"loss": 1.1175, "grad_norm": 0.529797375202179, "learning_rate": 0.0002, "epoch": 1.8469656992084431, "step": 1400}, {"loss": 1.0577, "grad_norm": 0.481567919254303, "learning_rate": 0.0002, "epoch": 1.8601583113456464, "step": 1410}, {"loss": 1.0581, "grad_norm": 0.34787362813949585, "learning_rate": 0.0002, "epoch": 1.8733509234828496, "step": 1420}, {"loss": 1.0536, "grad_norm": 0.6402362585067749, "learning_rate": 0.0002, "epoch": 1.8865435356200528, "step": 1430}, {"loss": 1.0787, "grad_norm": 0.3461322784423828, "learning_rate": 0.0002, "epoch": 1.899736147757256, "step": 1440}, {"loss": 1.0925, "grad_norm": 0.44005653262138367, "learning_rate": 0.0002, "epoch": 1.912928759894459, "step": 1450}, {"loss": 1.0414, "grad_norm": 0.4064280688762665, "learning_rate": 0.0002, "epoch": 1.9261213720316621, "step": 1460}, {"loss": 1.0608, "grad_norm": 0.5236523151397705, "learning_rate": 0.0002, "epoch": 1.9393139841688654, "step": 1470}, {"loss": 1.0572, "grad_norm": 0.41030219197273254, "learning_rate": 0.0002, "epoch": 1.9525065963060686, "step": 1480}, {"loss": 1.1204, "grad_norm": 0.39805835485458374, "learning_rate": 0.0002, "epoch": 1.9656992084432718, "step": 1490}, {"loss": 1.0364, "grad_norm": 0.42974501848220825, "learning_rate": 0.0002, "epoch": 1.978891820580475, "step": 1500}, {"loss": 1.1169, "grad_norm": 0.4688243865966797, "learning_rate": 0.0002, "epoch": 1.992084432717678, "step": 1510}, {"eval_loss": 1.1874967813491821, "eval_runtime": 71.9523, "eval_samples_per_second": 5.99, "eval_steps_per_second": 0.75, "epoch": 2.0, "step": 1516}, {"loss": 1.0401, "grad_norm": 0.4121631383895874, "learning_rate": 0.0002, "epoch": 2.005277044854881, "step": 1520}, {"loss": 0.9384, "grad_norm": 0.4844197928905487, "learning_rate": 0.0002, "epoch": 2.0184696569920844, "step": 1530}, {"loss": 0.8686, "grad_norm": 0.45408546924591064, "learning_rate": 0.0002, "epoch": 2.0316622691292876, "step": 1540}, {"loss": 1.0399, "grad_norm": 0.48662951588630676, "learning_rate": 0.0002, "epoch": 2.044854881266491, "step": 1550}, {"loss": 0.936, "grad_norm": 0.7195899486541748, "learning_rate": 0.0002, "epoch": 2.058047493403694, "step": 1560}, {"loss": 0.9486, "grad_norm": 0.5071077346801758, "learning_rate": 0.0002, "epoch": 2.0712401055408973, "step": 1570}, {"loss": 1.0055, "grad_norm": 0.7473958730697632, "learning_rate": 0.0002, "epoch": 2.0844327176781, "step": 1580}, {"loss": 0.8309, "grad_norm": 0.5509232878684998, "learning_rate": 0.0002, "epoch": 2.0976253298153034, "step": 1590}, {"loss": 0.9181, "grad_norm": 0.5108042359352112, "learning_rate": 0.0002, "epoch": 2.1108179419525066, "step": 1600}, {"loss": 0.9499, "grad_norm": 0.42331448197364807, "learning_rate": 0.0002, "epoch": 2.12401055408971, "step": 1610}, {"loss": 0.9359, "grad_norm": 0.46621623635292053, "learning_rate": 0.0002, "epoch": 2.137203166226913, "step": 1620}, {"loss": 0.9065, "grad_norm": 0.43802836537361145, "learning_rate": 0.0002, "epoch": 2.150395778364116, "step": 1630}, {"loss": 0.9375, "grad_norm": 0.49908021092414856, "learning_rate": 0.0002, "epoch": 2.163588390501319, "step": 1640}, {"loss": 1.0389, "grad_norm": 0.4195636808872223, "learning_rate": 0.0002, "epoch": 2.1767810026385224, "step": 1650}, {"loss": 0.8501, "grad_norm": 0.49515822529792786, "learning_rate": 0.0002, "epoch": 2.1899736147757256, "step": 1660}, {"loss": 1.0557, "grad_norm": 0.4607589542865753, "learning_rate": 0.0002, "epoch": 2.203166226912929, "step": 1670}, {"loss": 1.0206, "grad_norm": 0.4489196836948395, "learning_rate": 0.0002, "epoch": 2.216358839050132, "step": 1680}, {"loss": 0.9657, "grad_norm": 0.49300864338874817, "learning_rate": 0.0002, "epoch": 2.229551451187335, "step": 1690}, {"loss": 0.8371, "grad_norm": 0.6624954342842102, "learning_rate": 0.0002, "epoch": 2.242744063324538, "step": 1700}, {"loss": 0.8555, "grad_norm": 0.8391500115394592, "learning_rate": 0.0002, "epoch": 2.2559366754617414, "step": 1710}, {"loss": 1.0113, "grad_norm": 0.5193073749542236, "learning_rate": 0.0002, "epoch": 2.2691292875989446, "step": 1720}, {"loss": 0.9979, "grad_norm": 0.6180613040924072, "learning_rate": 0.0002, "epoch": 2.282321899736148, "step": 1730}, {"loss": 0.9579, "grad_norm": 0.591191291809082, "learning_rate": 0.0002, "epoch": 2.295514511873351, "step": 1740}, {"loss": 0.8879, "grad_norm": 0.546897828578949, "learning_rate": 0.0002, "epoch": 2.308707124010554, "step": 1750}, {"loss": 0.9321, "grad_norm": 0.5470401644706726, "learning_rate": 0.0002, "epoch": 2.321899736147757, "step": 1760}, {"loss": 0.9104, "grad_norm": 0.4590282738208771, "learning_rate": 0.0002, "epoch": 2.3350923482849604, "step": 1770}, {"loss": 1.0384, "grad_norm": 0.622164785861969, "learning_rate": 0.0002, "epoch": 2.3482849604221636, "step": 1780}, {"loss": 1.0297, "grad_norm": 0.5753812193870544, "learning_rate": 0.0002, "epoch": 2.361477572559367, "step": 1790}, {"loss": 0.9569, "grad_norm": 0.47958624362945557, "learning_rate": 0.0002, "epoch": 2.37467018469657, "step": 1800}, {"loss": 1.0519, "grad_norm": 0.48042672872543335, "learning_rate": 0.0002, "epoch": 2.387862796833773, "step": 1810}, {"loss": 0.9042, "grad_norm": 0.44586366415023804, "learning_rate": 0.0002, "epoch": 2.401055408970976, "step": 1820}, {"loss": 0.9783, "grad_norm": 0.7239416837692261, "learning_rate": 0.0002, "epoch": 2.4142480211081794, "step": 1830}, {"loss": 0.8818, "grad_norm": 0.5515341758728027, "learning_rate": 0.0002, "epoch": 2.4274406332453826, "step": 1840}, {"loss": 0.9503, "grad_norm": 0.6280064582824707, "learning_rate": 0.0002, "epoch": 2.440633245382586, "step": 1850}, {"loss": 0.8943, "grad_norm": 0.4832057058811188, "learning_rate": 0.0002, "epoch": 2.453825857519789, "step": 1860}, {"loss": 0.8744, "grad_norm": 0.5789321064949036, "learning_rate": 0.0002, "epoch": 2.467018469656992, "step": 1870}, {"loss": 0.9332, "grad_norm": 0.48491886258125305, "learning_rate": 0.0002, "epoch": 2.480211081794195, "step": 1880}, {"loss": 0.943, "grad_norm": 0.532365620136261, "learning_rate": 0.0002, "epoch": 2.4934036939313984, "step": 1890}, {"loss": 0.9734, "grad_norm": 0.7087852954864502, "learning_rate": 0.0002, "epoch": 2.5065963060686016, "step": 1900}, {"loss": 0.9767, "grad_norm": 0.48157402873039246, "learning_rate": 0.0002, "epoch": 2.519788918205805, "step": 1910}, {"loss": 0.9851, "grad_norm": 0.5886041522026062, "learning_rate": 0.0002, "epoch": 2.532981530343008, "step": 1920}, {"loss": 1.0144, "grad_norm": 0.6332622766494751, "learning_rate": 0.0002, "epoch": 2.5461741424802113, "step": 1930}, {"loss": 0.9516, "grad_norm": 0.5463117957115173, "learning_rate": 0.0002, "epoch": 2.559366754617414, "step": 1940}, {"loss": 0.9373, "grad_norm": 0.5432228446006775, "learning_rate": 0.0002, "epoch": 2.5725593667546174, "step": 1950}, {"loss": 0.8974, "grad_norm": 0.5929186940193176, "learning_rate": 0.0002, "epoch": 2.5857519788918206, "step": 1960}, {"loss": 1.0062, "grad_norm": 0.5120641589164734, "learning_rate": 0.0002, "epoch": 2.598944591029024, "step": 1970}, {"loss": 0.9143, "grad_norm": 0.5372339487075806, "learning_rate": 0.0002, "epoch": 2.6121372031662267, "step": 1980}, {"loss": 1.0786, "grad_norm": 0.5519838929176331, "learning_rate": 0.0002, "epoch": 2.62532981530343, "step": 1990}, {"loss": 1.021, "grad_norm": 0.7304037809371948, "learning_rate": 0.0002, "epoch": 2.638522427440633, "step": 2000}, {"loss": 0.8708, "grad_norm": 0.6182340979576111, "learning_rate": 0.0002, "epoch": 2.6517150395778364, "step": 2010}, {"loss": 1.0501, "grad_norm": 0.4874444305896759, "learning_rate": 0.0002, "epoch": 2.6649076517150396, "step": 2020}, {"loss": 0.9612, "grad_norm": 0.5850239396095276, "learning_rate": 0.0002, "epoch": 2.678100263852243, "step": 2030}, {"loss": 0.9808, "grad_norm": 0.6495311856269836, "learning_rate": 0.0002, "epoch": 2.691292875989446, "step": 2040}, {"loss": 1.0187, "grad_norm": 1.002830147743225, "learning_rate": 0.0002, "epoch": 2.7044854881266494, "step": 2050}, {"loss": 1.0386, "grad_norm": 0.49076753854751587, "learning_rate": 0.0002, "epoch": 2.717678100263852, "step": 2060}, {"loss": 0.9416, "grad_norm": 0.4736326336860657, "learning_rate": 0.0002, "epoch": 2.7308707124010554, "step": 2070}, {"loss": 0.8424, "grad_norm": 0.5527601838111877, "learning_rate": 0.0002, "epoch": 2.7440633245382586, "step": 2080}, {"loss": 0.9149, "grad_norm": 0.7295718193054199, "learning_rate": 0.0002, "epoch": 2.757255936675462, "step": 2090}, {"loss": 0.9032, "grad_norm": 0.5437536835670471, "learning_rate": 0.0002, "epoch": 2.7704485488126647, "step": 2100}, {"loss": 0.8254, "grad_norm": 0.5997128486633301, "learning_rate": 0.0002, "epoch": 2.783641160949868, "step": 2110}, {"loss": 0.976, "grad_norm": 0.6498191356658936, "learning_rate": 0.0002, "epoch": 2.796833773087071, "step": 2120}, {"loss": 0.9543, "grad_norm": 0.5237268805503845, "learning_rate": 0.0002, "epoch": 2.8100263852242744, "step": 2130}, {"loss": 0.9302, "grad_norm": 0.6033027172088623, "learning_rate": 0.0002, "epoch": 2.8232189973614776, "step": 2140}, {"loss": 0.9625, "grad_norm": 0.6077138781547546, "learning_rate": 0.0002, "epoch": 2.836411609498681, "step": 2150}, {"loss": 0.9347, "grad_norm": 0.4127797484397888, "learning_rate": 0.0002, "epoch": 2.849604221635884, "step": 2160}, {"loss": 1.0459, "grad_norm": 0.8448635339736938, "learning_rate": 0.0002, "epoch": 2.862796833773087, "step": 2170}, {"loss": 0.8185, "grad_norm": 0.5669729113578796, "learning_rate": 0.0002, "epoch": 2.87598944591029, "step": 2180}, {"loss": 0.8555, "grad_norm": 0.510231077671051, "learning_rate": 0.0002, "epoch": 2.8891820580474934, "step": 2190}, {"loss": 0.9267, "grad_norm": 0.8072245121002197, "learning_rate": 0.0002, "epoch": 2.9023746701846966, "step": 2200}, {"loss": 0.9685, "grad_norm": 0.6055923104286194, "learning_rate": 0.0002, "epoch": 2.9155672823219, "step": 2210}, {"loss": 0.9157, "grad_norm": 0.7384416460990906, "learning_rate": 0.0002, "epoch": 2.9287598944591027, "step": 2220}, {"loss": 0.9712, "grad_norm": 0.4922751784324646, "learning_rate": 0.0002, "epoch": 2.941952506596306, "step": 2230}, {"loss": 0.9487, "grad_norm": 0.6039906740188599, "learning_rate": 0.0002, "epoch": 2.955145118733509, "step": 2240}, {"loss": 0.9776, "grad_norm": 0.4751701354980469, "learning_rate": 0.0002, "epoch": 2.9683377308707124, "step": 2250}, {"loss": 1.0619, "grad_norm": 0.5698353052139282, "learning_rate": 0.0002, "epoch": 2.9815303430079156, "step": 2260}, {"loss": 1.1184, "grad_norm": 0.893563449382782, "learning_rate": 0.0002, "epoch": 2.994722955145119, "step": 2270}, {"eval_loss": 1.2046419382095337, "eval_runtime": 71.5992, "eval_samples_per_second": 6.02, "eval_steps_per_second": 0.754, "epoch": 3.0, "step": 2274}, {"loss": 0.8269, "grad_norm": 0.41119325160980225, "learning_rate": 0.0002, "epoch": 3.007915567282322, "step": 2280}, {"loss": 0.7856, "grad_norm": 0.8169420957565308, "learning_rate": 0.0002, "epoch": 3.021108179419525, "step": 2290}, {"loss": 0.794, "grad_norm": 0.6033818125724792, "learning_rate": 0.0002, "epoch": 3.034300791556728, "step": 2300}, {"loss": 0.7607, "grad_norm": 0.9600058197975159, "learning_rate": 0.0002, "epoch": 3.0474934036939314, "step": 2310}, {"loss": 0.8353, "grad_norm": 0.5859250426292419, "learning_rate": 0.0002, "epoch": 3.0606860158311346, "step": 2320}, {"loss": 0.7598, "grad_norm": 0.6758618950843811, "learning_rate": 0.0002, "epoch": 3.073878627968338, "step": 2330}, {"loss": 0.7631, "grad_norm": 0.8407140970230103, "learning_rate": 0.0002, "epoch": 3.0870712401055407, "step": 2340}, {"loss": 0.7664, "grad_norm": 0.767779529094696, "learning_rate": 0.0002, "epoch": 3.100263852242744, "step": 2350}, {"loss": 0.7121, "grad_norm": 0.5572896599769592, "learning_rate": 0.0002, "epoch": 3.113456464379947, "step": 2360}, {"loss": 0.7419, "grad_norm": 0.5908368825912476, "learning_rate": 0.0002, "epoch": 3.1266490765171504, "step": 2370}, {"loss": 0.8024, "grad_norm": 0.8047826290130615, "learning_rate": 0.0002, "epoch": 3.1398416886543536, "step": 2380}, {"loss": 0.8686, "grad_norm": 0.8041718006134033, "learning_rate": 0.0002, "epoch": 3.153034300791557, "step": 2390}, {"loss": 0.668, "grad_norm": 0.57078617811203, "learning_rate": 0.0002, "epoch": 3.16622691292876, "step": 2400}, {"loss": 0.7976, "grad_norm": 0.5125322937965393, "learning_rate": 0.0002, "epoch": 3.179419525065963, "step": 2410}, {"loss": 0.741, "grad_norm": 0.6356934309005737, "learning_rate": 0.0002, "epoch": 3.192612137203166, "step": 2420}, {"loss": 0.687, "grad_norm": 1.0129680633544922, "learning_rate": 0.0002, "epoch": 3.2058047493403694, "step": 2430}, {"loss": 0.8316, "grad_norm": 0.8104226589202881, "learning_rate": 0.0002, "epoch": 3.2189973614775726, "step": 2440}, {"loss": 0.8343, "grad_norm": 0.7276079058647156, "learning_rate": 0.0002, "epoch": 3.232189973614776, "step": 2450}, {"loss": 0.8183, "grad_norm": 0.9753884077072144, "learning_rate": 0.0002, "epoch": 3.2453825857519787, "step": 2460}, {"loss": 0.7776, "grad_norm": 0.9753183722496033, "learning_rate": 0.0002, "epoch": 3.258575197889182, "step": 2470}, {"loss": 0.8815, "grad_norm": 0.6791225075721741, "learning_rate": 0.0002, "epoch": 3.271767810026385, "step": 2480}, {"loss": 0.7548, "grad_norm": 0.6797150373458862, "learning_rate": 0.0002, "epoch": 3.2849604221635884, "step": 2490}, {"loss": 0.8395, "grad_norm": 0.8107194900512695, "learning_rate": 0.0002, "epoch": 3.2981530343007917, "step": 2500}, {"loss": 0.7869, "grad_norm": 0.5878375172615051, "learning_rate": 0.0002, "epoch": 3.311345646437995, "step": 2510}, {"loss": 0.7992, "grad_norm": 0.5882975459098816, "learning_rate": 0.0002, "epoch": 3.324538258575198, "step": 2520}, {"loss": 0.7472, "grad_norm": 0.6180013418197632, "learning_rate": 0.0002, "epoch": 3.337730870712401, "step": 2530}, {"loss": 0.8033, "grad_norm": 1.0008151531219482, "learning_rate": 0.0002, "epoch": 3.350923482849604, "step": 2540}, {"loss": 0.8464, "grad_norm": 0.6404656767845154, "learning_rate": 0.0002, "epoch": 3.3641160949868074, "step": 2550}, {"loss": 0.7533, "grad_norm": 0.8481354117393494, "learning_rate": 0.0002, "epoch": 3.3773087071240107, "step": 2560}, {"loss": 0.7852, "grad_norm": 0.8068035244941711, "learning_rate": 0.0002, "epoch": 3.390501319261214, "step": 2570}, {"loss": 0.8621, "grad_norm": 0.7477166056632996, "learning_rate": 0.0002, "epoch": 3.4036939313984167, "step": 2580}, {"loss": 0.8352, "grad_norm": 0.6202635765075684, "learning_rate": 0.0002, "epoch": 3.41688654353562, "step": 2590}, {"loss": 0.7572, "grad_norm": 0.6981159448623657, "learning_rate": 0.0002, "epoch": 3.430079155672823, "step": 2600}, {"loss": 0.7846, "grad_norm": 0.6611084342002869, "learning_rate": 0.0002, "epoch": 3.4432717678100264, "step": 2610}, {"loss": 0.7503, "grad_norm": 0.5727696418762207, "learning_rate": 0.0002, "epoch": 3.4564643799472297, "step": 2620}, {"loss": 0.8427, "grad_norm": 1.2354545593261719, "learning_rate": 0.0002, "epoch": 3.469656992084433, "step": 2630}, {"loss": 0.7747, "grad_norm": 0.6347638368606567, "learning_rate": 0.0002, "epoch": 3.4828496042216357, "step": 2640}, {"loss": 0.8426, "grad_norm": 0.6975704431533813, "learning_rate": 0.0002, "epoch": 3.496042216358839, "step": 2650}, {"loss": 0.8773, "grad_norm": 0.6569573879241943, "learning_rate": 0.0002, "epoch": 3.509234828496042, "step": 2660}, {"loss": 0.7908, "grad_norm": 0.6979609131813049, "learning_rate": 0.0002, "epoch": 3.5224274406332454, "step": 2670}, {"loss": 0.8254, "grad_norm": 0.6287988424301147, "learning_rate": 0.0002, "epoch": 3.5356200527704487, "step": 2680}, {"loss": 0.7815, "grad_norm": 0.8682637214660645, "learning_rate": 0.0002, "epoch": 3.5488126649076515, "step": 2690}, {"loss": 0.7566, "grad_norm": 0.7062831521034241, "learning_rate": 0.0002, "epoch": 3.5620052770448547, "step": 2700}, {"loss": 0.713, "grad_norm": 1.0061452388763428, "learning_rate": 0.0002, "epoch": 3.575197889182058, "step": 2710}, {"loss": 0.7738, "grad_norm": 0.719097375869751, "learning_rate": 0.0002, "epoch": 3.588390501319261, "step": 2720}, {"loss": 0.8145, "grad_norm": 0.7583496570587158, "learning_rate": 0.0002, "epoch": 3.6015831134564644, "step": 2730}, {"loss": 0.91, "grad_norm": 0.7543531060218811, "learning_rate": 0.0002, "epoch": 3.6147757255936677, "step": 2740}, {"loss": 0.8325, "grad_norm": 0.8873646855354309, "learning_rate": 0.0002, "epoch": 3.627968337730871, "step": 2750}, {"loss": 0.7116, "grad_norm": 1.0657562017440796, "learning_rate": 0.0002, "epoch": 3.641160949868074, "step": 2760}, {"loss": 0.8291, "grad_norm": 0.8641113638877869, "learning_rate": 0.0002, "epoch": 3.654353562005277, "step": 2770}, {"loss": 0.8302, "grad_norm": 0.6620645523071289, "learning_rate": 0.0002, "epoch": 3.66754617414248, "step": 2780}, {"loss": 0.8261, "grad_norm": 0.6919541954994202, "learning_rate": 0.0002, "epoch": 3.6807387862796834, "step": 2790}, {"loss": 0.8388, "grad_norm": 0.7305743098258972, "learning_rate": 0.0002, "epoch": 3.6939313984168867, "step": 2800}, {"loss": 0.8053, "grad_norm": 0.7464777827262878, "learning_rate": 0.0002, "epoch": 3.7071240105540895, "step": 2810}, {"loss": 0.8019, "grad_norm": 0.8067063093185425, "learning_rate": 0.0002, "epoch": 3.7203166226912927, "step": 2820}, {"loss": 0.8259, "grad_norm": 0.7789416313171387, "learning_rate": 0.0002, "epoch": 3.733509234828496, "step": 2830}, {"loss": 0.774, "grad_norm": 0.507529079914093, "learning_rate": 0.0002, "epoch": 3.746701846965699, "step": 2840}, {"loss": 0.832, "grad_norm": 0.6509260535240173, "learning_rate": 0.0002, "epoch": 3.7598944591029024, "step": 2850}, {"loss": 0.8257, "grad_norm": 0.9141367673873901, "learning_rate": 0.0002, "epoch": 3.7730870712401057, "step": 2860}, {"loss": 0.9436, "grad_norm": 0.7852635979652405, "learning_rate": 0.0002, "epoch": 3.786279683377309, "step": 2870}, {"loss": 0.8842, "grad_norm": 0.5340318083763123, "learning_rate": 0.0002, "epoch": 3.7994722955145117, "step": 2880}, {"loss": 0.7468, "grad_norm": 0.6246042847633362, "learning_rate": 0.0002, "epoch": 3.812664907651715, "step": 2890}, {"loss": 0.8184, "grad_norm": 0.7064066529273987, "learning_rate": 0.0002, "epoch": 3.825857519788918, "step": 2900}, {"loss": 0.8515, "grad_norm": 0.6144065856933594, "learning_rate": 0.0002, "epoch": 3.8390501319261214, "step": 2910}, {"loss": 0.7484, "grad_norm": 0.5268424153327942, "learning_rate": 0.0002, "epoch": 3.8522427440633247, "step": 2920}, {"loss": 0.7594, "grad_norm": 0.9508116841316223, "learning_rate": 0.0002, "epoch": 3.8654353562005275, "step": 2930}, {"loss": 0.8437, "grad_norm": 0.9133715629577637, "learning_rate": 0.0002, "epoch": 3.8786279683377307, "step": 2940}, {"loss": 0.8611, "grad_norm": 1.0144646167755127, "learning_rate": 0.0002, "epoch": 3.891820580474934, "step": 2950}, {"loss": 0.8043, "grad_norm": 0.6397877931594849, "learning_rate": 0.0002, "epoch": 3.905013192612137, "step": 2960}, {"loss": 0.8285, "grad_norm": 0.734835147857666, "learning_rate": 0.0002, "epoch": 3.9182058047493404, "step": 2970}, {"loss": 0.7831, "grad_norm": 0.784853994846344, "learning_rate": 0.0002, "epoch": 3.9313984168865437, "step": 2980}, {"loss": 0.8148, "grad_norm": 0.805831789970398, "learning_rate": 0.0002, "epoch": 3.944591029023747, "step": 2990}, {"loss": 0.8252, "grad_norm": 0.6299595236778259, "learning_rate": 0.0002, "epoch": 3.9577836411609497, "step": 3000}, {"loss": 0.8244, "grad_norm": 0.6264058351516724, "learning_rate": 0.0002, "epoch": 3.970976253298153, "step": 3010}, {"loss": 0.8185, "grad_norm": 0.6419739723205566, "learning_rate": 0.0002, "epoch": 3.984168865435356, "step": 3020}, {"loss": 0.8174, "grad_norm": 0.7737036943435669, "learning_rate": 0.0002, "epoch": 3.9973614775725594, "step": 3030}]} +{"epoch": 5.0, "step": 3790, "epoch_duration": 2077.5867722034454, "total_accumulated_duration": 11089.828041553497, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.9466, "grad_norm": 0.7545632123947144, "learning_rate": 0.0002, "epoch": 0.013192612137203167, "step": 10}, {"loss": 1.4909, "grad_norm": 0.5787661075592041, "learning_rate": 0.0002, "epoch": 0.026385224274406333, "step": 20}, {"loss": 1.3906, "grad_norm": 0.8616093993186951, "learning_rate": 0.0002, "epoch": 0.0395778364116095, "step": 30}, {"loss": 1.4116, "grad_norm": 0.42088547348976135, "learning_rate": 0.0002, "epoch": 0.052770448548812667, "step": 40}, {"loss": 1.3446, "grad_norm": 0.47704678773880005, "learning_rate": 0.0002, "epoch": 0.06596306068601583, "step": 50}, {"loss": 1.2476, "grad_norm": 0.5763994455337524, "learning_rate": 0.0002, "epoch": 0.079155672823219, "step": 60}, {"loss": 1.2268, "grad_norm": 0.4579846262931824, "learning_rate": 0.0002, "epoch": 0.09234828496042216, "step": 70}, {"loss": 1.415, "grad_norm": 0.46623846888542175, "learning_rate": 0.0002, "epoch": 0.10554089709762533, "step": 80}, {"loss": 1.2849, "grad_norm": 0.4206956624984741, "learning_rate": 0.0002, "epoch": 0.11873350923482849, "step": 90}, {"loss": 1.1608, "grad_norm": 0.41896629333496094, "learning_rate": 0.0002, "epoch": 0.13192612137203166, "step": 100}, {"loss": 1.2617, "grad_norm": 0.3459089398384094, "learning_rate": 0.0002, "epoch": 0.14511873350923482, "step": 110}, {"loss": 1.2858, "grad_norm": 0.4587327837944031, "learning_rate": 0.0002, "epoch": 0.158311345646438, "step": 120}, {"loss": 1.2241, "grad_norm": 0.433525413274765, "learning_rate": 0.0002, "epoch": 0.17150395778364116, "step": 130}, {"loss": 1.2269, "grad_norm": 0.39253175258636475, "learning_rate": 0.0002, "epoch": 0.18469656992084432, "step": 140}, {"loss": 1.2224, "grad_norm": 0.3602290749549866, "learning_rate": 0.0002, "epoch": 0.19788918205804748, "step": 150}, {"loss": 1.2834, "grad_norm": 0.41160839796066284, "learning_rate": 0.0002, "epoch": 0.21108179419525067, "step": 160}, {"loss": 1.1986, "grad_norm": 0.7213630080223083, "learning_rate": 0.0002, "epoch": 0.22427440633245382, "step": 170}, {"loss": 1.2215, "grad_norm": 0.39086055755615234, "learning_rate": 0.0002, "epoch": 0.23746701846965698, "step": 180}, {"loss": 1.3315, "grad_norm": 0.4465520977973938, "learning_rate": 0.0002, "epoch": 0.25065963060686014, "step": 190}, {"loss": 1.3798, "grad_norm": 1.814679741859436, "learning_rate": 0.0002, "epoch": 0.2638522427440633, "step": 200}, {"loss": 1.2259, "grad_norm": 0.5026423931121826, "learning_rate": 0.0002, "epoch": 0.2770448548812665, "step": 210}, {"loss": 1.3306, "grad_norm": 0.4156292974948883, "learning_rate": 0.0002, "epoch": 0.29023746701846964, "step": 220}, {"loss": 1.266, "grad_norm": 0.40813493728637695, "learning_rate": 0.0002, "epoch": 0.3034300791556728, "step": 230}, {"loss": 1.1533, "grad_norm": 0.3304787874221802, "learning_rate": 0.0002, "epoch": 0.316622691292876, "step": 240}, {"loss": 1.3154, "grad_norm": 0.46139976382255554, "learning_rate": 0.0002, "epoch": 0.32981530343007914, "step": 250}, {"loss": 1.1365, "grad_norm": 0.37518271803855896, "learning_rate": 0.0002, "epoch": 0.34300791556728233, "step": 260}, {"loss": 1.2004, "grad_norm": 0.35586467385292053, "learning_rate": 0.0002, "epoch": 0.3562005277044855, "step": 270}, {"loss": 1.2609, "grad_norm": 0.32441186904907227, "learning_rate": 0.0002, "epoch": 0.36939313984168864, "step": 280}, {"loss": 1.212, "grad_norm": 0.3198683261871338, "learning_rate": 0.0002, "epoch": 0.38258575197889183, "step": 290}, {"loss": 1.2465, "grad_norm": 0.33663108944892883, "learning_rate": 0.0002, "epoch": 0.39577836411609496, "step": 300}, {"loss": 1.1702, "grad_norm": 0.3711244761943817, "learning_rate": 0.0002, "epoch": 0.40897097625329815, "step": 310}, {"loss": 1.0871, "grad_norm": 0.3209651708602905, "learning_rate": 0.0002, "epoch": 0.42216358839050133, "step": 320}, {"loss": 1.2728, "grad_norm": 0.5152716040611267, "learning_rate": 0.0002, "epoch": 0.43535620052770446, "step": 330}, {"loss": 1.1833, "grad_norm": 0.5431376695632935, "learning_rate": 0.0002, "epoch": 0.44854881266490765, "step": 340}, {"loss": 1.1344, "grad_norm": 0.3069997727870941, "learning_rate": 0.0002, "epoch": 0.46174142480211083, "step": 350}, {"loss": 1.2742, "grad_norm": 0.34260064363479614, "learning_rate": 0.0002, "epoch": 0.47493403693931396, "step": 360}, {"loss": 1.3161, "grad_norm": 0.345653235912323, "learning_rate": 0.0002, "epoch": 0.48812664907651715, "step": 370}, {"loss": 1.1552, "grad_norm": 0.46222734451293945, "learning_rate": 0.0002, "epoch": 0.5013192612137203, "step": 380}, {"loss": 1.0616, "grad_norm": 0.27301734685897827, "learning_rate": 0.0002, "epoch": 0.5145118733509235, "step": 390}, {"loss": 1.1944, "grad_norm": 0.29048439860343933, "learning_rate": 0.0002, "epoch": 0.5277044854881267, "step": 400}, {"loss": 1.0956, "grad_norm": 0.32927802205085754, "learning_rate": 0.0002, "epoch": 0.5408970976253298, "step": 410}, {"loss": 1.2362, "grad_norm": 0.3336397409439087, "learning_rate": 0.0002, "epoch": 0.554089709762533, "step": 420}, {"loss": 1.1445, "grad_norm": 0.4007597267627716, "learning_rate": 0.0002, "epoch": 0.5672823218997362, "step": 430}, {"loss": 1.2731, "grad_norm": 0.36144956946372986, "learning_rate": 0.0002, "epoch": 0.5804749340369393, "step": 440}, {"loss": 1.1604, "grad_norm": 0.6331009864807129, "learning_rate": 0.0002, "epoch": 0.5936675461741425, "step": 450}, {"loss": 1.1692, "grad_norm": 0.41469088196754456, "learning_rate": 0.0002, "epoch": 0.6068601583113457, "step": 460}, {"loss": 1.2372, "grad_norm": 0.4388185143470764, "learning_rate": 0.0002, "epoch": 0.6200527704485488, "step": 470}, {"loss": 1.2541, "grad_norm": 0.3738141655921936, "learning_rate": 0.0002, "epoch": 0.633245382585752, "step": 480}, {"loss": 1.1265, "grad_norm": 0.7212023138999939, "learning_rate": 0.0002, "epoch": 0.6464379947229552, "step": 490}, {"loss": 1.2786, "grad_norm": 0.2972351014614105, "learning_rate": 0.0002, "epoch": 0.6596306068601583, "step": 500}, {"loss": 1.1739, "grad_norm": 0.45293179154396057, "learning_rate": 0.0002, "epoch": 0.6728232189973615, "step": 510}, {"loss": 1.255, "grad_norm": 0.4319860637187958, "learning_rate": 0.0002, "epoch": 0.6860158311345647, "step": 520}, {"loss": 1.0959, "grad_norm": 0.3050215542316437, "learning_rate": 0.0002, "epoch": 0.6992084432717678, "step": 530}, {"loss": 1.1608, "grad_norm": 0.3552611172199249, "learning_rate": 0.0002, "epoch": 0.712401055408971, "step": 540}, {"loss": 1.2546, "grad_norm": 0.3631151020526886, "learning_rate": 0.0002, "epoch": 0.7255936675461742, "step": 550}, {"loss": 1.1371, "grad_norm": 0.28177931904792786, "learning_rate": 0.0002, "epoch": 0.7387862796833773, "step": 560}, {"loss": 1.2081, "grad_norm": 0.359764039516449, "learning_rate": 0.0002, "epoch": 0.7519788918205804, "step": 570}, {"loss": 1.1356, "grad_norm": 0.3970327377319336, "learning_rate": 0.0002, "epoch": 0.7651715039577837, "step": 580}, {"loss": 1.309, "grad_norm": 0.3541001081466675, "learning_rate": 0.0002, "epoch": 0.7783641160949868, "step": 590}, {"loss": 1.2061, "grad_norm": 0.3478573262691498, "learning_rate": 0.0002, "epoch": 0.7915567282321899, "step": 600}, {"loss": 1.1864, "grad_norm": 0.3900321424007416, "learning_rate": 0.0002, "epoch": 0.8047493403693932, "step": 610}, {"loss": 1.1358, "grad_norm": 0.3443238437175751, "learning_rate": 0.0002, "epoch": 0.8179419525065963, "step": 620}, {"loss": 1.3232, "grad_norm": 0.44238781929016113, "learning_rate": 0.0002, "epoch": 0.8311345646437994, "step": 630}, {"loss": 1.1247, "grad_norm": 0.36339467763900757, "learning_rate": 0.0002, "epoch": 0.8443271767810027, "step": 640}, {"loss": 1.1049, "grad_norm": 0.6243070363998413, "learning_rate": 0.0002, "epoch": 0.8575197889182058, "step": 650}, {"loss": 1.1943, "grad_norm": 0.3209173381328583, "learning_rate": 0.0002, "epoch": 0.8707124010554089, "step": 660}, {"loss": 1.1927, "grad_norm": 0.35017991065979004, "learning_rate": 0.0002, "epoch": 0.8839050131926122, "step": 670}, {"loss": 1.1603, "grad_norm": 0.3247159421443939, "learning_rate": 0.0002, "epoch": 0.8970976253298153, "step": 680}, {"loss": 1.2298, "grad_norm": 0.4091894030570984, "learning_rate": 0.0002, "epoch": 0.9102902374670184, "step": 690}, {"loss": 1.2756, "grad_norm": 0.3975585997104645, "learning_rate": 0.0002, "epoch": 0.9234828496042217, "step": 700}, {"loss": 1.281, "grad_norm": 0.3666245937347412, "learning_rate": 0.0002, "epoch": 0.9366754617414248, "step": 710}, {"loss": 1.2855, "grad_norm": 0.45216917991638184, "learning_rate": 0.0002, "epoch": 0.9498680738786279, "step": 720}, {"loss": 1.2217, "grad_norm": 0.36108118295669556, "learning_rate": 0.0002, "epoch": 0.9630606860158312, "step": 730}, {"loss": 1.17, "grad_norm": 0.44550251960754395, "learning_rate": 0.0002, "epoch": 0.9762532981530343, "step": 740}, {"loss": 1.0672, "grad_norm": 0.29801255464553833, "learning_rate": 0.0002, "epoch": 0.9894459102902374, "step": 750}, {"eval_loss": 1.203244686126709, "eval_runtime": 76.0457, "eval_samples_per_second": 5.668, "eval_steps_per_second": 0.71, "epoch": 1.0, "step": 758}, {"loss": 1.0748, "grad_norm": 0.4096551239490509, "learning_rate": 0.0002, "epoch": 1.0026385224274406, "step": 760}, {"loss": 1.1537, "grad_norm": 0.2649582326412201, "learning_rate": 0.0002, "epoch": 1.0158311345646438, "step": 770}, {"loss": 1.1676, "grad_norm": 0.3100722134113312, "learning_rate": 0.0002, "epoch": 1.029023746701847, "step": 780}, {"loss": 1.1736, "grad_norm": 0.3911755383014679, "learning_rate": 0.0002, "epoch": 1.04221635883905, "step": 790}, {"loss": 1.0354, "grad_norm": 0.4600953757762909, "learning_rate": 0.0002, "epoch": 1.0554089709762533, "step": 800}, {"loss": 1.2869, "grad_norm": 0.28671619296073914, "learning_rate": 0.0002, "epoch": 1.0686015831134565, "step": 810}, {"loss": 1.1441, "grad_norm": 0.47282642126083374, "learning_rate": 0.0002, "epoch": 1.0817941952506596, "step": 820}, {"loss": 1.1368, "grad_norm": 0.690073549747467, "learning_rate": 0.0002, "epoch": 1.0949868073878628, "step": 830}, {"loss": 0.9944, "grad_norm": 0.7317902445793152, "learning_rate": 0.0002, "epoch": 1.108179419525066, "step": 840}, {"loss": 1.108, "grad_norm": 0.44215938448905945, "learning_rate": 0.0002, "epoch": 1.121372031662269, "step": 850}, {"loss": 1.0558, "grad_norm": 0.33875149488449097, "learning_rate": 0.0002, "epoch": 1.1345646437994723, "step": 860}, {"loss": 1.0471, "grad_norm": 0.3700002431869507, "learning_rate": 0.0002, "epoch": 1.1477572559366755, "step": 870}, {"loss": 1.0121, "grad_norm": 0.41173291206359863, "learning_rate": 0.0002, "epoch": 1.1609498680738786, "step": 880}, {"loss": 1.0312, "grad_norm": 0.5253589749336243, "learning_rate": 0.0002, "epoch": 1.1741424802110818, "step": 890}, {"loss": 1.2238, "grad_norm": 0.3912237286567688, "learning_rate": 0.0002, "epoch": 1.187335092348285, "step": 900}, {"loss": 0.9434, "grad_norm": 0.40990331768989563, "learning_rate": 0.0002, "epoch": 1.200527704485488, "step": 910}, {"loss": 1.0493, "grad_norm": 0.40377968549728394, "learning_rate": 0.0002, "epoch": 1.2137203166226913, "step": 920}, {"loss": 1.1496, "grad_norm": 0.4605846405029297, "learning_rate": 0.0002, "epoch": 1.2269129287598945, "step": 930}, {"loss": 1.1173, "grad_norm": 0.31564897298812866, "learning_rate": 0.0002, "epoch": 1.2401055408970976, "step": 940}, {"loss": 1.0547, "grad_norm": 0.39808550477027893, "learning_rate": 0.0002, "epoch": 1.2532981530343008, "step": 950}, {"loss": 1.1367, "grad_norm": 0.3762115240097046, "learning_rate": 0.0002, "epoch": 1.266490765171504, "step": 960}, {"loss": 1.1596, "grad_norm": 0.4174984097480774, "learning_rate": 0.0002, "epoch": 1.279683377308707, "step": 970}, {"loss": 1.1327, "grad_norm": 0.5263054966926575, "learning_rate": 0.0002, "epoch": 1.2928759894459103, "step": 980}, {"loss": 1.0339, "grad_norm": 0.41673699021339417, "learning_rate": 0.0002, "epoch": 1.3060686015831133, "step": 990}, {"loss": 1.1198, "grad_norm": 0.9613684415817261, "learning_rate": 0.0002, "epoch": 1.3192612137203166, "step": 1000}, {"loss": 1.0444, "grad_norm": 0.3690216839313507, "learning_rate": 0.0002, "epoch": 1.3324538258575198, "step": 1010}, {"loss": 1.0473, "grad_norm": 0.521821141242981, "learning_rate": 0.0002, "epoch": 1.345646437994723, "step": 1020}, {"loss": 1.1065, "grad_norm": 0.3353094160556793, "learning_rate": 0.0002, "epoch": 1.358839050131926, "step": 1030}, {"loss": 1.1286, "grad_norm": 0.3843843936920166, "learning_rate": 0.0002, "epoch": 1.3720316622691293, "step": 1040}, {"loss": 1.1369, "grad_norm": 0.372514545917511, "learning_rate": 0.0002, "epoch": 1.3852242744063323, "step": 1050}, {"loss": 1.0041, "grad_norm": 0.34537771344184875, "learning_rate": 0.0002, "epoch": 1.3984168865435356, "step": 1060}, {"loss": 1.0251, "grad_norm": 0.45349085330963135, "learning_rate": 0.0002, "epoch": 1.4116094986807388, "step": 1070}, {"loss": 1.1873, "grad_norm": 0.5120177268981934, "learning_rate": 0.0002, "epoch": 1.424802110817942, "step": 1080}, {"loss": 1.0872, "grad_norm": 0.42800238728523254, "learning_rate": 0.0002, "epoch": 1.437994722955145, "step": 1090}, {"loss": 1.0734, "grad_norm": 0.343832790851593, "learning_rate": 0.0002, "epoch": 1.4511873350923483, "step": 1100}, {"loss": 1.1286, "grad_norm": 0.3829841911792755, "learning_rate": 0.0002, "epoch": 1.4643799472295513, "step": 1110}, {"loss": 1.1268, "grad_norm": 0.4289931058883667, "learning_rate": 0.0002, "epoch": 1.4775725593667546, "step": 1120}, {"loss": 1.0676, "grad_norm": 0.42750850319862366, "learning_rate": 0.0002, "epoch": 1.4907651715039578, "step": 1130}, {"loss": 1.072, "grad_norm": 0.34328413009643555, "learning_rate": 0.0002, "epoch": 1.503957783641161, "step": 1140}, {"loss": 1.0863, "grad_norm": 0.349096417427063, "learning_rate": 0.0002, "epoch": 1.517150395778364, "step": 1150}, {"loss": 1.1462, "grad_norm": 0.7700717449188232, "learning_rate": 0.0002, "epoch": 1.5303430079155673, "step": 1160}, {"loss": 0.9528, "grad_norm": 0.39294949173927307, "learning_rate": 0.0002, "epoch": 1.5435356200527703, "step": 1170}, {"loss": 1.1603, "grad_norm": 0.36173608899116516, "learning_rate": 0.0002, "epoch": 1.5567282321899736, "step": 1180}, {"loss": 1.1508, "grad_norm": 0.6034277677536011, "learning_rate": 0.0002, "epoch": 1.5699208443271768, "step": 1190}, {"loss": 1.0105, "grad_norm": 0.36694103479385376, "learning_rate": 0.0002, "epoch": 1.58311345646438, "step": 1200}, {"loss": 1.1479, "grad_norm": 0.4727209508419037, "learning_rate": 0.0002, "epoch": 1.596306068601583, "step": 1210}, {"loss": 1.0689, "grad_norm": 0.6482883095741272, "learning_rate": 0.0002, "epoch": 1.6094986807387863, "step": 1220}, {"loss": 1.1405, "grad_norm": 0.5238035917282104, "learning_rate": 0.0002, "epoch": 1.6226912928759893, "step": 1230}, {"loss": 1.0596, "grad_norm": 0.4812222421169281, "learning_rate": 0.0002, "epoch": 1.6358839050131926, "step": 1240}, {"loss": 1.2729, "grad_norm": 0.7131702303886414, "learning_rate": 0.0002, "epoch": 1.6490765171503958, "step": 1250}, {"loss": 0.9832, "grad_norm": 0.3803327977657318, "learning_rate": 0.0002, "epoch": 1.662269129287599, "step": 1260}, {"loss": 1.1433, "grad_norm": 0.3745088577270508, "learning_rate": 0.0002, "epoch": 1.675461741424802, "step": 1270}, {"loss": 1.1018, "grad_norm": 0.4427378475666046, "learning_rate": 0.0002, "epoch": 1.6886543535620053, "step": 1280}, {"loss": 1.0619, "grad_norm": 0.797478973865509, "learning_rate": 0.0002, "epoch": 1.7018469656992083, "step": 1290}, {"loss": 1.0853, "grad_norm": 0.503620982170105, "learning_rate": 0.0002, "epoch": 1.7150395778364116, "step": 1300}, {"loss": 1.1324, "grad_norm": 0.4132426381111145, "learning_rate": 0.0002, "epoch": 1.7282321899736148, "step": 1310}, {"loss": 1.172, "grad_norm": 0.41811656951904297, "learning_rate": 0.0002, "epoch": 1.741424802110818, "step": 1320}, {"loss": 1.0903, "grad_norm": 0.40647849440574646, "learning_rate": 0.0002, "epoch": 1.754617414248021, "step": 1330}, {"loss": 1.1316, "grad_norm": 0.42138347029685974, "learning_rate": 0.0002, "epoch": 1.767810026385224, "step": 1340}, {"loss": 0.9955, "grad_norm": 0.46523579955101013, "learning_rate": 0.0002, "epoch": 1.7810026385224274, "step": 1350}, {"loss": 1.0307, "grad_norm": 0.39760419726371765, "learning_rate": 0.0002, "epoch": 1.7941952506596306, "step": 1360}, {"loss": 1.1218, "grad_norm": 0.37993717193603516, "learning_rate": 0.0002, "epoch": 1.8073878627968338, "step": 1370}, {"loss": 1.0921, "grad_norm": 0.5404181480407715, "learning_rate": 0.0002, "epoch": 1.820580474934037, "step": 1380}, {"loss": 1.0903, "grad_norm": 0.4385245740413666, "learning_rate": 0.0002, "epoch": 1.83377308707124, "step": 1390}, {"loss": 1.1175, "grad_norm": 0.529797375202179, "learning_rate": 0.0002, "epoch": 1.8469656992084431, "step": 1400}, {"loss": 1.0577, "grad_norm": 0.481567919254303, "learning_rate": 0.0002, "epoch": 1.8601583113456464, "step": 1410}, {"loss": 1.0581, "grad_norm": 0.34787362813949585, "learning_rate": 0.0002, "epoch": 1.8733509234828496, "step": 1420}, {"loss": 1.0536, "grad_norm": 0.6402362585067749, "learning_rate": 0.0002, "epoch": 1.8865435356200528, "step": 1430}, {"loss": 1.0787, "grad_norm": 0.3461322784423828, "learning_rate": 0.0002, "epoch": 1.899736147757256, "step": 1440}, {"loss": 1.0925, "grad_norm": 0.44005653262138367, "learning_rate": 0.0002, "epoch": 1.912928759894459, "step": 1450}, {"loss": 1.0414, "grad_norm": 0.4064280688762665, "learning_rate": 0.0002, "epoch": 1.9261213720316621, "step": 1460}, {"loss": 1.0608, "grad_norm": 0.5236523151397705, "learning_rate": 0.0002, "epoch": 1.9393139841688654, "step": 1470}, {"loss": 1.0572, "grad_norm": 0.41030219197273254, "learning_rate": 0.0002, "epoch": 1.9525065963060686, "step": 1480}, {"loss": 1.1204, "grad_norm": 0.39805835485458374, "learning_rate": 0.0002, "epoch": 1.9656992084432718, "step": 1490}, {"loss": 1.0364, "grad_norm": 0.42974501848220825, "learning_rate": 0.0002, "epoch": 1.978891820580475, "step": 1500}, {"loss": 1.1169, "grad_norm": 0.4688243865966797, "learning_rate": 0.0002, "epoch": 1.992084432717678, "step": 1510}, {"eval_loss": 1.1874967813491821, "eval_runtime": 71.9523, "eval_samples_per_second": 5.99, "eval_steps_per_second": 0.75, "epoch": 2.0, "step": 1516}, {"loss": 1.0401, "grad_norm": 0.4121631383895874, "learning_rate": 0.0002, "epoch": 2.005277044854881, "step": 1520}, {"loss": 0.9384, "grad_norm": 0.4844197928905487, "learning_rate": 0.0002, "epoch": 2.0184696569920844, "step": 1530}, {"loss": 0.8686, "grad_norm": 0.45408546924591064, "learning_rate": 0.0002, "epoch": 2.0316622691292876, "step": 1540}, {"loss": 1.0399, "grad_norm": 0.48662951588630676, "learning_rate": 0.0002, "epoch": 2.044854881266491, "step": 1550}, {"loss": 0.936, "grad_norm": 0.7195899486541748, "learning_rate": 0.0002, "epoch": 2.058047493403694, "step": 1560}, {"loss": 0.9486, "grad_norm": 0.5071077346801758, "learning_rate": 0.0002, "epoch": 2.0712401055408973, "step": 1570}, {"loss": 1.0055, "grad_norm": 0.7473958730697632, "learning_rate": 0.0002, "epoch": 2.0844327176781, "step": 1580}, {"loss": 0.8309, "grad_norm": 0.5509232878684998, "learning_rate": 0.0002, "epoch": 2.0976253298153034, "step": 1590}, {"loss": 0.9181, "grad_norm": 0.5108042359352112, "learning_rate": 0.0002, "epoch": 2.1108179419525066, "step": 1600}, {"loss": 0.9499, "grad_norm": 0.42331448197364807, "learning_rate": 0.0002, "epoch": 2.12401055408971, "step": 1610}, {"loss": 0.9359, "grad_norm": 0.46621623635292053, "learning_rate": 0.0002, "epoch": 2.137203166226913, "step": 1620}, {"loss": 0.9065, "grad_norm": 0.43802836537361145, "learning_rate": 0.0002, "epoch": 2.150395778364116, "step": 1630}, {"loss": 0.9375, "grad_norm": 0.49908021092414856, "learning_rate": 0.0002, "epoch": 2.163588390501319, "step": 1640}, {"loss": 1.0389, "grad_norm": 0.4195636808872223, "learning_rate": 0.0002, "epoch": 2.1767810026385224, "step": 1650}, {"loss": 0.8501, "grad_norm": 0.49515822529792786, "learning_rate": 0.0002, "epoch": 2.1899736147757256, "step": 1660}, {"loss": 1.0557, "grad_norm": 0.4607589542865753, "learning_rate": 0.0002, "epoch": 2.203166226912929, "step": 1670}, {"loss": 1.0206, "grad_norm": 0.4489196836948395, "learning_rate": 0.0002, "epoch": 2.216358839050132, "step": 1680}, {"loss": 0.9657, "grad_norm": 0.49300864338874817, "learning_rate": 0.0002, "epoch": 2.229551451187335, "step": 1690}, {"loss": 0.8371, "grad_norm": 0.6624954342842102, "learning_rate": 0.0002, "epoch": 2.242744063324538, "step": 1700}, {"loss": 0.8555, "grad_norm": 0.8391500115394592, "learning_rate": 0.0002, "epoch": 2.2559366754617414, "step": 1710}, {"loss": 1.0113, "grad_norm": 0.5193073749542236, "learning_rate": 0.0002, "epoch": 2.2691292875989446, "step": 1720}, {"loss": 0.9979, "grad_norm": 0.6180613040924072, "learning_rate": 0.0002, "epoch": 2.282321899736148, "step": 1730}, {"loss": 0.9579, "grad_norm": 0.591191291809082, "learning_rate": 0.0002, "epoch": 2.295514511873351, "step": 1740}, {"loss": 0.8879, "grad_norm": 0.546897828578949, "learning_rate": 0.0002, "epoch": 2.308707124010554, "step": 1750}, {"loss": 0.9321, "grad_norm": 0.5470401644706726, "learning_rate": 0.0002, "epoch": 2.321899736147757, "step": 1760}, {"loss": 0.9104, "grad_norm": 0.4590282738208771, "learning_rate": 0.0002, "epoch": 2.3350923482849604, "step": 1770}, {"loss": 1.0384, "grad_norm": 0.622164785861969, "learning_rate": 0.0002, "epoch": 2.3482849604221636, "step": 1780}, {"loss": 1.0297, "grad_norm": 0.5753812193870544, "learning_rate": 0.0002, "epoch": 2.361477572559367, "step": 1790}, {"loss": 0.9569, "grad_norm": 0.47958624362945557, "learning_rate": 0.0002, "epoch": 2.37467018469657, "step": 1800}, {"loss": 1.0519, "grad_norm": 0.48042672872543335, "learning_rate": 0.0002, "epoch": 2.387862796833773, "step": 1810}, {"loss": 0.9042, "grad_norm": 0.44586366415023804, "learning_rate": 0.0002, "epoch": 2.401055408970976, "step": 1820}, {"loss": 0.9783, "grad_norm": 0.7239416837692261, "learning_rate": 0.0002, "epoch": 2.4142480211081794, "step": 1830}, {"loss": 0.8818, "grad_norm": 0.5515341758728027, "learning_rate": 0.0002, "epoch": 2.4274406332453826, "step": 1840}, {"loss": 0.9503, "grad_norm": 0.6280064582824707, "learning_rate": 0.0002, "epoch": 2.440633245382586, "step": 1850}, {"loss": 0.8943, "grad_norm": 0.4832057058811188, "learning_rate": 0.0002, "epoch": 2.453825857519789, "step": 1860}, {"loss": 0.8744, "grad_norm": 0.5789321064949036, "learning_rate": 0.0002, "epoch": 2.467018469656992, "step": 1870}, {"loss": 0.9332, "grad_norm": 0.48491886258125305, "learning_rate": 0.0002, "epoch": 2.480211081794195, "step": 1880}, {"loss": 0.943, "grad_norm": 0.532365620136261, "learning_rate": 0.0002, "epoch": 2.4934036939313984, "step": 1890}, {"loss": 0.9734, "grad_norm": 0.7087852954864502, "learning_rate": 0.0002, "epoch": 2.5065963060686016, "step": 1900}, {"loss": 0.9767, "grad_norm": 0.48157402873039246, "learning_rate": 0.0002, "epoch": 2.519788918205805, "step": 1910}, {"loss": 0.9851, "grad_norm": 0.5886041522026062, "learning_rate": 0.0002, "epoch": 2.532981530343008, "step": 1920}, {"loss": 1.0144, "grad_norm": 0.6332622766494751, "learning_rate": 0.0002, "epoch": 2.5461741424802113, "step": 1930}, {"loss": 0.9516, "grad_norm": 0.5463117957115173, "learning_rate": 0.0002, "epoch": 2.559366754617414, "step": 1940}, {"loss": 0.9373, "grad_norm": 0.5432228446006775, "learning_rate": 0.0002, "epoch": 2.5725593667546174, "step": 1950}, {"loss": 0.8974, "grad_norm": 0.5929186940193176, "learning_rate": 0.0002, "epoch": 2.5857519788918206, "step": 1960}, {"loss": 1.0062, "grad_norm": 0.5120641589164734, "learning_rate": 0.0002, "epoch": 2.598944591029024, "step": 1970}, {"loss": 0.9143, "grad_norm": 0.5372339487075806, "learning_rate": 0.0002, "epoch": 2.6121372031662267, "step": 1980}, {"loss": 1.0786, "grad_norm": 0.5519838929176331, "learning_rate": 0.0002, "epoch": 2.62532981530343, "step": 1990}, {"loss": 1.021, "grad_norm": 0.7304037809371948, "learning_rate": 0.0002, "epoch": 2.638522427440633, "step": 2000}, {"loss": 0.8708, "grad_norm": 0.6182340979576111, "learning_rate": 0.0002, "epoch": 2.6517150395778364, "step": 2010}, {"loss": 1.0501, "grad_norm": 0.4874444305896759, "learning_rate": 0.0002, "epoch": 2.6649076517150396, "step": 2020}, {"loss": 0.9612, "grad_norm": 0.5850239396095276, "learning_rate": 0.0002, "epoch": 2.678100263852243, "step": 2030}, {"loss": 0.9808, "grad_norm": 0.6495311856269836, "learning_rate": 0.0002, "epoch": 2.691292875989446, "step": 2040}, {"loss": 1.0187, "grad_norm": 1.002830147743225, "learning_rate": 0.0002, "epoch": 2.7044854881266494, "step": 2050}, {"loss": 1.0386, "grad_norm": 0.49076753854751587, "learning_rate": 0.0002, "epoch": 2.717678100263852, "step": 2060}, {"loss": 0.9416, "grad_norm": 0.4736326336860657, "learning_rate": 0.0002, "epoch": 2.7308707124010554, "step": 2070}, {"loss": 0.8424, "grad_norm": 0.5527601838111877, "learning_rate": 0.0002, "epoch": 2.7440633245382586, "step": 2080}, {"loss": 0.9149, "grad_norm": 0.7295718193054199, "learning_rate": 0.0002, "epoch": 2.757255936675462, "step": 2090}, {"loss": 0.9032, "grad_norm": 0.5437536835670471, "learning_rate": 0.0002, "epoch": 2.7704485488126647, "step": 2100}, {"loss": 0.8254, "grad_norm": 0.5997128486633301, "learning_rate": 0.0002, "epoch": 2.783641160949868, "step": 2110}, {"loss": 0.976, "grad_norm": 0.6498191356658936, "learning_rate": 0.0002, "epoch": 2.796833773087071, "step": 2120}, {"loss": 0.9543, "grad_norm": 0.5237268805503845, "learning_rate": 0.0002, "epoch": 2.8100263852242744, "step": 2130}, {"loss": 0.9302, "grad_norm": 0.6033027172088623, "learning_rate": 0.0002, "epoch": 2.8232189973614776, "step": 2140}, {"loss": 0.9625, "grad_norm": 0.6077138781547546, "learning_rate": 0.0002, "epoch": 2.836411609498681, "step": 2150}, {"loss": 0.9347, "grad_norm": 0.4127797484397888, "learning_rate": 0.0002, "epoch": 2.849604221635884, "step": 2160}, {"loss": 1.0459, "grad_norm": 0.8448635339736938, "learning_rate": 0.0002, "epoch": 2.862796833773087, "step": 2170}, {"loss": 0.8185, "grad_norm": 0.5669729113578796, "learning_rate": 0.0002, "epoch": 2.87598944591029, "step": 2180}, {"loss": 0.8555, "grad_norm": 0.510231077671051, "learning_rate": 0.0002, "epoch": 2.8891820580474934, "step": 2190}, {"loss": 0.9267, "grad_norm": 0.8072245121002197, "learning_rate": 0.0002, "epoch": 2.9023746701846966, "step": 2200}, {"loss": 0.9685, "grad_norm": 0.6055923104286194, "learning_rate": 0.0002, "epoch": 2.9155672823219, "step": 2210}, {"loss": 0.9157, "grad_norm": 0.7384416460990906, "learning_rate": 0.0002, "epoch": 2.9287598944591027, "step": 2220}, {"loss": 0.9712, "grad_norm": 0.4922751784324646, "learning_rate": 0.0002, "epoch": 2.941952506596306, "step": 2230}, {"loss": 0.9487, "grad_norm": 0.6039906740188599, "learning_rate": 0.0002, "epoch": 2.955145118733509, "step": 2240}, {"loss": 0.9776, "grad_norm": 0.4751701354980469, "learning_rate": 0.0002, "epoch": 2.9683377308707124, "step": 2250}, {"loss": 1.0619, "grad_norm": 0.5698353052139282, "learning_rate": 0.0002, "epoch": 2.9815303430079156, "step": 2260}, {"loss": 1.1184, "grad_norm": 0.893563449382782, "learning_rate": 0.0002, "epoch": 2.994722955145119, "step": 2270}, {"eval_loss": 1.2046419382095337, "eval_runtime": 71.5992, "eval_samples_per_second": 6.02, "eval_steps_per_second": 0.754, "epoch": 3.0, "step": 2274}, {"loss": 0.8269, "grad_norm": 0.41119325160980225, "learning_rate": 0.0002, "epoch": 3.007915567282322, "step": 2280}, {"loss": 0.7856, "grad_norm": 0.8169420957565308, "learning_rate": 0.0002, "epoch": 3.021108179419525, "step": 2290}, {"loss": 0.794, "grad_norm": 0.6033818125724792, "learning_rate": 0.0002, "epoch": 3.034300791556728, "step": 2300}, {"loss": 0.7607, "grad_norm": 0.9600058197975159, "learning_rate": 0.0002, "epoch": 3.0474934036939314, "step": 2310}, {"loss": 0.8353, "grad_norm": 0.5859250426292419, "learning_rate": 0.0002, "epoch": 3.0606860158311346, "step": 2320}, {"loss": 0.7598, "grad_norm": 0.6758618950843811, "learning_rate": 0.0002, "epoch": 3.073878627968338, "step": 2330}, {"loss": 0.7631, "grad_norm": 0.8407140970230103, "learning_rate": 0.0002, "epoch": 3.0870712401055407, "step": 2340}, {"loss": 0.7664, "grad_norm": 0.767779529094696, "learning_rate": 0.0002, "epoch": 3.100263852242744, "step": 2350}, {"loss": 0.7121, "grad_norm": 0.5572896599769592, "learning_rate": 0.0002, "epoch": 3.113456464379947, "step": 2360}, {"loss": 0.7419, "grad_norm": 0.5908368825912476, "learning_rate": 0.0002, "epoch": 3.1266490765171504, "step": 2370}, {"loss": 0.8024, "grad_norm": 0.8047826290130615, "learning_rate": 0.0002, "epoch": 3.1398416886543536, "step": 2380}, {"loss": 0.8686, "grad_norm": 0.8041718006134033, "learning_rate": 0.0002, "epoch": 3.153034300791557, "step": 2390}, {"loss": 0.668, "grad_norm": 0.57078617811203, "learning_rate": 0.0002, "epoch": 3.16622691292876, "step": 2400}, {"loss": 0.7976, "grad_norm": 0.5125322937965393, "learning_rate": 0.0002, "epoch": 3.179419525065963, "step": 2410}, {"loss": 0.741, "grad_norm": 0.6356934309005737, "learning_rate": 0.0002, "epoch": 3.192612137203166, "step": 2420}, {"loss": 0.687, "grad_norm": 1.0129680633544922, "learning_rate": 0.0002, "epoch": 3.2058047493403694, "step": 2430}, {"loss": 0.8316, "grad_norm": 0.8104226589202881, "learning_rate": 0.0002, "epoch": 3.2189973614775726, "step": 2440}, {"loss": 0.8343, "grad_norm": 0.7276079058647156, "learning_rate": 0.0002, "epoch": 3.232189973614776, "step": 2450}, {"loss": 0.8183, "grad_norm": 0.9753884077072144, "learning_rate": 0.0002, "epoch": 3.2453825857519787, "step": 2460}, {"loss": 0.7776, "grad_norm": 0.9753183722496033, "learning_rate": 0.0002, "epoch": 3.258575197889182, "step": 2470}, {"loss": 0.8815, "grad_norm": 0.6791225075721741, "learning_rate": 0.0002, "epoch": 3.271767810026385, "step": 2480}, {"loss": 0.7548, "grad_norm": 0.6797150373458862, "learning_rate": 0.0002, "epoch": 3.2849604221635884, "step": 2490}, {"loss": 0.8395, "grad_norm": 0.8107194900512695, "learning_rate": 0.0002, "epoch": 3.2981530343007917, "step": 2500}, {"loss": 0.7869, "grad_norm": 0.5878375172615051, "learning_rate": 0.0002, "epoch": 3.311345646437995, "step": 2510}, {"loss": 0.7992, "grad_norm": 0.5882975459098816, "learning_rate": 0.0002, "epoch": 3.324538258575198, "step": 2520}, {"loss": 0.7472, "grad_norm": 0.6180013418197632, "learning_rate": 0.0002, "epoch": 3.337730870712401, "step": 2530}, {"loss": 0.8033, "grad_norm": 1.0008151531219482, "learning_rate": 0.0002, "epoch": 3.350923482849604, "step": 2540}, {"loss": 0.8464, "grad_norm": 0.6404656767845154, "learning_rate": 0.0002, "epoch": 3.3641160949868074, "step": 2550}, {"loss": 0.7533, "grad_norm": 0.8481354117393494, "learning_rate": 0.0002, "epoch": 3.3773087071240107, "step": 2560}, {"loss": 0.7852, "grad_norm": 0.8068035244941711, "learning_rate": 0.0002, "epoch": 3.390501319261214, "step": 2570}, {"loss": 0.8621, "grad_norm": 0.7477166056632996, "learning_rate": 0.0002, "epoch": 3.4036939313984167, "step": 2580}, {"loss": 0.8352, "grad_norm": 0.6202635765075684, "learning_rate": 0.0002, "epoch": 3.41688654353562, "step": 2590}, {"loss": 0.7572, "grad_norm": 0.6981159448623657, "learning_rate": 0.0002, "epoch": 3.430079155672823, "step": 2600}, {"loss": 0.7846, "grad_norm": 0.6611084342002869, "learning_rate": 0.0002, "epoch": 3.4432717678100264, "step": 2610}, {"loss": 0.7503, "grad_norm": 0.5727696418762207, "learning_rate": 0.0002, "epoch": 3.4564643799472297, "step": 2620}, {"loss": 0.8427, "grad_norm": 1.2354545593261719, "learning_rate": 0.0002, "epoch": 3.469656992084433, "step": 2630}, {"loss": 0.7747, "grad_norm": 0.6347638368606567, "learning_rate": 0.0002, "epoch": 3.4828496042216357, "step": 2640}, {"loss": 0.8426, "grad_norm": 0.6975704431533813, "learning_rate": 0.0002, "epoch": 3.496042216358839, "step": 2650}, {"loss": 0.8773, "grad_norm": 0.6569573879241943, "learning_rate": 0.0002, "epoch": 3.509234828496042, "step": 2660}, {"loss": 0.7908, "grad_norm": 0.6979609131813049, "learning_rate": 0.0002, "epoch": 3.5224274406332454, "step": 2670}, {"loss": 0.8254, "grad_norm": 0.6287988424301147, "learning_rate": 0.0002, "epoch": 3.5356200527704487, "step": 2680}, {"loss": 0.7815, "grad_norm": 0.8682637214660645, "learning_rate": 0.0002, "epoch": 3.5488126649076515, "step": 2690}, {"loss": 0.7566, "grad_norm": 0.7062831521034241, "learning_rate": 0.0002, "epoch": 3.5620052770448547, "step": 2700}, {"loss": 0.713, "grad_norm": 1.0061452388763428, "learning_rate": 0.0002, "epoch": 3.575197889182058, "step": 2710}, {"loss": 0.7738, "grad_norm": 0.719097375869751, "learning_rate": 0.0002, "epoch": 3.588390501319261, "step": 2720}, {"loss": 0.8145, "grad_norm": 0.7583496570587158, "learning_rate": 0.0002, "epoch": 3.6015831134564644, "step": 2730}, {"loss": 0.91, "grad_norm": 0.7543531060218811, "learning_rate": 0.0002, "epoch": 3.6147757255936677, "step": 2740}, {"loss": 0.8325, "grad_norm": 0.8873646855354309, "learning_rate": 0.0002, "epoch": 3.627968337730871, "step": 2750}, {"loss": 0.7116, "grad_norm": 1.0657562017440796, "learning_rate": 0.0002, "epoch": 3.641160949868074, "step": 2760}, {"loss": 0.8291, "grad_norm": 0.8641113638877869, "learning_rate": 0.0002, "epoch": 3.654353562005277, "step": 2770}, {"loss": 0.8302, "grad_norm": 0.6620645523071289, "learning_rate": 0.0002, "epoch": 3.66754617414248, "step": 2780}, {"loss": 0.8261, "grad_norm": 0.6919541954994202, "learning_rate": 0.0002, "epoch": 3.6807387862796834, "step": 2790}, {"loss": 0.8388, "grad_norm": 0.7305743098258972, "learning_rate": 0.0002, "epoch": 3.6939313984168867, "step": 2800}, {"loss": 0.8053, "grad_norm": 0.7464777827262878, "learning_rate": 0.0002, "epoch": 3.7071240105540895, "step": 2810}, {"loss": 0.8019, "grad_norm": 0.8067063093185425, "learning_rate": 0.0002, "epoch": 3.7203166226912927, "step": 2820}, {"loss": 0.8259, "grad_norm": 0.7789416313171387, "learning_rate": 0.0002, "epoch": 3.733509234828496, "step": 2830}, {"loss": 0.774, "grad_norm": 0.507529079914093, "learning_rate": 0.0002, "epoch": 3.746701846965699, "step": 2840}, {"loss": 0.832, "grad_norm": 0.6509260535240173, "learning_rate": 0.0002, "epoch": 3.7598944591029024, "step": 2850}, {"loss": 0.8257, "grad_norm": 0.9141367673873901, "learning_rate": 0.0002, "epoch": 3.7730870712401057, "step": 2860}, {"loss": 0.9436, "grad_norm": 0.7852635979652405, "learning_rate": 0.0002, "epoch": 3.786279683377309, "step": 2870}, {"loss": 0.8842, "grad_norm": 0.5340318083763123, "learning_rate": 0.0002, "epoch": 3.7994722955145117, "step": 2880}, {"loss": 0.7468, "grad_norm": 0.6246042847633362, "learning_rate": 0.0002, "epoch": 3.812664907651715, "step": 2890}, {"loss": 0.8184, "grad_norm": 0.7064066529273987, "learning_rate": 0.0002, "epoch": 3.825857519788918, "step": 2900}, {"loss": 0.8515, "grad_norm": 0.6144065856933594, "learning_rate": 0.0002, "epoch": 3.8390501319261214, "step": 2910}, {"loss": 0.7484, "grad_norm": 0.5268424153327942, "learning_rate": 0.0002, "epoch": 3.8522427440633247, "step": 2920}, {"loss": 0.7594, "grad_norm": 0.9508116841316223, "learning_rate": 0.0002, "epoch": 3.8654353562005275, "step": 2930}, {"loss": 0.8437, "grad_norm": 0.9133715629577637, "learning_rate": 0.0002, "epoch": 3.8786279683377307, "step": 2940}, {"loss": 0.8611, "grad_norm": 1.0144646167755127, "learning_rate": 0.0002, "epoch": 3.891820580474934, "step": 2950}, {"loss": 0.8043, "grad_norm": 0.6397877931594849, "learning_rate": 0.0002, "epoch": 3.905013192612137, "step": 2960}, {"loss": 0.8285, "grad_norm": 0.734835147857666, "learning_rate": 0.0002, "epoch": 3.9182058047493404, "step": 2970}, {"loss": 0.7831, "grad_norm": 0.784853994846344, "learning_rate": 0.0002, "epoch": 3.9313984168865437, "step": 2980}, {"loss": 0.8148, "grad_norm": 0.805831789970398, "learning_rate": 0.0002, "epoch": 3.944591029023747, "step": 2990}, {"loss": 0.8252, "grad_norm": 0.6299595236778259, "learning_rate": 0.0002, "epoch": 3.9577836411609497, "step": 3000}, {"loss": 0.8244, "grad_norm": 0.6264058351516724, "learning_rate": 0.0002, "epoch": 3.970976253298153, "step": 3010}, {"loss": 0.8185, "grad_norm": 0.6419739723205566, "learning_rate": 0.0002, "epoch": 3.984168865435356, "step": 3020}, {"loss": 0.8174, "grad_norm": 0.7737036943435669, "learning_rate": 0.0002, "epoch": 3.9973614775725594, "step": 3030}, {"eval_loss": 1.2454297542572021, "eval_runtime": 71.8558, "eval_samples_per_second": 5.998, "eval_steps_per_second": 0.752, "epoch": 4.0, "step": 3032}, {"loss": 0.6716, "grad_norm": 1.092727541923523, "learning_rate": 0.0002, "epoch": 4.010554089709762, "step": 3040}, {"loss": 0.596, "grad_norm": 0.8087759613990784, "learning_rate": 0.0002, "epoch": 4.0237467018469655, "step": 3050}, {"loss": 0.7055, "grad_norm": 0.8106053471565247, "learning_rate": 0.0002, "epoch": 4.036939313984169, "step": 3060}, {"loss": 0.6846, "grad_norm": 0.8675326704978943, "learning_rate": 0.0002, "epoch": 4.050131926121372, "step": 3070}, {"loss": 0.6064, "grad_norm": 0.9620490074157715, "learning_rate": 0.0002, "epoch": 4.063324538258575, "step": 3080}, {"loss": 0.6047, "grad_norm": 0.8996296525001526, "learning_rate": 0.0002, "epoch": 4.076517150395778, "step": 3090}, {"loss": 0.6111, "grad_norm": 0.8648998737335205, "learning_rate": 0.0002, "epoch": 4.089709762532982, "step": 3100}, {"loss": 0.5853, "grad_norm": 1.0321335792541504, "learning_rate": 0.0002, "epoch": 4.102902374670185, "step": 3110}, {"loss": 0.6161, "grad_norm": 0.7949225306510925, "learning_rate": 0.0002, "epoch": 4.116094986807388, "step": 3120}, {"loss": 0.6354, "grad_norm": 0.9684646129608154, "learning_rate": 0.0002, "epoch": 4.129287598944591, "step": 3130}, {"loss": 0.6198, "grad_norm": 0.8698066473007202, "learning_rate": 0.0002, "epoch": 4.142480211081795, "step": 3140}, {"loss": 0.7185, "grad_norm": 0.7688450813293457, "learning_rate": 0.0002, "epoch": 4.155672823218997, "step": 3150}, {"loss": 0.6053, "grad_norm": 0.9682092070579529, "learning_rate": 0.0002, "epoch": 4.1688654353562, "step": 3160}, {"loss": 0.6827, "grad_norm": 0.961561918258667, "learning_rate": 0.0002, "epoch": 4.1820580474934035, "step": 3170}, {"loss": 0.6403, "grad_norm": 1.3962990045547485, "learning_rate": 0.0002, "epoch": 4.195250659630607, "step": 3180}, {"loss": 0.6319, "grad_norm": 0.9485045075416565, "learning_rate": 0.0002, "epoch": 4.20844327176781, "step": 3190}, {"loss": 0.5908, "grad_norm": 0.7768281698226929, "learning_rate": 0.0002, "epoch": 4.221635883905013, "step": 3200}, {"loss": 0.6365, "grad_norm": 1.2685691118240356, "learning_rate": 0.0002, "epoch": 4.2348284960422165, "step": 3210}, {"loss": 0.6601, "grad_norm": 0.6876471638679504, "learning_rate": 0.0002, "epoch": 4.24802110817942, "step": 3220}, {"loss": 0.6274, "grad_norm": 1.0074554681777954, "learning_rate": 0.0002, "epoch": 4.261213720316623, "step": 3230}, {"loss": 0.6027, "grad_norm": 0.8094777464866638, "learning_rate": 0.0002, "epoch": 4.274406332453826, "step": 3240}, {"loss": 0.643, "grad_norm": 0.7906569242477417, "learning_rate": 0.0002, "epoch": 4.287598944591029, "step": 3250}, {"loss": 0.5909, "grad_norm": 0.840238630771637, "learning_rate": 0.0002, "epoch": 4.300791556728232, "step": 3260}, {"loss": 0.5943, "grad_norm": 1.0119295120239258, "learning_rate": 0.0002, "epoch": 4.313984168865435, "step": 3270}, {"loss": 0.5912, "grad_norm": 0.7943191528320312, "learning_rate": 0.0002, "epoch": 4.327176781002638, "step": 3280}, {"loss": 0.6235, "grad_norm": 0.7691723704338074, "learning_rate": 0.0002, "epoch": 4.3403693931398415, "step": 3290}, {"loss": 0.6173, "grad_norm": 0.7227770686149597, "learning_rate": 0.0002, "epoch": 4.353562005277045, "step": 3300}, {"loss": 0.6047, "grad_norm": 0.8512253165245056, "learning_rate": 0.0002, "epoch": 4.366754617414248, "step": 3310}, {"loss": 0.5849, "grad_norm": 0.7852529287338257, "learning_rate": 0.0002, "epoch": 4.379947229551451, "step": 3320}, {"loss": 0.6416, "grad_norm": 0.8888797163963318, "learning_rate": 0.0002, "epoch": 4.3931398416886545, "step": 3330}, {"loss": 0.6804, "grad_norm": 0.9522430896759033, "learning_rate": 0.0002, "epoch": 4.406332453825858, "step": 3340}, {"loss": 0.6345, "grad_norm": 0.900276780128479, "learning_rate": 0.0002, "epoch": 4.419525065963061, "step": 3350}, {"loss": 0.7055, "grad_norm": 1.181547999382019, "learning_rate": 0.0002, "epoch": 4.432717678100264, "step": 3360}, {"loss": 0.7073, "grad_norm": 0.903142511844635, "learning_rate": 0.0002, "epoch": 4.445910290237467, "step": 3370}, {"loss": 0.7235, "grad_norm": 0.8747565150260925, "learning_rate": 0.0002, "epoch": 4.45910290237467, "step": 3380}, {"loss": 0.7071, "grad_norm": 0.7838051319122314, "learning_rate": 0.0002, "epoch": 4.472295514511873, "step": 3390}, {"loss": 0.5932, "grad_norm": 0.8691313862800598, "learning_rate": 0.0002, "epoch": 4.485488126649076, "step": 3400}, {"loss": 0.7019, "grad_norm": 0.8493868708610535, "learning_rate": 0.0002, "epoch": 4.4986807387862795, "step": 3410}, {"loss": 0.5959, "grad_norm": 1.0104830265045166, "learning_rate": 0.0002, "epoch": 4.511873350923483, "step": 3420}, {"loss": 0.6662, "grad_norm": 1.1716967821121216, "learning_rate": 0.0002, "epoch": 4.525065963060686, "step": 3430}, {"loss": 0.6411, "grad_norm": 0.9122593998908997, "learning_rate": 0.0002, "epoch": 4.538258575197889, "step": 3440}, {"loss": 0.7047, "grad_norm": 0.829090416431427, "learning_rate": 0.0002, "epoch": 4.5514511873350925, "step": 3450}, {"loss": 0.6001, "grad_norm": 1.141662836074829, "learning_rate": 0.0002, "epoch": 4.564643799472296, "step": 3460}, {"loss": 0.6612, "grad_norm": 0.8423182368278503, "learning_rate": 0.0002, "epoch": 4.577836411609499, "step": 3470}, {"loss": 0.6797, "grad_norm": 0.8024184703826904, "learning_rate": 0.0002, "epoch": 4.591029023746702, "step": 3480}, {"loss": 0.7184, "grad_norm": 0.7703381776809692, "learning_rate": 0.0002, "epoch": 4.6042216358839045, "step": 3490}, {"loss": 0.7001, "grad_norm": 0.9883959293365479, "learning_rate": 0.0002, "epoch": 4.617414248021108, "step": 3500}, {"loss": 0.6188, "grad_norm": 0.9554709196090698, "learning_rate": 0.0002, "epoch": 4.630606860158311, "step": 3510}, {"loss": 0.7378, "grad_norm": 1.9949709177017212, "learning_rate": 0.0002, "epoch": 4.643799472295514, "step": 3520}, {"loss": 0.6678, "grad_norm": 0.7762255072593689, "learning_rate": 0.0002, "epoch": 4.6569920844327175, "step": 3530}, {"loss": 0.6298, "grad_norm": 0.9538425803184509, "learning_rate": 0.0002, "epoch": 4.670184696569921, "step": 3540}, {"loss": 0.6352, "grad_norm": 1.0279661417007446, "learning_rate": 0.0002, "epoch": 4.683377308707124, "step": 3550}, {"loss": 0.6641, "grad_norm": 0.7545472979545593, "learning_rate": 0.0002, "epoch": 4.696569920844327, "step": 3560}, {"loss": 0.6887, "grad_norm": 0.8919376730918884, "learning_rate": 0.0002, "epoch": 4.7097625329815305, "step": 3570}, {"loss": 0.6395, "grad_norm": 0.7621569633483887, "learning_rate": 0.0002, "epoch": 4.722955145118734, "step": 3580}, {"loss": 0.6928, "grad_norm": 1.205320119857788, "learning_rate": 0.0002, "epoch": 4.736147757255937, "step": 3590}, {"loss": 0.6612, "grad_norm": 1.0642725229263306, "learning_rate": 0.0002, "epoch": 4.74934036939314, "step": 3600}, {"loss": 0.6541, "grad_norm": 0.9402666687965393, "learning_rate": 0.0002, "epoch": 4.762532981530343, "step": 3610}, {"loss": 0.6395, "grad_norm": 1.254127025604248, "learning_rate": 0.0002, "epoch": 4.775725593667546, "step": 3620}, {"loss": 0.692, "grad_norm": 0.7609598636627197, "learning_rate": 0.0002, "epoch": 4.788918205804749, "step": 3630}, {"loss": 0.6578, "grad_norm": 0.8240329623222351, "learning_rate": 0.0002, "epoch": 4.802110817941952, "step": 3640}, {"loss": 0.7383, "grad_norm": 0.8356260657310486, "learning_rate": 0.0002, "epoch": 4.8153034300791555, "step": 3650}, {"loss": 0.6368, "grad_norm": 0.9130708575248718, "learning_rate": 0.0002, "epoch": 4.828496042216359, "step": 3660}, {"loss": 0.7269, "grad_norm": 0.9384765028953552, "learning_rate": 0.0002, "epoch": 4.841688654353562, "step": 3670}, {"loss": 0.6509, "grad_norm": 0.9829966425895691, "learning_rate": 0.0002, "epoch": 4.854881266490765, "step": 3680}, {"loss": 0.6311, "grad_norm": 1.0488632917404175, "learning_rate": 0.0002, "epoch": 4.8680738786279685, "step": 3690}, {"loss": 0.7005, "grad_norm": 1.2278969287872314, "learning_rate": 0.0002, "epoch": 4.881266490765172, "step": 3700}, {"loss": 0.6869, "grad_norm": 0.8078970313072205, "learning_rate": 0.0002, "epoch": 4.894459102902375, "step": 3710}, {"loss": 0.6588, "grad_norm": 0.8081700205802917, "learning_rate": 0.0002, "epoch": 4.907651715039578, "step": 3720}, {"loss": 0.7189, "grad_norm": 0.9204511046409607, "learning_rate": 0.0002, "epoch": 4.9208443271767806, "step": 3730}, {"loss": 0.6953, "grad_norm": 0.9326391220092773, "learning_rate": 0.0002, "epoch": 4.934036939313984, "step": 3740}, {"loss": 0.68, "grad_norm": 1.0089969635009766, "learning_rate": 0.0002, "epoch": 4.947229551451187, "step": 3750}, {"loss": 0.7031, "grad_norm": 0.7063466906547546, "learning_rate": 0.0002, "epoch": 4.96042216358839, "step": 3760}, {"loss": 0.6568, "grad_norm": 1.2603905200958252, "learning_rate": 0.0002, "epoch": 4.9736147757255935, "step": 3770}, {"loss": 0.7134, "grad_norm": 0.8418653607368469, "learning_rate": 0.0002, "epoch": 4.986807387862797, "step": 3780}, {"loss": 0.6683, "grad_norm": 0.9537181854248047, "learning_rate": 0.0002, "epoch": 5.0, "step": 3790}]} +{"epoch": 6.0, "step": 4548, "epoch_duration": 2068.02756690979, "total_accumulated_duration": 13157.855608463287, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.9466, "grad_norm": 0.7545632123947144, "learning_rate": 0.0002, "epoch": 0.013192612137203167, "step": 10}, {"loss": 1.4909, "grad_norm": 0.5787661075592041, "learning_rate": 0.0002, "epoch": 0.026385224274406333, "step": 20}, {"loss": 1.3906, "grad_norm": 0.8616093993186951, "learning_rate": 0.0002, "epoch": 0.0395778364116095, "step": 30}, {"loss": 1.4116, "grad_norm": 0.42088547348976135, "learning_rate": 0.0002, "epoch": 0.052770448548812667, "step": 40}, {"loss": 1.3446, "grad_norm": 0.47704678773880005, "learning_rate": 0.0002, "epoch": 0.06596306068601583, "step": 50}, {"loss": 1.2476, "grad_norm": 0.5763994455337524, "learning_rate": 0.0002, "epoch": 0.079155672823219, "step": 60}, {"loss": 1.2268, "grad_norm": 0.4579846262931824, "learning_rate": 0.0002, "epoch": 0.09234828496042216, "step": 70}, {"loss": 1.415, "grad_norm": 0.46623846888542175, "learning_rate": 0.0002, "epoch": 0.10554089709762533, "step": 80}, {"loss": 1.2849, "grad_norm": 0.4206956624984741, "learning_rate": 0.0002, "epoch": 0.11873350923482849, "step": 90}, {"loss": 1.1608, "grad_norm": 0.41896629333496094, "learning_rate": 0.0002, "epoch": 0.13192612137203166, "step": 100}, {"loss": 1.2617, "grad_norm": 0.3459089398384094, "learning_rate": 0.0002, "epoch": 0.14511873350923482, "step": 110}, {"loss": 1.2858, "grad_norm": 0.4587327837944031, "learning_rate": 0.0002, "epoch": 0.158311345646438, "step": 120}, {"loss": 1.2241, "grad_norm": 0.433525413274765, "learning_rate": 0.0002, "epoch": 0.17150395778364116, "step": 130}, {"loss": 1.2269, "grad_norm": 0.39253175258636475, "learning_rate": 0.0002, "epoch": 0.18469656992084432, "step": 140}, {"loss": 1.2224, "grad_norm": 0.3602290749549866, "learning_rate": 0.0002, "epoch": 0.19788918205804748, "step": 150}, {"loss": 1.2834, "grad_norm": 0.41160839796066284, "learning_rate": 0.0002, "epoch": 0.21108179419525067, "step": 160}, {"loss": 1.1986, "grad_norm": 0.7213630080223083, "learning_rate": 0.0002, "epoch": 0.22427440633245382, "step": 170}, {"loss": 1.2215, "grad_norm": 0.39086055755615234, "learning_rate": 0.0002, "epoch": 0.23746701846965698, "step": 180}, {"loss": 1.3315, "grad_norm": 0.4465520977973938, "learning_rate": 0.0002, "epoch": 0.25065963060686014, "step": 190}, {"loss": 1.3798, "grad_norm": 1.814679741859436, "learning_rate": 0.0002, "epoch": 0.2638522427440633, "step": 200}, {"loss": 1.2259, "grad_norm": 0.5026423931121826, "learning_rate": 0.0002, "epoch": 0.2770448548812665, "step": 210}, {"loss": 1.3306, "grad_norm": 0.4156292974948883, "learning_rate": 0.0002, "epoch": 0.29023746701846964, "step": 220}, {"loss": 1.266, "grad_norm": 0.40813493728637695, "learning_rate": 0.0002, "epoch": 0.3034300791556728, "step": 230}, {"loss": 1.1533, "grad_norm": 0.3304787874221802, "learning_rate": 0.0002, "epoch": 0.316622691292876, "step": 240}, {"loss": 1.3154, "grad_norm": 0.46139976382255554, "learning_rate": 0.0002, "epoch": 0.32981530343007914, "step": 250}, {"loss": 1.1365, "grad_norm": 0.37518271803855896, "learning_rate": 0.0002, "epoch": 0.34300791556728233, "step": 260}, {"loss": 1.2004, "grad_norm": 0.35586467385292053, "learning_rate": 0.0002, "epoch": 0.3562005277044855, "step": 270}, {"loss": 1.2609, "grad_norm": 0.32441186904907227, "learning_rate": 0.0002, "epoch": 0.36939313984168864, "step": 280}, {"loss": 1.212, "grad_norm": 0.3198683261871338, "learning_rate": 0.0002, "epoch": 0.38258575197889183, "step": 290}, {"loss": 1.2465, "grad_norm": 0.33663108944892883, "learning_rate": 0.0002, "epoch": 0.39577836411609496, "step": 300}, {"loss": 1.1702, "grad_norm": 0.3711244761943817, "learning_rate": 0.0002, "epoch": 0.40897097625329815, "step": 310}, {"loss": 1.0871, "grad_norm": 0.3209651708602905, "learning_rate": 0.0002, "epoch": 0.42216358839050133, "step": 320}, {"loss": 1.2728, "grad_norm": 0.5152716040611267, "learning_rate": 0.0002, "epoch": 0.43535620052770446, "step": 330}, {"loss": 1.1833, "grad_norm": 0.5431376695632935, "learning_rate": 0.0002, "epoch": 0.44854881266490765, "step": 340}, {"loss": 1.1344, "grad_norm": 0.3069997727870941, "learning_rate": 0.0002, "epoch": 0.46174142480211083, "step": 350}, {"loss": 1.2742, "grad_norm": 0.34260064363479614, "learning_rate": 0.0002, "epoch": 0.47493403693931396, "step": 360}, {"loss": 1.3161, "grad_norm": 0.345653235912323, "learning_rate": 0.0002, "epoch": 0.48812664907651715, "step": 370}, {"loss": 1.1552, "grad_norm": 0.46222734451293945, "learning_rate": 0.0002, "epoch": 0.5013192612137203, "step": 380}, {"loss": 1.0616, "grad_norm": 0.27301734685897827, "learning_rate": 0.0002, "epoch": 0.5145118733509235, "step": 390}, {"loss": 1.1944, "grad_norm": 0.29048439860343933, "learning_rate": 0.0002, "epoch": 0.5277044854881267, "step": 400}, {"loss": 1.0956, "grad_norm": 0.32927802205085754, "learning_rate": 0.0002, "epoch": 0.5408970976253298, "step": 410}, {"loss": 1.2362, "grad_norm": 0.3336397409439087, "learning_rate": 0.0002, "epoch": 0.554089709762533, "step": 420}, {"loss": 1.1445, "grad_norm": 0.4007597267627716, "learning_rate": 0.0002, "epoch": 0.5672823218997362, "step": 430}, {"loss": 1.2731, "grad_norm": 0.36144956946372986, "learning_rate": 0.0002, "epoch": 0.5804749340369393, "step": 440}, {"loss": 1.1604, "grad_norm": 0.6331009864807129, "learning_rate": 0.0002, "epoch": 0.5936675461741425, "step": 450}, {"loss": 1.1692, "grad_norm": 0.41469088196754456, "learning_rate": 0.0002, "epoch": 0.6068601583113457, "step": 460}, {"loss": 1.2372, "grad_norm": 0.4388185143470764, "learning_rate": 0.0002, "epoch": 0.6200527704485488, "step": 470}, {"loss": 1.2541, "grad_norm": 0.3738141655921936, "learning_rate": 0.0002, "epoch": 0.633245382585752, "step": 480}, {"loss": 1.1265, "grad_norm": 0.7212023138999939, "learning_rate": 0.0002, "epoch": 0.6464379947229552, "step": 490}, {"loss": 1.2786, "grad_norm": 0.2972351014614105, "learning_rate": 0.0002, "epoch": 0.6596306068601583, "step": 500}, {"loss": 1.1739, "grad_norm": 0.45293179154396057, "learning_rate": 0.0002, "epoch": 0.6728232189973615, "step": 510}, {"loss": 1.255, "grad_norm": 0.4319860637187958, "learning_rate": 0.0002, "epoch": 0.6860158311345647, "step": 520}, {"loss": 1.0959, "grad_norm": 0.3050215542316437, "learning_rate": 0.0002, "epoch": 0.6992084432717678, "step": 530}, {"loss": 1.1608, "grad_norm": 0.3552611172199249, "learning_rate": 0.0002, "epoch": 0.712401055408971, "step": 540}, {"loss": 1.2546, "grad_norm": 0.3631151020526886, "learning_rate": 0.0002, "epoch": 0.7255936675461742, "step": 550}, {"loss": 1.1371, "grad_norm": 0.28177931904792786, "learning_rate": 0.0002, "epoch": 0.7387862796833773, "step": 560}, {"loss": 1.2081, "grad_norm": 0.359764039516449, "learning_rate": 0.0002, "epoch": 0.7519788918205804, "step": 570}, {"loss": 1.1356, "grad_norm": 0.3970327377319336, "learning_rate": 0.0002, "epoch": 0.7651715039577837, "step": 580}, {"loss": 1.309, "grad_norm": 0.3541001081466675, "learning_rate": 0.0002, "epoch": 0.7783641160949868, "step": 590}, {"loss": 1.2061, "grad_norm": 0.3478573262691498, "learning_rate": 0.0002, "epoch": 0.7915567282321899, "step": 600}, {"loss": 1.1864, "grad_norm": 0.3900321424007416, "learning_rate": 0.0002, "epoch": 0.8047493403693932, "step": 610}, {"loss": 1.1358, "grad_norm": 0.3443238437175751, "learning_rate": 0.0002, "epoch": 0.8179419525065963, "step": 620}, {"loss": 1.3232, "grad_norm": 0.44238781929016113, "learning_rate": 0.0002, "epoch": 0.8311345646437994, "step": 630}, {"loss": 1.1247, "grad_norm": 0.36339467763900757, "learning_rate": 0.0002, "epoch": 0.8443271767810027, "step": 640}, {"loss": 1.1049, "grad_norm": 0.6243070363998413, "learning_rate": 0.0002, "epoch": 0.8575197889182058, "step": 650}, {"loss": 1.1943, "grad_norm": 0.3209173381328583, "learning_rate": 0.0002, "epoch": 0.8707124010554089, "step": 660}, {"loss": 1.1927, "grad_norm": 0.35017991065979004, "learning_rate": 0.0002, "epoch": 0.8839050131926122, "step": 670}, {"loss": 1.1603, "grad_norm": 0.3247159421443939, "learning_rate": 0.0002, "epoch": 0.8970976253298153, "step": 680}, {"loss": 1.2298, "grad_norm": 0.4091894030570984, "learning_rate": 0.0002, "epoch": 0.9102902374670184, "step": 690}, {"loss": 1.2756, "grad_norm": 0.3975585997104645, "learning_rate": 0.0002, "epoch": 0.9234828496042217, "step": 700}, {"loss": 1.281, "grad_norm": 0.3666245937347412, "learning_rate": 0.0002, "epoch": 0.9366754617414248, "step": 710}, {"loss": 1.2855, "grad_norm": 0.45216917991638184, "learning_rate": 0.0002, "epoch": 0.9498680738786279, "step": 720}, {"loss": 1.2217, "grad_norm": 0.36108118295669556, "learning_rate": 0.0002, "epoch": 0.9630606860158312, "step": 730}, {"loss": 1.17, "grad_norm": 0.44550251960754395, "learning_rate": 0.0002, "epoch": 0.9762532981530343, "step": 740}, {"loss": 1.0672, "grad_norm": 0.29801255464553833, "learning_rate": 0.0002, "epoch": 0.9894459102902374, "step": 750}, {"eval_loss": 1.203244686126709, "eval_runtime": 76.0457, "eval_samples_per_second": 5.668, "eval_steps_per_second": 0.71, "epoch": 1.0, "step": 758}, {"loss": 1.0748, "grad_norm": 0.4096551239490509, "learning_rate": 0.0002, "epoch": 1.0026385224274406, "step": 760}, {"loss": 1.1537, "grad_norm": 0.2649582326412201, "learning_rate": 0.0002, "epoch": 1.0158311345646438, "step": 770}, {"loss": 1.1676, "grad_norm": 0.3100722134113312, "learning_rate": 0.0002, "epoch": 1.029023746701847, "step": 780}, {"loss": 1.1736, "grad_norm": 0.3911755383014679, "learning_rate": 0.0002, "epoch": 1.04221635883905, "step": 790}, {"loss": 1.0354, "grad_norm": 0.4600953757762909, "learning_rate": 0.0002, "epoch": 1.0554089709762533, "step": 800}, {"loss": 1.2869, "grad_norm": 0.28671619296073914, "learning_rate": 0.0002, "epoch": 1.0686015831134565, "step": 810}, {"loss": 1.1441, "grad_norm": 0.47282642126083374, "learning_rate": 0.0002, "epoch": 1.0817941952506596, "step": 820}, {"loss": 1.1368, "grad_norm": 0.690073549747467, "learning_rate": 0.0002, "epoch": 1.0949868073878628, "step": 830}, {"loss": 0.9944, "grad_norm": 0.7317902445793152, "learning_rate": 0.0002, "epoch": 1.108179419525066, "step": 840}, {"loss": 1.108, "grad_norm": 0.44215938448905945, "learning_rate": 0.0002, "epoch": 1.121372031662269, "step": 850}, {"loss": 1.0558, "grad_norm": 0.33875149488449097, "learning_rate": 0.0002, "epoch": 1.1345646437994723, "step": 860}, {"loss": 1.0471, "grad_norm": 0.3700002431869507, "learning_rate": 0.0002, "epoch": 1.1477572559366755, "step": 870}, {"loss": 1.0121, "grad_norm": 0.41173291206359863, "learning_rate": 0.0002, "epoch": 1.1609498680738786, "step": 880}, {"loss": 1.0312, "grad_norm": 0.5253589749336243, "learning_rate": 0.0002, "epoch": 1.1741424802110818, "step": 890}, {"loss": 1.2238, "grad_norm": 0.3912237286567688, "learning_rate": 0.0002, "epoch": 1.187335092348285, "step": 900}, {"loss": 0.9434, "grad_norm": 0.40990331768989563, "learning_rate": 0.0002, "epoch": 1.200527704485488, "step": 910}, {"loss": 1.0493, "grad_norm": 0.40377968549728394, "learning_rate": 0.0002, "epoch": 1.2137203166226913, "step": 920}, {"loss": 1.1496, "grad_norm": 0.4605846405029297, "learning_rate": 0.0002, "epoch": 1.2269129287598945, "step": 930}, {"loss": 1.1173, "grad_norm": 0.31564897298812866, "learning_rate": 0.0002, "epoch": 1.2401055408970976, "step": 940}, {"loss": 1.0547, "grad_norm": 0.39808550477027893, "learning_rate": 0.0002, "epoch": 1.2532981530343008, "step": 950}, {"loss": 1.1367, "grad_norm": 0.3762115240097046, "learning_rate": 0.0002, "epoch": 1.266490765171504, "step": 960}, {"loss": 1.1596, "grad_norm": 0.4174984097480774, "learning_rate": 0.0002, "epoch": 1.279683377308707, "step": 970}, {"loss": 1.1327, "grad_norm": 0.5263054966926575, "learning_rate": 0.0002, "epoch": 1.2928759894459103, "step": 980}, {"loss": 1.0339, "grad_norm": 0.41673699021339417, "learning_rate": 0.0002, "epoch": 1.3060686015831133, "step": 990}, {"loss": 1.1198, "grad_norm": 0.9613684415817261, "learning_rate": 0.0002, "epoch": 1.3192612137203166, "step": 1000}, {"loss": 1.0444, "grad_norm": 0.3690216839313507, "learning_rate": 0.0002, "epoch": 1.3324538258575198, "step": 1010}, {"loss": 1.0473, "grad_norm": 0.521821141242981, "learning_rate": 0.0002, "epoch": 1.345646437994723, "step": 1020}, {"loss": 1.1065, "grad_norm": 0.3353094160556793, "learning_rate": 0.0002, "epoch": 1.358839050131926, "step": 1030}, {"loss": 1.1286, "grad_norm": 0.3843843936920166, "learning_rate": 0.0002, "epoch": 1.3720316622691293, "step": 1040}, {"loss": 1.1369, "grad_norm": 0.372514545917511, "learning_rate": 0.0002, "epoch": 1.3852242744063323, "step": 1050}, {"loss": 1.0041, "grad_norm": 0.34537771344184875, "learning_rate": 0.0002, "epoch": 1.3984168865435356, "step": 1060}, {"loss": 1.0251, "grad_norm": 0.45349085330963135, "learning_rate": 0.0002, "epoch": 1.4116094986807388, "step": 1070}, {"loss": 1.1873, "grad_norm": 0.5120177268981934, "learning_rate": 0.0002, "epoch": 1.424802110817942, "step": 1080}, {"loss": 1.0872, "grad_norm": 0.42800238728523254, "learning_rate": 0.0002, "epoch": 1.437994722955145, "step": 1090}, {"loss": 1.0734, "grad_norm": 0.343832790851593, "learning_rate": 0.0002, "epoch": 1.4511873350923483, "step": 1100}, {"loss": 1.1286, "grad_norm": 0.3829841911792755, "learning_rate": 0.0002, "epoch": 1.4643799472295513, "step": 1110}, {"loss": 1.1268, "grad_norm": 0.4289931058883667, "learning_rate": 0.0002, "epoch": 1.4775725593667546, "step": 1120}, {"loss": 1.0676, "grad_norm": 0.42750850319862366, "learning_rate": 0.0002, "epoch": 1.4907651715039578, "step": 1130}, {"loss": 1.072, "grad_norm": 0.34328413009643555, "learning_rate": 0.0002, "epoch": 1.503957783641161, "step": 1140}, {"loss": 1.0863, "grad_norm": 0.349096417427063, "learning_rate": 0.0002, "epoch": 1.517150395778364, "step": 1150}, {"loss": 1.1462, "grad_norm": 0.7700717449188232, "learning_rate": 0.0002, "epoch": 1.5303430079155673, "step": 1160}, {"loss": 0.9528, "grad_norm": 0.39294949173927307, "learning_rate": 0.0002, "epoch": 1.5435356200527703, "step": 1170}, {"loss": 1.1603, "grad_norm": 0.36173608899116516, "learning_rate": 0.0002, "epoch": 1.5567282321899736, "step": 1180}, {"loss": 1.1508, "grad_norm": 0.6034277677536011, "learning_rate": 0.0002, "epoch": 1.5699208443271768, "step": 1190}, {"loss": 1.0105, "grad_norm": 0.36694103479385376, "learning_rate": 0.0002, "epoch": 1.58311345646438, "step": 1200}, {"loss": 1.1479, "grad_norm": 0.4727209508419037, "learning_rate": 0.0002, "epoch": 1.596306068601583, "step": 1210}, {"loss": 1.0689, "grad_norm": 0.6482883095741272, "learning_rate": 0.0002, "epoch": 1.6094986807387863, "step": 1220}, {"loss": 1.1405, "grad_norm": 0.5238035917282104, "learning_rate": 0.0002, "epoch": 1.6226912928759893, "step": 1230}, {"loss": 1.0596, "grad_norm": 0.4812222421169281, "learning_rate": 0.0002, "epoch": 1.6358839050131926, "step": 1240}, {"loss": 1.2729, "grad_norm": 0.7131702303886414, "learning_rate": 0.0002, "epoch": 1.6490765171503958, "step": 1250}, {"loss": 0.9832, "grad_norm": 0.3803327977657318, "learning_rate": 0.0002, "epoch": 1.662269129287599, "step": 1260}, {"loss": 1.1433, "grad_norm": 0.3745088577270508, "learning_rate": 0.0002, "epoch": 1.675461741424802, "step": 1270}, {"loss": 1.1018, "grad_norm": 0.4427378475666046, "learning_rate": 0.0002, "epoch": 1.6886543535620053, "step": 1280}, {"loss": 1.0619, "grad_norm": 0.797478973865509, "learning_rate": 0.0002, "epoch": 1.7018469656992083, "step": 1290}, {"loss": 1.0853, "grad_norm": 0.503620982170105, "learning_rate": 0.0002, "epoch": 1.7150395778364116, "step": 1300}, {"loss": 1.1324, "grad_norm": 0.4132426381111145, "learning_rate": 0.0002, "epoch": 1.7282321899736148, "step": 1310}, {"loss": 1.172, "grad_norm": 0.41811656951904297, "learning_rate": 0.0002, "epoch": 1.741424802110818, "step": 1320}, {"loss": 1.0903, "grad_norm": 0.40647849440574646, "learning_rate": 0.0002, "epoch": 1.754617414248021, "step": 1330}, {"loss": 1.1316, "grad_norm": 0.42138347029685974, "learning_rate": 0.0002, "epoch": 1.767810026385224, "step": 1340}, {"loss": 0.9955, "grad_norm": 0.46523579955101013, "learning_rate": 0.0002, "epoch": 1.7810026385224274, "step": 1350}, {"loss": 1.0307, "grad_norm": 0.39760419726371765, "learning_rate": 0.0002, "epoch": 1.7941952506596306, "step": 1360}, {"loss": 1.1218, "grad_norm": 0.37993717193603516, "learning_rate": 0.0002, "epoch": 1.8073878627968338, "step": 1370}, {"loss": 1.0921, "grad_norm": 0.5404181480407715, "learning_rate": 0.0002, "epoch": 1.820580474934037, "step": 1380}, {"loss": 1.0903, "grad_norm": 0.4385245740413666, "learning_rate": 0.0002, "epoch": 1.83377308707124, "step": 1390}, {"loss": 1.1175, "grad_norm": 0.529797375202179, "learning_rate": 0.0002, "epoch": 1.8469656992084431, "step": 1400}, {"loss": 1.0577, "grad_norm": 0.481567919254303, "learning_rate": 0.0002, "epoch": 1.8601583113456464, "step": 1410}, {"loss": 1.0581, "grad_norm": 0.34787362813949585, "learning_rate": 0.0002, "epoch": 1.8733509234828496, "step": 1420}, {"loss": 1.0536, "grad_norm": 0.6402362585067749, "learning_rate": 0.0002, "epoch": 1.8865435356200528, "step": 1430}, {"loss": 1.0787, "grad_norm": 0.3461322784423828, "learning_rate": 0.0002, "epoch": 1.899736147757256, "step": 1440}, {"loss": 1.0925, "grad_norm": 0.44005653262138367, "learning_rate": 0.0002, "epoch": 1.912928759894459, "step": 1450}, {"loss": 1.0414, "grad_norm": 0.4064280688762665, "learning_rate": 0.0002, "epoch": 1.9261213720316621, "step": 1460}, {"loss": 1.0608, "grad_norm": 0.5236523151397705, "learning_rate": 0.0002, "epoch": 1.9393139841688654, "step": 1470}, {"loss": 1.0572, "grad_norm": 0.41030219197273254, "learning_rate": 0.0002, "epoch": 1.9525065963060686, "step": 1480}, {"loss": 1.1204, "grad_norm": 0.39805835485458374, "learning_rate": 0.0002, "epoch": 1.9656992084432718, "step": 1490}, {"loss": 1.0364, "grad_norm": 0.42974501848220825, "learning_rate": 0.0002, "epoch": 1.978891820580475, "step": 1500}, {"loss": 1.1169, "grad_norm": 0.4688243865966797, "learning_rate": 0.0002, "epoch": 1.992084432717678, "step": 1510}, {"eval_loss": 1.1874967813491821, "eval_runtime": 71.9523, "eval_samples_per_second": 5.99, "eval_steps_per_second": 0.75, "epoch": 2.0, "step": 1516}, {"loss": 1.0401, "grad_norm": 0.4121631383895874, "learning_rate": 0.0002, "epoch": 2.005277044854881, "step": 1520}, {"loss": 0.9384, "grad_norm": 0.4844197928905487, "learning_rate": 0.0002, "epoch": 2.0184696569920844, "step": 1530}, {"loss": 0.8686, "grad_norm": 0.45408546924591064, "learning_rate": 0.0002, "epoch": 2.0316622691292876, "step": 1540}, {"loss": 1.0399, "grad_norm": 0.48662951588630676, "learning_rate": 0.0002, "epoch": 2.044854881266491, "step": 1550}, {"loss": 0.936, "grad_norm": 0.7195899486541748, "learning_rate": 0.0002, "epoch": 2.058047493403694, "step": 1560}, {"loss": 0.9486, "grad_norm": 0.5071077346801758, "learning_rate": 0.0002, "epoch": 2.0712401055408973, "step": 1570}, {"loss": 1.0055, "grad_norm": 0.7473958730697632, "learning_rate": 0.0002, "epoch": 2.0844327176781, "step": 1580}, {"loss": 0.8309, "grad_norm": 0.5509232878684998, "learning_rate": 0.0002, "epoch": 2.0976253298153034, "step": 1590}, {"loss": 0.9181, "grad_norm": 0.5108042359352112, "learning_rate": 0.0002, "epoch": 2.1108179419525066, "step": 1600}, {"loss": 0.9499, "grad_norm": 0.42331448197364807, "learning_rate": 0.0002, "epoch": 2.12401055408971, "step": 1610}, {"loss": 0.9359, "grad_norm": 0.46621623635292053, "learning_rate": 0.0002, "epoch": 2.137203166226913, "step": 1620}, {"loss": 0.9065, "grad_norm": 0.43802836537361145, "learning_rate": 0.0002, "epoch": 2.150395778364116, "step": 1630}, {"loss": 0.9375, "grad_norm": 0.49908021092414856, "learning_rate": 0.0002, "epoch": 2.163588390501319, "step": 1640}, {"loss": 1.0389, "grad_norm": 0.4195636808872223, "learning_rate": 0.0002, "epoch": 2.1767810026385224, "step": 1650}, {"loss": 0.8501, "grad_norm": 0.49515822529792786, "learning_rate": 0.0002, "epoch": 2.1899736147757256, "step": 1660}, {"loss": 1.0557, "grad_norm": 0.4607589542865753, "learning_rate": 0.0002, "epoch": 2.203166226912929, "step": 1670}, {"loss": 1.0206, "grad_norm": 0.4489196836948395, "learning_rate": 0.0002, "epoch": 2.216358839050132, "step": 1680}, {"loss": 0.9657, "grad_norm": 0.49300864338874817, "learning_rate": 0.0002, "epoch": 2.229551451187335, "step": 1690}, {"loss": 0.8371, "grad_norm": 0.6624954342842102, "learning_rate": 0.0002, "epoch": 2.242744063324538, "step": 1700}, {"loss": 0.8555, "grad_norm": 0.8391500115394592, "learning_rate": 0.0002, "epoch": 2.2559366754617414, "step": 1710}, {"loss": 1.0113, "grad_norm": 0.5193073749542236, "learning_rate": 0.0002, "epoch": 2.2691292875989446, "step": 1720}, {"loss": 0.9979, "grad_norm": 0.6180613040924072, "learning_rate": 0.0002, "epoch": 2.282321899736148, "step": 1730}, {"loss": 0.9579, "grad_norm": 0.591191291809082, "learning_rate": 0.0002, "epoch": 2.295514511873351, "step": 1740}, {"loss": 0.8879, "grad_norm": 0.546897828578949, "learning_rate": 0.0002, "epoch": 2.308707124010554, "step": 1750}, {"loss": 0.9321, "grad_norm": 0.5470401644706726, "learning_rate": 0.0002, "epoch": 2.321899736147757, "step": 1760}, {"loss": 0.9104, "grad_norm": 0.4590282738208771, "learning_rate": 0.0002, "epoch": 2.3350923482849604, "step": 1770}, {"loss": 1.0384, "grad_norm": 0.622164785861969, "learning_rate": 0.0002, "epoch": 2.3482849604221636, "step": 1780}, {"loss": 1.0297, "grad_norm": 0.5753812193870544, "learning_rate": 0.0002, "epoch": 2.361477572559367, "step": 1790}, {"loss": 0.9569, "grad_norm": 0.47958624362945557, "learning_rate": 0.0002, "epoch": 2.37467018469657, "step": 1800}, {"loss": 1.0519, "grad_norm": 0.48042672872543335, "learning_rate": 0.0002, "epoch": 2.387862796833773, "step": 1810}, {"loss": 0.9042, "grad_norm": 0.44586366415023804, "learning_rate": 0.0002, "epoch": 2.401055408970976, "step": 1820}, {"loss": 0.9783, "grad_norm": 0.7239416837692261, "learning_rate": 0.0002, "epoch": 2.4142480211081794, "step": 1830}, {"loss": 0.8818, "grad_norm": 0.5515341758728027, "learning_rate": 0.0002, "epoch": 2.4274406332453826, "step": 1840}, {"loss": 0.9503, "grad_norm": 0.6280064582824707, "learning_rate": 0.0002, "epoch": 2.440633245382586, "step": 1850}, {"loss": 0.8943, "grad_norm": 0.4832057058811188, "learning_rate": 0.0002, "epoch": 2.453825857519789, "step": 1860}, {"loss": 0.8744, "grad_norm": 0.5789321064949036, "learning_rate": 0.0002, "epoch": 2.467018469656992, "step": 1870}, {"loss": 0.9332, "grad_norm": 0.48491886258125305, "learning_rate": 0.0002, "epoch": 2.480211081794195, "step": 1880}, {"loss": 0.943, "grad_norm": 0.532365620136261, "learning_rate": 0.0002, "epoch": 2.4934036939313984, "step": 1890}, {"loss": 0.9734, "grad_norm": 0.7087852954864502, "learning_rate": 0.0002, "epoch": 2.5065963060686016, "step": 1900}, {"loss": 0.9767, "grad_norm": 0.48157402873039246, "learning_rate": 0.0002, "epoch": 2.519788918205805, "step": 1910}, {"loss": 0.9851, "grad_norm": 0.5886041522026062, "learning_rate": 0.0002, "epoch": 2.532981530343008, "step": 1920}, {"loss": 1.0144, "grad_norm": 0.6332622766494751, "learning_rate": 0.0002, "epoch": 2.5461741424802113, "step": 1930}, {"loss": 0.9516, "grad_norm": 0.5463117957115173, "learning_rate": 0.0002, "epoch": 2.559366754617414, "step": 1940}, {"loss": 0.9373, "grad_norm": 0.5432228446006775, "learning_rate": 0.0002, "epoch": 2.5725593667546174, "step": 1950}, {"loss": 0.8974, "grad_norm": 0.5929186940193176, "learning_rate": 0.0002, "epoch": 2.5857519788918206, "step": 1960}, {"loss": 1.0062, "grad_norm": 0.5120641589164734, "learning_rate": 0.0002, "epoch": 2.598944591029024, "step": 1970}, {"loss": 0.9143, "grad_norm": 0.5372339487075806, "learning_rate": 0.0002, "epoch": 2.6121372031662267, "step": 1980}, {"loss": 1.0786, "grad_norm": 0.5519838929176331, "learning_rate": 0.0002, "epoch": 2.62532981530343, "step": 1990}, {"loss": 1.021, "grad_norm": 0.7304037809371948, "learning_rate": 0.0002, "epoch": 2.638522427440633, "step": 2000}, {"loss": 0.8708, "grad_norm": 0.6182340979576111, "learning_rate": 0.0002, "epoch": 2.6517150395778364, "step": 2010}, {"loss": 1.0501, "grad_norm": 0.4874444305896759, "learning_rate": 0.0002, "epoch": 2.6649076517150396, "step": 2020}, {"loss": 0.9612, "grad_norm": 0.5850239396095276, "learning_rate": 0.0002, "epoch": 2.678100263852243, "step": 2030}, {"loss": 0.9808, "grad_norm": 0.6495311856269836, "learning_rate": 0.0002, "epoch": 2.691292875989446, "step": 2040}, {"loss": 1.0187, "grad_norm": 1.002830147743225, "learning_rate": 0.0002, "epoch": 2.7044854881266494, "step": 2050}, {"loss": 1.0386, "grad_norm": 0.49076753854751587, "learning_rate": 0.0002, "epoch": 2.717678100263852, "step": 2060}, {"loss": 0.9416, "grad_norm": 0.4736326336860657, "learning_rate": 0.0002, "epoch": 2.7308707124010554, "step": 2070}, {"loss": 0.8424, "grad_norm": 0.5527601838111877, "learning_rate": 0.0002, "epoch": 2.7440633245382586, "step": 2080}, {"loss": 0.9149, "grad_norm": 0.7295718193054199, "learning_rate": 0.0002, "epoch": 2.757255936675462, "step": 2090}, {"loss": 0.9032, "grad_norm": 0.5437536835670471, "learning_rate": 0.0002, "epoch": 2.7704485488126647, "step": 2100}, {"loss": 0.8254, "grad_norm": 0.5997128486633301, "learning_rate": 0.0002, "epoch": 2.783641160949868, "step": 2110}, {"loss": 0.976, "grad_norm": 0.6498191356658936, "learning_rate": 0.0002, "epoch": 2.796833773087071, "step": 2120}, {"loss": 0.9543, "grad_norm": 0.5237268805503845, "learning_rate": 0.0002, "epoch": 2.8100263852242744, "step": 2130}, {"loss": 0.9302, "grad_norm": 0.6033027172088623, "learning_rate": 0.0002, "epoch": 2.8232189973614776, "step": 2140}, {"loss": 0.9625, "grad_norm": 0.6077138781547546, "learning_rate": 0.0002, "epoch": 2.836411609498681, "step": 2150}, {"loss": 0.9347, "grad_norm": 0.4127797484397888, "learning_rate": 0.0002, "epoch": 2.849604221635884, "step": 2160}, {"loss": 1.0459, "grad_norm": 0.8448635339736938, "learning_rate": 0.0002, "epoch": 2.862796833773087, "step": 2170}, {"loss": 0.8185, "grad_norm": 0.5669729113578796, "learning_rate": 0.0002, "epoch": 2.87598944591029, "step": 2180}, {"loss": 0.8555, "grad_norm": 0.510231077671051, "learning_rate": 0.0002, "epoch": 2.8891820580474934, "step": 2190}, {"loss": 0.9267, "grad_norm": 0.8072245121002197, "learning_rate": 0.0002, "epoch": 2.9023746701846966, "step": 2200}, {"loss": 0.9685, "grad_norm": 0.6055923104286194, "learning_rate": 0.0002, "epoch": 2.9155672823219, "step": 2210}, {"loss": 0.9157, "grad_norm": 0.7384416460990906, "learning_rate": 0.0002, "epoch": 2.9287598944591027, "step": 2220}, {"loss": 0.9712, "grad_norm": 0.4922751784324646, "learning_rate": 0.0002, "epoch": 2.941952506596306, "step": 2230}, {"loss": 0.9487, "grad_norm": 0.6039906740188599, "learning_rate": 0.0002, "epoch": 2.955145118733509, "step": 2240}, {"loss": 0.9776, "grad_norm": 0.4751701354980469, "learning_rate": 0.0002, "epoch": 2.9683377308707124, "step": 2250}, {"loss": 1.0619, "grad_norm": 0.5698353052139282, "learning_rate": 0.0002, "epoch": 2.9815303430079156, "step": 2260}, {"loss": 1.1184, "grad_norm": 0.893563449382782, "learning_rate": 0.0002, "epoch": 2.994722955145119, "step": 2270}, {"eval_loss": 1.2046419382095337, "eval_runtime": 71.5992, "eval_samples_per_second": 6.02, "eval_steps_per_second": 0.754, "epoch": 3.0, "step": 2274}, {"loss": 0.8269, "grad_norm": 0.41119325160980225, "learning_rate": 0.0002, "epoch": 3.007915567282322, "step": 2280}, {"loss": 0.7856, "grad_norm": 0.8169420957565308, "learning_rate": 0.0002, "epoch": 3.021108179419525, "step": 2290}, {"loss": 0.794, "grad_norm": 0.6033818125724792, "learning_rate": 0.0002, "epoch": 3.034300791556728, "step": 2300}, {"loss": 0.7607, "grad_norm": 0.9600058197975159, "learning_rate": 0.0002, "epoch": 3.0474934036939314, "step": 2310}, {"loss": 0.8353, "grad_norm": 0.5859250426292419, "learning_rate": 0.0002, "epoch": 3.0606860158311346, "step": 2320}, {"loss": 0.7598, "grad_norm": 0.6758618950843811, "learning_rate": 0.0002, "epoch": 3.073878627968338, "step": 2330}, {"loss": 0.7631, "grad_norm": 0.8407140970230103, "learning_rate": 0.0002, "epoch": 3.0870712401055407, "step": 2340}, {"loss": 0.7664, "grad_norm": 0.767779529094696, "learning_rate": 0.0002, "epoch": 3.100263852242744, "step": 2350}, {"loss": 0.7121, "grad_norm": 0.5572896599769592, "learning_rate": 0.0002, "epoch": 3.113456464379947, "step": 2360}, {"loss": 0.7419, "grad_norm": 0.5908368825912476, "learning_rate": 0.0002, "epoch": 3.1266490765171504, "step": 2370}, {"loss": 0.8024, "grad_norm": 0.8047826290130615, "learning_rate": 0.0002, "epoch": 3.1398416886543536, "step": 2380}, {"loss": 0.8686, "grad_norm": 0.8041718006134033, "learning_rate": 0.0002, "epoch": 3.153034300791557, "step": 2390}, {"loss": 0.668, "grad_norm": 0.57078617811203, "learning_rate": 0.0002, "epoch": 3.16622691292876, "step": 2400}, {"loss": 0.7976, "grad_norm": 0.5125322937965393, "learning_rate": 0.0002, "epoch": 3.179419525065963, "step": 2410}, {"loss": 0.741, "grad_norm": 0.6356934309005737, "learning_rate": 0.0002, "epoch": 3.192612137203166, "step": 2420}, {"loss": 0.687, "grad_norm": 1.0129680633544922, "learning_rate": 0.0002, "epoch": 3.2058047493403694, "step": 2430}, {"loss": 0.8316, "grad_norm": 0.8104226589202881, "learning_rate": 0.0002, "epoch": 3.2189973614775726, "step": 2440}, {"loss": 0.8343, "grad_norm": 0.7276079058647156, "learning_rate": 0.0002, "epoch": 3.232189973614776, "step": 2450}, {"loss": 0.8183, "grad_norm": 0.9753884077072144, "learning_rate": 0.0002, "epoch": 3.2453825857519787, "step": 2460}, {"loss": 0.7776, "grad_norm": 0.9753183722496033, "learning_rate": 0.0002, "epoch": 3.258575197889182, "step": 2470}, {"loss": 0.8815, "grad_norm": 0.6791225075721741, "learning_rate": 0.0002, "epoch": 3.271767810026385, "step": 2480}, {"loss": 0.7548, "grad_norm": 0.6797150373458862, "learning_rate": 0.0002, "epoch": 3.2849604221635884, "step": 2490}, {"loss": 0.8395, "grad_norm": 0.8107194900512695, "learning_rate": 0.0002, "epoch": 3.2981530343007917, "step": 2500}, {"loss": 0.7869, "grad_norm": 0.5878375172615051, "learning_rate": 0.0002, "epoch": 3.311345646437995, "step": 2510}, {"loss": 0.7992, "grad_norm": 0.5882975459098816, "learning_rate": 0.0002, "epoch": 3.324538258575198, "step": 2520}, {"loss": 0.7472, "grad_norm": 0.6180013418197632, "learning_rate": 0.0002, "epoch": 3.337730870712401, "step": 2530}, {"loss": 0.8033, "grad_norm": 1.0008151531219482, "learning_rate": 0.0002, "epoch": 3.350923482849604, "step": 2540}, {"loss": 0.8464, "grad_norm": 0.6404656767845154, "learning_rate": 0.0002, "epoch": 3.3641160949868074, "step": 2550}, {"loss": 0.7533, "grad_norm": 0.8481354117393494, "learning_rate": 0.0002, "epoch": 3.3773087071240107, "step": 2560}, {"loss": 0.7852, "grad_norm": 0.8068035244941711, "learning_rate": 0.0002, "epoch": 3.390501319261214, "step": 2570}, {"loss": 0.8621, "grad_norm": 0.7477166056632996, "learning_rate": 0.0002, "epoch": 3.4036939313984167, "step": 2580}, {"loss": 0.8352, "grad_norm": 0.6202635765075684, "learning_rate": 0.0002, "epoch": 3.41688654353562, "step": 2590}, {"loss": 0.7572, "grad_norm": 0.6981159448623657, "learning_rate": 0.0002, "epoch": 3.430079155672823, "step": 2600}, {"loss": 0.7846, "grad_norm": 0.6611084342002869, "learning_rate": 0.0002, "epoch": 3.4432717678100264, "step": 2610}, {"loss": 0.7503, "grad_norm": 0.5727696418762207, "learning_rate": 0.0002, "epoch": 3.4564643799472297, "step": 2620}, {"loss": 0.8427, "grad_norm": 1.2354545593261719, "learning_rate": 0.0002, "epoch": 3.469656992084433, "step": 2630}, {"loss": 0.7747, "grad_norm": 0.6347638368606567, "learning_rate": 0.0002, "epoch": 3.4828496042216357, "step": 2640}, {"loss": 0.8426, "grad_norm": 0.6975704431533813, "learning_rate": 0.0002, "epoch": 3.496042216358839, "step": 2650}, {"loss": 0.8773, "grad_norm": 0.6569573879241943, "learning_rate": 0.0002, "epoch": 3.509234828496042, "step": 2660}, {"loss": 0.7908, "grad_norm": 0.6979609131813049, "learning_rate": 0.0002, "epoch": 3.5224274406332454, "step": 2670}, {"loss": 0.8254, "grad_norm": 0.6287988424301147, "learning_rate": 0.0002, "epoch": 3.5356200527704487, "step": 2680}, {"loss": 0.7815, "grad_norm": 0.8682637214660645, "learning_rate": 0.0002, "epoch": 3.5488126649076515, "step": 2690}, {"loss": 0.7566, "grad_norm": 0.7062831521034241, "learning_rate": 0.0002, "epoch": 3.5620052770448547, "step": 2700}, {"loss": 0.713, "grad_norm": 1.0061452388763428, "learning_rate": 0.0002, "epoch": 3.575197889182058, "step": 2710}, {"loss": 0.7738, "grad_norm": 0.719097375869751, "learning_rate": 0.0002, "epoch": 3.588390501319261, "step": 2720}, {"loss": 0.8145, "grad_norm": 0.7583496570587158, "learning_rate": 0.0002, "epoch": 3.6015831134564644, "step": 2730}, {"loss": 0.91, "grad_norm": 0.7543531060218811, "learning_rate": 0.0002, "epoch": 3.6147757255936677, "step": 2740}, {"loss": 0.8325, "grad_norm": 0.8873646855354309, "learning_rate": 0.0002, "epoch": 3.627968337730871, "step": 2750}, {"loss": 0.7116, "grad_norm": 1.0657562017440796, "learning_rate": 0.0002, "epoch": 3.641160949868074, "step": 2760}, {"loss": 0.8291, "grad_norm": 0.8641113638877869, "learning_rate": 0.0002, "epoch": 3.654353562005277, "step": 2770}, {"loss": 0.8302, "grad_norm": 0.6620645523071289, "learning_rate": 0.0002, "epoch": 3.66754617414248, "step": 2780}, {"loss": 0.8261, "grad_norm": 0.6919541954994202, "learning_rate": 0.0002, "epoch": 3.6807387862796834, "step": 2790}, {"loss": 0.8388, "grad_norm": 0.7305743098258972, "learning_rate": 0.0002, "epoch": 3.6939313984168867, "step": 2800}, {"loss": 0.8053, "grad_norm": 0.7464777827262878, "learning_rate": 0.0002, "epoch": 3.7071240105540895, "step": 2810}, {"loss": 0.8019, "grad_norm": 0.8067063093185425, "learning_rate": 0.0002, "epoch": 3.7203166226912927, "step": 2820}, {"loss": 0.8259, "grad_norm": 0.7789416313171387, "learning_rate": 0.0002, "epoch": 3.733509234828496, "step": 2830}, {"loss": 0.774, "grad_norm": 0.507529079914093, "learning_rate": 0.0002, "epoch": 3.746701846965699, "step": 2840}, {"loss": 0.832, "grad_norm": 0.6509260535240173, "learning_rate": 0.0002, "epoch": 3.7598944591029024, "step": 2850}, {"loss": 0.8257, "grad_norm": 0.9141367673873901, "learning_rate": 0.0002, "epoch": 3.7730870712401057, "step": 2860}, {"loss": 0.9436, "grad_norm": 0.7852635979652405, "learning_rate": 0.0002, "epoch": 3.786279683377309, "step": 2870}, {"loss": 0.8842, "grad_norm": 0.5340318083763123, "learning_rate": 0.0002, "epoch": 3.7994722955145117, "step": 2880}, {"loss": 0.7468, "grad_norm": 0.6246042847633362, "learning_rate": 0.0002, "epoch": 3.812664907651715, "step": 2890}, {"loss": 0.8184, "grad_norm": 0.7064066529273987, "learning_rate": 0.0002, "epoch": 3.825857519788918, "step": 2900}, {"loss": 0.8515, "grad_norm": 0.6144065856933594, "learning_rate": 0.0002, "epoch": 3.8390501319261214, "step": 2910}, {"loss": 0.7484, "grad_norm": 0.5268424153327942, "learning_rate": 0.0002, "epoch": 3.8522427440633247, "step": 2920}, {"loss": 0.7594, "grad_norm": 0.9508116841316223, "learning_rate": 0.0002, "epoch": 3.8654353562005275, "step": 2930}, {"loss": 0.8437, "grad_norm": 0.9133715629577637, "learning_rate": 0.0002, "epoch": 3.8786279683377307, "step": 2940}, {"loss": 0.8611, "grad_norm": 1.0144646167755127, "learning_rate": 0.0002, "epoch": 3.891820580474934, "step": 2950}, {"loss": 0.8043, "grad_norm": 0.6397877931594849, "learning_rate": 0.0002, "epoch": 3.905013192612137, "step": 2960}, {"loss": 0.8285, "grad_norm": 0.734835147857666, "learning_rate": 0.0002, "epoch": 3.9182058047493404, "step": 2970}, {"loss": 0.7831, "grad_norm": 0.784853994846344, "learning_rate": 0.0002, "epoch": 3.9313984168865437, "step": 2980}, {"loss": 0.8148, "grad_norm": 0.805831789970398, "learning_rate": 0.0002, "epoch": 3.944591029023747, "step": 2990}, {"loss": 0.8252, "grad_norm": 0.6299595236778259, "learning_rate": 0.0002, "epoch": 3.9577836411609497, "step": 3000}, {"loss": 0.8244, "grad_norm": 0.6264058351516724, "learning_rate": 0.0002, "epoch": 3.970976253298153, "step": 3010}, {"loss": 0.8185, "grad_norm": 0.6419739723205566, "learning_rate": 0.0002, "epoch": 3.984168865435356, "step": 3020}, {"loss": 0.8174, "grad_norm": 0.7737036943435669, "learning_rate": 0.0002, "epoch": 3.9973614775725594, "step": 3030}, {"eval_loss": 1.2454297542572021, "eval_runtime": 71.8558, "eval_samples_per_second": 5.998, "eval_steps_per_second": 0.752, "epoch": 4.0, "step": 3032}, {"loss": 0.6716, "grad_norm": 1.092727541923523, "learning_rate": 0.0002, "epoch": 4.010554089709762, "step": 3040}, {"loss": 0.596, "grad_norm": 0.8087759613990784, "learning_rate": 0.0002, "epoch": 4.0237467018469655, "step": 3050}, {"loss": 0.7055, "grad_norm": 0.8106053471565247, "learning_rate": 0.0002, "epoch": 4.036939313984169, "step": 3060}, {"loss": 0.6846, "grad_norm": 0.8675326704978943, "learning_rate": 0.0002, "epoch": 4.050131926121372, "step": 3070}, {"loss": 0.6064, "grad_norm": 0.9620490074157715, "learning_rate": 0.0002, "epoch": 4.063324538258575, "step": 3080}, {"loss": 0.6047, "grad_norm": 0.8996296525001526, "learning_rate": 0.0002, "epoch": 4.076517150395778, "step": 3090}, {"loss": 0.6111, "grad_norm": 0.8648998737335205, "learning_rate": 0.0002, "epoch": 4.089709762532982, "step": 3100}, {"loss": 0.5853, "grad_norm": 1.0321335792541504, "learning_rate": 0.0002, "epoch": 4.102902374670185, "step": 3110}, {"loss": 0.6161, "grad_norm": 0.7949225306510925, "learning_rate": 0.0002, "epoch": 4.116094986807388, "step": 3120}, {"loss": 0.6354, "grad_norm": 0.9684646129608154, "learning_rate": 0.0002, "epoch": 4.129287598944591, "step": 3130}, {"loss": 0.6198, "grad_norm": 0.8698066473007202, "learning_rate": 0.0002, "epoch": 4.142480211081795, "step": 3140}, {"loss": 0.7185, "grad_norm": 0.7688450813293457, "learning_rate": 0.0002, "epoch": 4.155672823218997, "step": 3150}, {"loss": 0.6053, "grad_norm": 0.9682092070579529, "learning_rate": 0.0002, "epoch": 4.1688654353562, "step": 3160}, {"loss": 0.6827, "grad_norm": 0.961561918258667, "learning_rate": 0.0002, "epoch": 4.1820580474934035, "step": 3170}, {"loss": 0.6403, "grad_norm": 1.3962990045547485, "learning_rate": 0.0002, "epoch": 4.195250659630607, "step": 3180}, {"loss": 0.6319, "grad_norm": 0.9485045075416565, "learning_rate": 0.0002, "epoch": 4.20844327176781, "step": 3190}, {"loss": 0.5908, "grad_norm": 0.7768281698226929, "learning_rate": 0.0002, "epoch": 4.221635883905013, "step": 3200}, {"loss": 0.6365, "grad_norm": 1.2685691118240356, "learning_rate": 0.0002, "epoch": 4.2348284960422165, "step": 3210}, {"loss": 0.6601, "grad_norm": 0.6876471638679504, "learning_rate": 0.0002, "epoch": 4.24802110817942, "step": 3220}, {"loss": 0.6274, "grad_norm": 1.0074554681777954, "learning_rate": 0.0002, "epoch": 4.261213720316623, "step": 3230}, {"loss": 0.6027, "grad_norm": 0.8094777464866638, "learning_rate": 0.0002, "epoch": 4.274406332453826, "step": 3240}, {"loss": 0.643, "grad_norm": 0.7906569242477417, "learning_rate": 0.0002, "epoch": 4.287598944591029, "step": 3250}, {"loss": 0.5909, "grad_norm": 0.840238630771637, "learning_rate": 0.0002, "epoch": 4.300791556728232, "step": 3260}, {"loss": 0.5943, "grad_norm": 1.0119295120239258, "learning_rate": 0.0002, "epoch": 4.313984168865435, "step": 3270}, {"loss": 0.5912, "grad_norm": 0.7943191528320312, "learning_rate": 0.0002, "epoch": 4.327176781002638, "step": 3280}, {"loss": 0.6235, "grad_norm": 0.7691723704338074, "learning_rate": 0.0002, "epoch": 4.3403693931398415, "step": 3290}, {"loss": 0.6173, "grad_norm": 0.7227770686149597, "learning_rate": 0.0002, "epoch": 4.353562005277045, "step": 3300}, {"loss": 0.6047, "grad_norm": 0.8512253165245056, "learning_rate": 0.0002, "epoch": 4.366754617414248, "step": 3310}, {"loss": 0.5849, "grad_norm": 0.7852529287338257, "learning_rate": 0.0002, "epoch": 4.379947229551451, "step": 3320}, {"loss": 0.6416, "grad_norm": 0.8888797163963318, "learning_rate": 0.0002, "epoch": 4.3931398416886545, "step": 3330}, {"loss": 0.6804, "grad_norm": 0.9522430896759033, "learning_rate": 0.0002, "epoch": 4.406332453825858, "step": 3340}, {"loss": 0.6345, "grad_norm": 0.900276780128479, "learning_rate": 0.0002, "epoch": 4.419525065963061, "step": 3350}, {"loss": 0.7055, "grad_norm": 1.181547999382019, "learning_rate": 0.0002, "epoch": 4.432717678100264, "step": 3360}, {"loss": 0.7073, "grad_norm": 0.903142511844635, "learning_rate": 0.0002, "epoch": 4.445910290237467, "step": 3370}, {"loss": 0.7235, "grad_norm": 0.8747565150260925, "learning_rate": 0.0002, "epoch": 4.45910290237467, "step": 3380}, {"loss": 0.7071, "grad_norm": 0.7838051319122314, "learning_rate": 0.0002, "epoch": 4.472295514511873, "step": 3390}, {"loss": 0.5932, "grad_norm": 0.8691313862800598, "learning_rate": 0.0002, "epoch": 4.485488126649076, "step": 3400}, {"loss": 0.7019, "grad_norm": 0.8493868708610535, "learning_rate": 0.0002, "epoch": 4.4986807387862795, "step": 3410}, {"loss": 0.5959, "grad_norm": 1.0104830265045166, "learning_rate": 0.0002, "epoch": 4.511873350923483, "step": 3420}, {"loss": 0.6662, "grad_norm": 1.1716967821121216, "learning_rate": 0.0002, "epoch": 4.525065963060686, "step": 3430}, {"loss": 0.6411, "grad_norm": 0.9122593998908997, "learning_rate": 0.0002, "epoch": 4.538258575197889, "step": 3440}, {"loss": 0.7047, "grad_norm": 0.829090416431427, "learning_rate": 0.0002, "epoch": 4.5514511873350925, "step": 3450}, {"loss": 0.6001, "grad_norm": 1.141662836074829, "learning_rate": 0.0002, "epoch": 4.564643799472296, "step": 3460}, {"loss": 0.6612, "grad_norm": 0.8423182368278503, "learning_rate": 0.0002, "epoch": 4.577836411609499, "step": 3470}, {"loss": 0.6797, "grad_norm": 0.8024184703826904, "learning_rate": 0.0002, "epoch": 4.591029023746702, "step": 3480}, {"loss": 0.7184, "grad_norm": 0.7703381776809692, "learning_rate": 0.0002, "epoch": 4.6042216358839045, "step": 3490}, {"loss": 0.7001, "grad_norm": 0.9883959293365479, "learning_rate": 0.0002, "epoch": 4.617414248021108, "step": 3500}, {"loss": 0.6188, "grad_norm": 0.9554709196090698, "learning_rate": 0.0002, "epoch": 4.630606860158311, "step": 3510}, {"loss": 0.7378, "grad_norm": 1.9949709177017212, "learning_rate": 0.0002, "epoch": 4.643799472295514, "step": 3520}, {"loss": 0.6678, "grad_norm": 0.7762255072593689, "learning_rate": 0.0002, "epoch": 4.6569920844327175, "step": 3530}, {"loss": 0.6298, "grad_norm": 0.9538425803184509, "learning_rate": 0.0002, "epoch": 4.670184696569921, "step": 3540}, {"loss": 0.6352, "grad_norm": 1.0279661417007446, "learning_rate": 0.0002, "epoch": 4.683377308707124, "step": 3550}, {"loss": 0.6641, "grad_norm": 0.7545472979545593, "learning_rate": 0.0002, "epoch": 4.696569920844327, "step": 3560}, {"loss": 0.6887, "grad_norm": 0.8919376730918884, "learning_rate": 0.0002, "epoch": 4.7097625329815305, "step": 3570}, {"loss": 0.6395, "grad_norm": 0.7621569633483887, "learning_rate": 0.0002, "epoch": 4.722955145118734, "step": 3580}, {"loss": 0.6928, "grad_norm": 1.205320119857788, "learning_rate": 0.0002, "epoch": 4.736147757255937, "step": 3590}, {"loss": 0.6612, "grad_norm": 1.0642725229263306, "learning_rate": 0.0002, "epoch": 4.74934036939314, "step": 3600}, {"loss": 0.6541, "grad_norm": 0.9402666687965393, "learning_rate": 0.0002, "epoch": 4.762532981530343, "step": 3610}, {"loss": 0.6395, "grad_norm": 1.254127025604248, "learning_rate": 0.0002, "epoch": 4.775725593667546, "step": 3620}, {"loss": 0.692, "grad_norm": 0.7609598636627197, "learning_rate": 0.0002, "epoch": 4.788918205804749, "step": 3630}, {"loss": 0.6578, "grad_norm": 0.8240329623222351, "learning_rate": 0.0002, "epoch": 4.802110817941952, "step": 3640}, {"loss": 0.7383, "grad_norm": 0.8356260657310486, "learning_rate": 0.0002, "epoch": 4.8153034300791555, "step": 3650}, {"loss": 0.6368, "grad_norm": 0.9130708575248718, "learning_rate": 0.0002, "epoch": 4.828496042216359, "step": 3660}, {"loss": 0.7269, "grad_norm": 0.9384765028953552, "learning_rate": 0.0002, "epoch": 4.841688654353562, "step": 3670}, {"loss": 0.6509, "grad_norm": 0.9829966425895691, "learning_rate": 0.0002, "epoch": 4.854881266490765, "step": 3680}, {"loss": 0.6311, "grad_norm": 1.0488632917404175, "learning_rate": 0.0002, "epoch": 4.8680738786279685, "step": 3690}, {"loss": 0.7005, "grad_norm": 1.2278969287872314, "learning_rate": 0.0002, "epoch": 4.881266490765172, "step": 3700}, {"loss": 0.6869, "grad_norm": 0.8078970313072205, "learning_rate": 0.0002, "epoch": 4.894459102902375, "step": 3710}, {"loss": 0.6588, "grad_norm": 0.8081700205802917, "learning_rate": 0.0002, "epoch": 4.907651715039578, "step": 3720}, {"loss": 0.7189, "grad_norm": 0.9204511046409607, "learning_rate": 0.0002, "epoch": 4.9208443271767806, "step": 3730}, {"loss": 0.6953, "grad_norm": 0.9326391220092773, "learning_rate": 0.0002, "epoch": 4.934036939313984, "step": 3740}, {"loss": 0.68, "grad_norm": 1.0089969635009766, "learning_rate": 0.0002, "epoch": 4.947229551451187, "step": 3750}, {"loss": 0.7031, "grad_norm": 0.7063466906547546, "learning_rate": 0.0002, "epoch": 4.96042216358839, "step": 3760}, {"loss": 0.6568, "grad_norm": 1.2603905200958252, "learning_rate": 0.0002, "epoch": 4.9736147757255935, "step": 3770}, {"loss": 0.7134, "grad_norm": 0.8418653607368469, "learning_rate": 0.0002, "epoch": 4.986807387862797, "step": 3780}, {"loss": 0.6683, "grad_norm": 0.9537181854248047, "learning_rate": 0.0002, "epoch": 5.0, "step": 3790}, {"eval_loss": 1.3319307565689087, "eval_runtime": 71.7836, "eval_samples_per_second": 6.004, "eval_steps_per_second": 0.752, "epoch": 5.0, "step": 3790}, {"loss": 0.489, "grad_norm": 0.8595899343490601, "learning_rate": 0.0002, "epoch": 5.013192612137203, "step": 3800}, {"loss": 0.5155, "grad_norm": 1.0023565292358398, "learning_rate": 0.0002, "epoch": 5.0263852242744065, "step": 3810}, {"loss": 0.5321, "grad_norm": 1.2770460844039917, "learning_rate": 0.0002, "epoch": 5.03957783641161, "step": 3820}, {"loss": 0.5127, "grad_norm": 1.1701956987380981, "learning_rate": 0.0002, "epoch": 5.052770448548813, "step": 3830}, {"loss": 0.5057, "grad_norm": 0.812269926071167, "learning_rate": 0.0002, "epoch": 5.065963060686016, "step": 3840}, {"loss": 0.4292, "grad_norm": 0.8186697363853455, "learning_rate": 0.0002, "epoch": 5.0791556728232194, "step": 3850}, {"loss": 0.4865, "grad_norm": 1.052565097808838, "learning_rate": 0.0002, "epoch": 5.092348284960422, "step": 3860}, {"loss": 0.4947, "grad_norm": 0.9764705300331116, "learning_rate": 0.0002, "epoch": 5.105540897097625, "step": 3870}, {"loss": 0.471, "grad_norm": 0.6973426938056946, "learning_rate": 0.0002, "epoch": 5.118733509234828, "step": 3880}, {"loss": 0.5565, "grad_norm": 1.2127928733825684, "learning_rate": 0.0002, "epoch": 5.1319261213720315, "step": 3890}, {"loss": 0.4122, "grad_norm": 0.682807981967926, "learning_rate": 0.0002, "epoch": 5.145118733509235, "step": 3900}, {"loss": 0.6378, "grad_norm": 1.3575998544692993, "learning_rate": 0.0002, "epoch": 5.158311345646438, "step": 3910}, {"loss": 0.4624, "grad_norm": 1.2581931352615356, "learning_rate": 0.0002, "epoch": 5.171503957783641, "step": 3920}, {"loss": 0.5092, "grad_norm": 1.0493637323379517, "learning_rate": 0.0002, "epoch": 5.1846965699208445, "step": 3930}, {"loss": 0.4563, "grad_norm": 1.3519670963287354, "learning_rate": 0.0002, "epoch": 5.197889182058048, "step": 3940}, {"loss": 0.5414, "grad_norm": 1.0690566301345825, "learning_rate": 0.0002, "epoch": 5.211081794195251, "step": 3950}, {"loss": 0.5038, "grad_norm": 1.1171330213546753, "learning_rate": 0.0002, "epoch": 5.224274406332454, "step": 3960}, {"loss": 0.4397, "grad_norm": 1.055851697921753, "learning_rate": 0.0002, "epoch": 5.237467018469657, "step": 3970}, {"loss": 0.4964, "grad_norm": 0.8870180249214172, "learning_rate": 0.0002, "epoch": 5.25065963060686, "step": 3980}, {"loss": 0.5353, "grad_norm": 0.9688402414321899, "learning_rate": 0.0002, "epoch": 5.263852242744063, "step": 3990}, {"loss": 0.5192, "grad_norm": 0.8458422422409058, "learning_rate": 0.0002, "epoch": 5.277044854881266, "step": 4000}, {"loss": 0.5458, "grad_norm": 0.908256471157074, "learning_rate": 0.0002, "epoch": 5.2902374670184695, "step": 4010}, {"loss": 0.5102, "grad_norm": 1.0058149099349976, "learning_rate": 0.0002, "epoch": 5.303430079155673, "step": 4020}, {"loss": 0.5322, "grad_norm": 1.20364511013031, "learning_rate": 0.0002, "epoch": 5.316622691292876, "step": 4030}, {"loss": 0.5715, "grad_norm": 1.0135732889175415, "learning_rate": 0.0002, "epoch": 5.329815303430079, "step": 4040}, {"loss": 0.4736, "grad_norm": 1.1094907522201538, "learning_rate": 0.0002, "epoch": 5.3430079155672825, "step": 4050}, {"loss": 0.4912, "grad_norm": 1.0373083353042603, "learning_rate": 0.0002, "epoch": 5.356200527704486, "step": 4060}, {"loss": 0.5258, "grad_norm": 1.0952966213226318, "learning_rate": 0.0002, "epoch": 5.369393139841689, "step": 4070}, {"loss": 0.4892, "grad_norm": 1.1734952926635742, "learning_rate": 0.0002, "epoch": 5.382585751978892, "step": 4080}, {"loss": 0.4463, "grad_norm": 0.8217245936393738, "learning_rate": 0.0002, "epoch": 5.395778364116095, "step": 4090}, {"loss": 0.5271, "grad_norm": 1.0936307907104492, "learning_rate": 0.0002, "epoch": 5.408970976253298, "step": 4100}, {"loss": 0.509, "grad_norm": 1.0198720693588257, "learning_rate": 0.0002, "epoch": 5.422163588390501, "step": 4110}, {"loss": 0.5265, "grad_norm": 1.1105809211730957, "learning_rate": 0.0002, "epoch": 5.435356200527704, "step": 4120}, {"loss": 0.4871, "grad_norm": 1.1817213296890259, "learning_rate": 0.0002, "epoch": 5.4485488126649075, "step": 4130}, {"loss": 0.4987, "grad_norm": 1.126339077949524, "learning_rate": 0.0002, "epoch": 5.461741424802111, "step": 4140}, {"loss": 0.5743, "grad_norm": 0.9467914700508118, "learning_rate": 0.0002, "epoch": 5.474934036939314, "step": 4150}, {"loss": 0.5386, "grad_norm": 1.0335774421691895, "learning_rate": 0.0002, "epoch": 5.488126649076517, "step": 4160}, {"loss": 0.5122, "grad_norm": 0.866211473941803, "learning_rate": 0.0002, "epoch": 5.5013192612137205, "step": 4170}, {"loss": 0.5697, "grad_norm": 0.7422948479652405, "learning_rate": 0.0002, "epoch": 5.514511873350924, "step": 4180}, {"loss": 0.586, "grad_norm": 1.2211135625839233, "learning_rate": 0.0002, "epoch": 5.527704485488127, "step": 4190}, {"loss": 0.5476, "grad_norm": 1.0371766090393066, "learning_rate": 0.0002, "epoch": 5.540897097625329, "step": 4200}, {"loss": 0.5941, "grad_norm": 0.9460630416870117, "learning_rate": 0.0002, "epoch": 5.554089709762533, "step": 4210}, {"loss": 0.4645, "grad_norm": 0.7972197532653809, "learning_rate": 0.0002, "epoch": 5.567282321899736, "step": 4220}, {"loss": 0.5087, "grad_norm": 1.0654675960540771, "learning_rate": 0.0002, "epoch": 5.580474934036939, "step": 4230}, {"loss": 0.5957, "grad_norm": 1.0776735544204712, "learning_rate": 0.0002, "epoch": 5.593667546174142, "step": 4240}, {"loss": 0.53, "grad_norm": 1.498723030090332, "learning_rate": 0.0002, "epoch": 5.6068601583113455, "step": 4250}, {"loss": 0.4788, "grad_norm": 1.006768822669983, "learning_rate": 0.0002, "epoch": 5.620052770448549, "step": 4260}, {"loss": 0.5571, "grad_norm": 0.9194242358207703, "learning_rate": 0.0002, "epoch": 5.633245382585752, "step": 4270}, {"loss": 0.5722, "grad_norm": 1.1028380393981934, "learning_rate": 0.0002, "epoch": 5.646437994722955, "step": 4280}, {"loss": 0.5319, "grad_norm": 0.9972755312919617, "learning_rate": 0.0002, "epoch": 5.6596306068601585, "step": 4290}, {"loss": 0.53, "grad_norm": 1.0509438514709473, "learning_rate": 0.0002, "epoch": 5.672823218997362, "step": 4300}, {"loss": 0.4738, "grad_norm": 1.064039945602417, "learning_rate": 0.0002, "epoch": 5.686015831134565, "step": 4310}, {"loss": 0.5401, "grad_norm": 0.9572229981422424, "learning_rate": 0.0002, "epoch": 5.699208443271768, "step": 4320}, {"loss": 0.5173, "grad_norm": 0.9956564903259277, "learning_rate": 0.0002, "epoch": 5.7124010554089715, "step": 4330}, {"loss": 0.6008, "grad_norm": 1.01974618434906, "learning_rate": 0.0002, "epoch": 5.725593667546174, "step": 4340}, {"loss": 0.5111, "grad_norm": 1.101328730583191, "learning_rate": 0.0002, "epoch": 5.738786279683377, "step": 4350}, {"loss": 0.5921, "grad_norm": 0.9971756935119629, "learning_rate": 0.0002, "epoch": 5.75197889182058, "step": 4360}, {"loss": 0.5262, "grad_norm": 0.8579474687576294, "learning_rate": 0.0002, "epoch": 5.7651715039577835, "step": 4370}, {"loss": 0.5106, "grad_norm": 0.9927367568016052, "learning_rate": 0.0002, "epoch": 5.778364116094987, "step": 4380}, {"loss": 0.5354, "grad_norm": 1.1183884143829346, "learning_rate": 0.0002, "epoch": 5.79155672823219, "step": 4390}, {"loss": 0.5658, "grad_norm": 0.7695905566215515, "learning_rate": 0.0002, "epoch": 5.804749340369393, "step": 4400}, {"loss": 0.5137, "grad_norm": 1.1102122068405151, "learning_rate": 0.0002, "epoch": 5.8179419525065965, "step": 4410}, {"loss": 0.5634, "grad_norm": 1.3201336860656738, "learning_rate": 0.0002, "epoch": 5.8311345646438, "step": 4420}, {"loss": 0.5773, "grad_norm": 1.1934558153152466, "learning_rate": 0.0002, "epoch": 5.844327176781003, "step": 4430}, {"loss": 0.6338, "grad_norm": 1.390870451927185, "learning_rate": 0.0002, "epoch": 5.857519788918205, "step": 4440}, {"loss": 0.5625, "grad_norm": 1.056314468383789, "learning_rate": 0.0002, "epoch": 5.870712401055409, "step": 4450}, {"loss": 0.6456, "grad_norm": 0.9797437191009521, "learning_rate": 0.0002, "epoch": 5.883905013192612, "step": 4460}, {"loss": 0.5479, "grad_norm": 1.2368146181106567, "learning_rate": 0.0002, "epoch": 5.897097625329815, "step": 4470}, {"loss": 0.5453, "grad_norm": 0.9062654376029968, "learning_rate": 0.0002, "epoch": 5.910290237467018, "step": 4480}, {"loss": 0.5857, "grad_norm": 1.8643536567687988, "learning_rate": 0.0002, "epoch": 5.923482849604222, "step": 4490}, {"loss": 0.5858, "grad_norm": 1.2977997064590454, "learning_rate": 0.0002, "epoch": 5.936675461741425, "step": 4500}, {"loss": 0.4815, "grad_norm": 0.8366201519966125, "learning_rate": 0.0002, "epoch": 5.949868073878628, "step": 4510}, {"loss": 0.5126, "grad_norm": 1.0210131406784058, "learning_rate": 0.0002, "epoch": 5.963060686015831, "step": 4520}, {"loss": 0.5577, "grad_norm": 1.1287827491760254, "learning_rate": 0.0002, "epoch": 5.9762532981530345, "step": 4530}, {"loss": 0.5053, "grad_norm": 1.0480493307113647, "learning_rate": 0.0002, "epoch": 5.989445910290238, "step": 4540}]} +{"epoch": 7.0, "step": 5306, "epoch_duration": 2075.8860075473785, "total_accumulated_duration": 15233.741616010666, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.9466, "grad_norm": 0.7545632123947144, "learning_rate": 0.0002, "epoch": 0.013192612137203167, "step": 10}, {"loss": 1.4909, "grad_norm": 0.5787661075592041, "learning_rate": 0.0002, "epoch": 0.026385224274406333, "step": 20}, {"loss": 1.3906, "grad_norm": 0.8616093993186951, "learning_rate": 0.0002, "epoch": 0.0395778364116095, "step": 30}, {"loss": 1.4116, "grad_norm": 0.42088547348976135, "learning_rate": 0.0002, "epoch": 0.052770448548812667, "step": 40}, {"loss": 1.3446, "grad_norm": 0.47704678773880005, "learning_rate": 0.0002, "epoch": 0.06596306068601583, "step": 50}, {"loss": 1.2476, "grad_norm": 0.5763994455337524, "learning_rate": 0.0002, "epoch": 0.079155672823219, "step": 60}, {"loss": 1.2268, "grad_norm": 0.4579846262931824, "learning_rate": 0.0002, "epoch": 0.09234828496042216, "step": 70}, {"loss": 1.415, "grad_norm": 0.46623846888542175, "learning_rate": 0.0002, "epoch": 0.10554089709762533, "step": 80}, {"loss": 1.2849, "grad_norm": 0.4206956624984741, "learning_rate": 0.0002, "epoch": 0.11873350923482849, "step": 90}, {"loss": 1.1608, "grad_norm": 0.41896629333496094, "learning_rate": 0.0002, "epoch": 0.13192612137203166, "step": 100}, {"loss": 1.2617, "grad_norm": 0.3459089398384094, "learning_rate": 0.0002, "epoch": 0.14511873350923482, "step": 110}, {"loss": 1.2858, "grad_norm": 0.4587327837944031, "learning_rate": 0.0002, "epoch": 0.158311345646438, "step": 120}, {"loss": 1.2241, "grad_norm": 0.433525413274765, "learning_rate": 0.0002, "epoch": 0.17150395778364116, "step": 130}, {"loss": 1.2269, "grad_norm": 0.39253175258636475, "learning_rate": 0.0002, "epoch": 0.18469656992084432, "step": 140}, {"loss": 1.2224, "grad_norm": 0.3602290749549866, "learning_rate": 0.0002, "epoch": 0.19788918205804748, "step": 150}, {"loss": 1.2834, "grad_norm": 0.41160839796066284, "learning_rate": 0.0002, "epoch": 0.21108179419525067, "step": 160}, {"loss": 1.1986, "grad_norm": 0.7213630080223083, "learning_rate": 0.0002, "epoch": 0.22427440633245382, "step": 170}, {"loss": 1.2215, "grad_norm": 0.39086055755615234, "learning_rate": 0.0002, "epoch": 0.23746701846965698, "step": 180}, {"loss": 1.3315, "grad_norm": 0.4465520977973938, "learning_rate": 0.0002, "epoch": 0.25065963060686014, "step": 190}, {"loss": 1.3798, "grad_norm": 1.814679741859436, "learning_rate": 0.0002, "epoch": 0.2638522427440633, "step": 200}, {"loss": 1.2259, "grad_norm": 0.5026423931121826, "learning_rate": 0.0002, "epoch": 0.2770448548812665, "step": 210}, {"loss": 1.3306, "grad_norm": 0.4156292974948883, "learning_rate": 0.0002, "epoch": 0.29023746701846964, "step": 220}, {"loss": 1.266, "grad_norm": 0.40813493728637695, "learning_rate": 0.0002, "epoch": 0.3034300791556728, "step": 230}, {"loss": 1.1533, "grad_norm": 0.3304787874221802, "learning_rate": 0.0002, "epoch": 0.316622691292876, "step": 240}, {"loss": 1.3154, "grad_norm": 0.46139976382255554, "learning_rate": 0.0002, "epoch": 0.32981530343007914, "step": 250}, {"loss": 1.1365, "grad_norm": 0.37518271803855896, "learning_rate": 0.0002, "epoch": 0.34300791556728233, "step": 260}, {"loss": 1.2004, "grad_norm": 0.35586467385292053, "learning_rate": 0.0002, "epoch": 0.3562005277044855, "step": 270}, {"loss": 1.2609, "grad_norm": 0.32441186904907227, "learning_rate": 0.0002, "epoch": 0.36939313984168864, "step": 280}, {"loss": 1.212, "grad_norm": 0.3198683261871338, "learning_rate": 0.0002, "epoch": 0.38258575197889183, "step": 290}, {"loss": 1.2465, "grad_norm": 0.33663108944892883, "learning_rate": 0.0002, "epoch": 0.39577836411609496, "step": 300}, {"loss": 1.1702, "grad_norm": 0.3711244761943817, "learning_rate": 0.0002, "epoch": 0.40897097625329815, "step": 310}, {"loss": 1.0871, "grad_norm": 0.3209651708602905, "learning_rate": 0.0002, "epoch": 0.42216358839050133, "step": 320}, {"loss": 1.2728, "grad_norm": 0.5152716040611267, "learning_rate": 0.0002, "epoch": 0.43535620052770446, "step": 330}, {"loss": 1.1833, "grad_norm": 0.5431376695632935, "learning_rate": 0.0002, "epoch": 0.44854881266490765, "step": 340}, {"loss": 1.1344, "grad_norm": 0.3069997727870941, "learning_rate": 0.0002, "epoch": 0.46174142480211083, "step": 350}, {"loss": 1.2742, "grad_norm": 0.34260064363479614, "learning_rate": 0.0002, "epoch": 0.47493403693931396, "step": 360}, {"loss": 1.3161, "grad_norm": 0.345653235912323, "learning_rate": 0.0002, "epoch": 0.48812664907651715, "step": 370}, {"loss": 1.1552, "grad_norm": 0.46222734451293945, "learning_rate": 0.0002, "epoch": 0.5013192612137203, "step": 380}, {"loss": 1.0616, "grad_norm": 0.27301734685897827, "learning_rate": 0.0002, "epoch": 0.5145118733509235, "step": 390}, {"loss": 1.1944, "grad_norm": 0.29048439860343933, "learning_rate": 0.0002, "epoch": 0.5277044854881267, "step": 400}, {"loss": 1.0956, "grad_norm": 0.32927802205085754, "learning_rate": 0.0002, "epoch": 0.5408970976253298, "step": 410}, {"loss": 1.2362, "grad_norm": 0.3336397409439087, "learning_rate": 0.0002, "epoch": 0.554089709762533, "step": 420}, {"loss": 1.1445, "grad_norm": 0.4007597267627716, "learning_rate": 0.0002, "epoch": 0.5672823218997362, "step": 430}, {"loss": 1.2731, "grad_norm": 0.36144956946372986, "learning_rate": 0.0002, "epoch": 0.5804749340369393, "step": 440}, {"loss": 1.1604, "grad_norm": 0.6331009864807129, "learning_rate": 0.0002, "epoch": 0.5936675461741425, "step": 450}, {"loss": 1.1692, "grad_norm": 0.41469088196754456, "learning_rate": 0.0002, "epoch": 0.6068601583113457, "step": 460}, {"loss": 1.2372, "grad_norm": 0.4388185143470764, "learning_rate": 0.0002, "epoch": 0.6200527704485488, "step": 470}, {"loss": 1.2541, "grad_norm": 0.3738141655921936, "learning_rate": 0.0002, "epoch": 0.633245382585752, "step": 480}, {"loss": 1.1265, "grad_norm": 0.7212023138999939, "learning_rate": 0.0002, "epoch": 0.6464379947229552, "step": 490}, {"loss": 1.2786, "grad_norm": 0.2972351014614105, "learning_rate": 0.0002, "epoch": 0.6596306068601583, "step": 500}, {"loss": 1.1739, "grad_norm": 0.45293179154396057, "learning_rate": 0.0002, "epoch": 0.6728232189973615, "step": 510}, {"loss": 1.255, "grad_norm": 0.4319860637187958, "learning_rate": 0.0002, "epoch": 0.6860158311345647, "step": 520}, {"loss": 1.0959, "grad_norm": 0.3050215542316437, "learning_rate": 0.0002, "epoch": 0.6992084432717678, "step": 530}, {"loss": 1.1608, "grad_norm": 0.3552611172199249, "learning_rate": 0.0002, "epoch": 0.712401055408971, "step": 540}, {"loss": 1.2546, "grad_norm": 0.3631151020526886, "learning_rate": 0.0002, "epoch": 0.7255936675461742, "step": 550}, {"loss": 1.1371, "grad_norm": 0.28177931904792786, "learning_rate": 0.0002, "epoch": 0.7387862796833773, "step": 560}, {"loss": 1.2081, "grad_norm": 0.359764039516449, "learning_rate": 0.0002, "epoch": 0.7519788918205804, "step": 570}, {"loss": 1.1356, "grad_norm": 0.3970327377319336, "learning_rate": 0.0002, "epoch": 0.7651715039577837, "step": 580}, {"loss": 1.309, "grad_norm": 0.3541001081466675, "learning_rate": 0.0002, "epoch": 0.7783641160949868, "step": 590}, {"loss": 1.2061, "grad_norm": 0.3478573262691498, "learning_rate": 0.0002, "epoch": 0.7915567282321899, "step": 600}, {"loss": 1.1864, "grad_norm": 0.3900321424007416, "learning_rate": 0.0002, "epoch": 0.8047493403693932, "step": 610}, {"loss": 1.1358, "grad_norm": 0.3443238437175751, "learning_rate": 0.0002, "epoch": 0.8179419525065963, "step": 620}, {"loss": 1.3232, "grad_norm": 0.44238781929016113, "learning_rate": 0.0002, "epoch": 0.8311345646437994, "step": 630}, {"loss": 1.1247, "grad_norm": 0.36339467763900757, "learning_rate": 0.0002, "epoch": 0.8443271767810027, "step": 640}, {"loss": 1.1049, "grad_norm": 0.6243070363998413, "learning_rate": 0.0002, "epoch": 0.8575197889182058, "step": 650}, {"loss": 1.1943, "grad_norm": 0.3209173381328583, "learning_rate": 0.0002, "epoch": 0.8707124010554089, "step": 660}, {"loss": 1.1927, "grad_norm": 0.35017991065979004, "learning_rate": 0.0002, "epoch": 0.8839050131926122, "step": 670}, {"loss": 1.1603, "grad_norm": 0.3247159421443939, "learning_rate": 0.0002, "epoch": 0.8970976253298153, "step": 680}, {"loss": 1.2298, "grad_norm": 0.4091894030570984, "learning_rate": 0.0002, "epoch": 0.9102902374670184, "step": 690}, {"loss": 1.2756, "grad_norm": 0.3975585997104645, "learning_rate": 0.0002, "epoch": 0.9234828496042217, "step": 700}, {"loss": 1.281, "grad_norm": 0.3666245937347412, "learning_rate": 0.0002, "epoch": 0.9366754617414248, "step": 710}, {"loss": 1.2855, "grad_norm": 0.45216917991638184, "learning_rate": 0.0002, "epoch": 0.9498680738786279, "step": 720}, {"loss": 1.2217, "grad_norm": 0.36108118295669556, "learning_rate": 0.0002, "epoch": 0.9630606860158312, "step": 730}, {"loss": 1.17, "grad_norm": 0.44550251960754395, "learning_rate": 0.0002, "epoch": 0.9762532981530343, "step": 740}, {"loss": 1.0672, "grad_norm": 0.29801255464553833, "learning_rate": 0.0002, "epoch": 0.9894459102902374, "step": 750}, {"eval_loss": 1.203244686126709, "eval_runtime": 76.0457, "eval_samples_per_second": 5.668, "eval_steps_per_second": 0.71, "epoch": 1.0, "step": 758}, {"loss": 1.0748, "grad_norm": 0.4096551239490509, "learning_rate": 0.0002, "epoch": 1.0026385224274406, "step": 760}, {"loss": 1.1537, "grad_norm": 0.2649582326412201, "learning_rate": 0.0002, "epoch": 1.0158311345646438, "step": 770}, {"loss": 1.1676, "grad_norm": 0.3100722134113312, "learning_rate": 0.0002, "epoch": 1.029023746701847, "step": 780}, {"loss": 1.1736, "grad_norm": 0.3911755383014679, "learning_rate": 0.0002, "epoch": 1.04221635883905, "step": 790}, {"loss": 1.0354, "grad_norm": 0.4600953757762909, "learning_rate": 0.0002, "epoch": 1.0554089709762533, "step": 800}, {"loss": 1.2869, "grad_norm": 0.28671619296073914, "learning_rate": 0.0002, "epoch": 1.0686015831134565, "step": 810}, {"loss": 1.1441, "grad_norm": 0.47282642126083374, "learning_rate": 0.0002, "epoch": 1.0817941952506596, "step": 820}, {"loss": 1.1368, "grad_norm": 0.690073549747467, "learning_rate": 0.0002, "epoch": 1.0949868073878628, "step": 830}, {"loss": 0.9944, "grad_norm": 0.7317902445793152, "learning_rate": 0.0002, "epoch": 1.108179419525066, "step": 840}, {"loss": 1.108, "grad_norm": 0.44215938448905945, "learning_rate": 0.0002, "epoch": 1.121372031662269, "step": 850}, {"loss": 1.0558, "grad_norm": 0.33875149488449097, "learning_rate": 0.0002, "epoch": 1.1345646437994723, "step": 860}, {"loss": 1.0471, "grad_norm": 0.3700002431869507, "learning_rate": 0.0002, "epoch": 1.1477572559366755, "step": 870}, {"loss": 1.0121, "grad_norm": 0.41173291206359863, "learning_rate": 0.0002, "epoch": 1.1609498680738786, "step": 880}, {"loss": 1.0312, "grad_norm": 0.5253589749336243, "learning_rate": 0.0002, "epoch": 1.1741424802110818, "step": 890}, {"loss": 1.2238, "grad_norm": 0.3912237286567688, "learning_rate": 0.0002, "epoch": 1.187335092348285, "step": 900}, {"loss": 0.9434, "grad_norm": 0.40990331768989563, "learning_rate": 0.0002, "epoch": 1.200527704485488, "step": 910}, {"loss": 1.0493, "grad_norm": 0.40377968549728394, "learning_rate": 0.0002, "epoch": 1.2137203166226913, "step": 920}, {"loss": 1.1496, "grad_norm": 0.4605846405029297, "learning_rate": 0.0002, "epoch": 1.2269129287598945, "step": 930}, {"loss": 1.1173, "grad_norm": 0.31564897298812866, "learning_rate": 0.0002, "epoch": 1.2401055408970976, "step": 940}, {"loss": 1.0547, "grad_norm": 0.39808550477027893, "learning_rate": 0.0002, "epoch": 1.2532981530343008, "step": 950}, {"loss": 1.1367, "grad_norm": 0.3762115240097046, "learning_rate": 0.0002, "epoch": 1.266490765171504, "step": 960}, {"loss": 1.1596, "grad_norm": 0.4174984097480774, "learning_rate": 0.0002, "epoch": 1.279683377308707, "step": 970}, {"loss": 1.1327, "grad_norm": 0.5263054966926575, "learning_rate": 0.0002, "epoch": 1.2928759894459103, "step": 980}, {"loss": 1.0339, "grad_norm": 0.41673699021339417, "learning_rate": 0.0002, "epoch": 1.3060686015831133, "step": 990}, {"loss": 1.1198, "grad_norm": 0.9613684415817261, "learning_rate": 0.0002, "epoch": 1.3192612137203166, "step": 1000}, {"loss": 1.0444, "grad_norm": 0.3690216839313507, "learning_rate": 0.0002, "epoch": 1.3324538258575198, "step": 1010}, {"loss": 1.0473, "grad_norm": 0.521821141242981, "learning_rate": 0.0002, "epoch": 1.345646437994723, "step": 1020}, {"loss": 1.1065, "grad_norm": 0.3353094160556793, "learning_rate": 0.0002, "epoch": 1.358839050131926, "step": 1030}, {"loss": 1.1286, "grad_norm": 0.3843843936920166, "learning_rate": 0.0002, "epoch": 1.3720316622691293, "step": 1040}, {"loss": 1.1369, "grad_norm": 0.372514545917511, "learning_rate": 0.0002, "epoch": 1.3852242744063323, "step": 1050}, {"loss": 1.0041, "grad_norm": 0.34537771344184875, "learning_rate": 0.0002, "epoch": 1.3984168865435356, "step": 1060}, {"loss": 1.0251, "grad_norm": 0.45349085330963135, "learning_rate": 0.0002, "epoch": 1.4116094986807388, "step": 1070}, {"loss": 1.1873, "grad_norm": 0.5120177268981934, "learning_rate": 0.0002, "epoch": 1.424802110817942, "step": 1080}, {"loss": 1.0872, "grad_norm": 0.42800238728523254, "learning_rate": 0.0002, "epoch": 1.437994722955145, "step": 1090}, {"loss": 1.0734, "grad_norm": 0.343832790851593, "learning_rate": 0.0002, "epoch": 1.4511873350923483, "step": 1100}, {"loss": 1.1286, "grad_norm": 0.3829841911792755, "learning_rate": 0.0002, "epoch": 1.4643799472295513, "step": 1110}, {"loss": 1.1268, "grad_norm": 0.4289931058883667, "learning_rate": 0.0002, "epoch": 1.4775725593667546, "step": 1120}, {"loss": 1.0676, "grad_norm": 0.42750850319862366, "learning_rate": 0.0002, "epoch": 1.4907651715039578, "step": 1130}, {"loss": 1.072, "grad_norm": 0.34328413009643555, "learning_rate": 0.0002, "epoch": 1.503957783641161, "step": 1140}, {"loss": 1.0863, "grad_norm": 0.349096417427063, "learning_rate": 0.0002, "epoch": 1.517150395778364, "step": 1150}, {"loss": 1.1462, "grad_norm": 0.7700717449188232, "learning_rate": 0.0002, "epoch": 1.5303430079155673, "step": 1160}, {"loss": 0.9528, "grad_norm": 0.39294949173927307, "learning_rate": 0.0002, "epoch": 1.5435356200527703, "step": 1170}, {"loss": 1.1603, "grad_norm": 0.36173608899116516, "learning_rate": 0.0002, "epoch": 1.5567282321899736, "step": 1180}, {"loss": 1.1508, "grad_norm": 0.6034277677536011, "learning_rate": 0.0002, "epoch": 1.5699208443271768, "step": 1190}, {"loss": 1.0105, "grad_norm": 0.36694103479385376, "learning_rate": 0.0002, "epoch": 1.58311345646438, "step": 1200}, {"loss": 1.1479, "grad_norm": 0.4727209508419037, "learning_rate": 0.0002, "epoch": 1.596306068601583, "step": 1210}, {"loss": 1.0689, "grad_norm": 0.6482883095741272, "learning_rate": 0.0002, "epoch": 1.6094986807387863, "step": 1220}, {"loss": 1.1405, "grad_norm": 0.5238035917282104, "learning_rate": 0.0002, "epoch": 1.6226912928759893, "step": 1230}, {"loss": 1.0596, "grad_norm": 0.4812222421169281, "learning_rate": 0.0002, "epoch": 1.6358839050131926, "step": 1240}, {"loss": 1.2729, "grad_norm": 0.7131702303886414, "learning_rate": 0.0002, "epoch": 1.6490765171503958, "step": 1250}, {"loss": 0.9832, "grad_norm": 0.3803327977657318, "learning_rate": 0.0002, "epoch": 1.662269129287599, "step": 1260}, {"loss": 1.1433, "grad_norm": 0.3745088577270508, "learning_rate": 0.0002, "epoch": 1.675461741424802, "step": 1270}, {"loss": 1.1018, "grad_norm": 0.4427378475666046, "learning_rate": 0.0002, "epoch": 1.6886543535620053, "step": 1280}, {"loss": 1.0619, "grad_norm": 0.797478973865509, "learning_rate": 0.0002, "epoch": 1.7018469656992083, "step": 1290}, {"loss": 1.0853, "grad_norm": 0.503620982170105, "learning_rate": 0.0002, "epoch": 1.7150395778364116, "step": 1300}, {"loss": 1.1324, "grad_norm": 0.4132426381111145, "learning_rate": 0.0002, "epoch": 1.7282321899736148, "step": 1310}, {"loss": 1.172, "grad_norm": 0.41811656951904297, "learning_rate": 0.0002, "epoch": 1.741424802110818, "step": 1320}, {"loss": 1.0903, "grad_norm": 0.40647849440574646, "learning_rate": 0.0002, "epoch": 1.754617414248021, "step": 1330}, {"loss": 1.1316, "grad_norm": 0.42138347029685974, "learning_rate": 0.0002, "epoch": 1.767810026385224, "step": 1340}, {"loss": 0.9955, "grad_norm": 0.46523579955101013, "learning_rate": 0.0002, "epoch": 1.7810026385224274, "step": 1350}, {"loss": 1.0307, "grad_norm": 0.39760419726371765, "learning_rate": 0.0002, "epoch": 1.7941952506596306, "step": 1360}, {"loss": 1.1218, "grad_norm": 0.37993717193603516, "learning_rate": 0.0002, "epoch": 1.8073878627968338, "step": 1370}, {"loss": 1.0921, "grad_norm": 0.5404181480407715, "learning_rate": 0.0002, "epoch": 1.820580474934037, "step": 1380}, {"loss": 1.0903, "grad_norm": 0.4385245740413666, "learning_rate": 0.0002, "epoch": 1.83377308707124, "step": 1390}, {"loss": 1.1175, "grad_norm": 0.529797375202179, "learning_rate": 0.0002, "epoch": 1.8469656992084431, "step": 1400}, {"loss": 1.0577, "grad_norm": 0.481567919254303, "learning_rate": 0.0002, "epoch": 1.8601583113456464, "step": 1410}, {"loss": 1.0581, "grad_norm": 0.34787362813949585, "learning_rate": 0.0002, "epoch": 1.8733509234828496, "step": 1420}, {"loss": 1.0536, "grad_norm": 0.6402362585067749, "learning_rate": 0.0002, "epoch": 1.8865435356200528, "step": 1430}, {"loss": 1.0787, "grad_norm": 0.3461322784423828, "learning_rate": 0.0002, "epoch": 1.899736147757256, "step": 1440}, {"loss": 1.0925, "grad_norm": 0.44005653262138367, "learning_rate": 0.0002, "epoch": 1.912928759894459, "step": 1450}, {"loss": 1.0414, "grad_norm": 0.4064280688762665, "learning_rate": 0.0002, "epoch": 1.9261213720316621, "step": 1460}, {"loss": 1.0608, "grad_norm": 0.5236523151397705, "learning_rate": 0.0002, "epoch": 1.9393139841688654, "step": 1470}, {"loss": 1.0572, "grad_norm": 0.41030219197273254, "learning_rate": 0.0002, "epoch": 1.9525065963060686, "step": 1480}, {"loss": 1.1204, "grad_norm": 0.39805835485458374, "learning_rate": 0.0002, "epoch": 1.9656992084432718, "step": 1490}, {"loss": 1.0364, "grad_norm": 0.42974501848220825, "learning_rate": 0.0002, "epoch": 1.978891820580475, "step": 1500}, {"loss": 1.1169, "grad_norm": 0.4688243865966797, "learning_rate": 0.0002, "epoch": 1.992084432717678, "step": 1510}, {"eval_loss": 1.1874967813491821, "eval_runtime": 71.9523, "eval_samples_per_second": 5.99, "eval_steps_per_second": 0.75, "epoch": 2.0, "step": 1516}, {"loss": 1.0401, "grad_norm": 0.4121631383895874, "learning_rate": 0.0002, "epoch": 2.005277044854881, "step": 1520}, {"loss": 0.9384, "grad_norm": 0.4844197928905487, "learning_rate": 0.0002, "epoch": 2.0184696569920844, "step": 1530}, {"loss": 0.8686, "grad_norm": 0.45408546924591064, "learning_rate": 0.0002, "epoch": 2.0316622691292876, "step": 1540}, {"loss": 1.0399, "grad_norm": 0.48662951588630676, "learning_rate": 0.0002, "epoch": 2.044854881266491, "step": 1550}, {"loss": 0.936, "grad_norm": 0.7195899486541748, "learning_rate": 0.0002, "epoch": 2.058047493403694, "step": 1560}, {"loss": 0.9486, "grad_norm": 0.5071077346801758, "learning_rate": 0.0002, "epoch": 2.0712401055408973, "step": 1570}, {"loss": 1.0055, "grad_norm": 0.7473958730697632, "learning_rate": 0.0002, "epoch": 2.0844327176781, "step": 1580}, {"loss": 0.8309, "grad_norm": 0.5509232878684998, "learning_rate": 0.0002, "epoch": 2.0976253298153034, "step": 1590}, {"loss": 0.9181, "grad_norm": 0.5108042359352112, "learning_rate": 0.0002, "epoch": 2.1108179419525066, "step": 1600}, {"loss": 0.9499, "grad_norm": 0.42331448197364807, "learning_rate": 0.0002, "epoch": 2.12401055408971, "step": 1610}, {"loss": 0.9359, "grad_norm": 0.46621623635292053, "learning_rate": 0.0002, "epoch": 2.137203166226913, "step": 1620}, {"loss": 0.9065, "grad_norm": 0.43802836537361145, "learning_rate": 0.0002, "epoch": 2.150395778364116, "step": 1630}, {"loss": 0.9375, "grad_norm": 0.49908021092414856, "learning_rate": 0.0002, "epoch": 2.163588390501319, "step": 1640}, {"loss": 1.0389, "grad_norm": 0.4195636808872223, "learning_rate": 0.0002, "epoch": 2.1767810026385224, "step": 1650}, {"loss": 0.8501, "grad_norm": 0.49515822529792786, "learning_rate": 0.0002, "epoch": 2.1899736147757256, "step": 1660}, {"loss": 1.0557, "grad_norm": 0.4607589542865753, "learning_rate": 0.0002, "epoch": 2.203166226912929, "step": 1670}, {"loss": 1.0206, "grad_norm": 0.4489196836948395, "learning_rate": 0.0002, "epoch": 2.216358839050132, "step": 1680}, {"loss": 0.9657, "grad_norm": 0.49300864338874817, "learning_rate": 0.0002, "epoch": 2.229551451187335, "step": 1690}, {"loss": 0.8371, "grad_norm": 0.6624954342842102, "learning_rate": 0.0002, "epoch": 2.242744063324538, "step": 1700}, {"loss": 0.8555, "grad_norm": 0.8391500115394592, "learning_rate": 0.0002, "epoch": 2.2559366754617414, "step": 1710}, {"loss": 1.0113, "grad_norm": 0.5193073749542236, "learning_rate": 0.0002, "epoch": 2.2691292875989446, "step": 1720}, {"loss": 0.9979, "grad_norm": 0.6180613040924072, "learning_rate": 0.0002, "epoch": 2.282321899736148, "step": 1730}, {"loss": 0.9579, "grad_norm": 0.591191291809082, "learning_rate": 0.0002, "epoch": 2.295514511873351, "step": 1740}, {"loss": 0.8879, "grad_norm": 0.546897828578949, "learning_rate": 0.0002, "epoch": 2.308707124010554, "step": 1750}, {"loss": 0.9321, "grad_norm": 0.5470401644706726, "learning_rate": 0.0002, "epoch": 2.321899736147757, "step": 1760}, {"loss": 0.9104, "grad_norm": 0.4590282738208771, "learning_rate": 0.0002, "epoch": 2.3350923482849604, "step": 1770}, {"loss": 1.0384, "grad_norm": 0.622164785861969, "learning_rate": 0.0002, "epoch": 2.3482849604221636, "step": 1780}, {"loss": 1.0297, "grad_norm": 0.5753812193870544, "learning_rate": 0.0002, "epoch": 2.361477572559367, "step": 1790}, {"loss": 0.9569, "grad_norm": 0.47958624362945557, "learning_rate": 0.0002, "epoch": 2.37467018469657, "step": 1800}, {"loss": 1.0519, "grad_norm": 0.48042672872543335, "learning_rate": 0.0002, "epoch": 2.387862796833773, "step": 1810}, {"loss": 0.9042, "grad_norm": 0.44586366415023804, "learning_rate": 0.0002, "epoch": 2.401055408970976, "step": 1820}, {"loss": 0.9783, "grad_norm": 0.7239416837692261, "learning_rate": 0.0002, "epoch": 2.4142480211081794, "step": 1830}, {"loss": 0.8818, "grad_norm": 0.5515341758728027, "learning_rate": 0.0002, "epoch": 2.4274406332453826, "step": 1840}, {"loss": 0.9503, "grad_norm": 0.6280064582824707, "learning_rate": 0.0002, "epoch": 2.440633245382586, "step": 1850}, {"loss": 0.8943, "grad_norm": 0.4832057058811188, "learning_rate": 0.0002, "epoch": 2.453825857519789, "step": 1860}, {"loss": 0.8744, "grad_norm": 0.5789321064949036, "learning_rate": 0.0002, "epoch": 2.467018469656992, "step": 1870}, {"loss": 0.9332, "grad_norm": 0.48491886258125305, "learning_rate": 0.0002, "epoch": 2.480211081794195, "step": 1880}, {"loss": 0.943, "grad_norm": 0.532365620136261, "learning_rate": 0.0002, "epoch": 2.4934036939313984, "step": 1890}, {"loss": 0.9734, "grad_norm": 0.7087852954864502, "learning_rate": 0.0002, "epoch": 2.5065963060686016, "step": 1900}, {"loss": 0.9767, "grad_norm": 0.48157402873039246, "learning_rate": 0.0002, "epoch": 2.519788918205805, "step": 1910}, {"loss": 0.9851, "grad_norm": 0.5886041522026062, "learning_rate": 0.0002, "epoch": 2.532981530343008, "step": 1920}, {"loss": 1.0144, "grad_norm": 0.6332622766494751, "learning_rate": 0.0002, "epoch": 2.5461741424802113, "step": 1930}, {"loss": 0.9516, "grad_norm": 0.5463117957115173, "learning_rate": 0.0002, "epoch": 2.559366754617414, "step": 1940}, {"loss": 0.9373, "grad_norm": 0.5432228446006775, "learning_rate": 0.0002, "epoch": 2.5725593667546174, "step": 1950}, {"loss": 0.8974, "grad_norm": 0.5929186940193176, "learning_rate": 0.0002, "epoch": 2.5857519788918206, "step": 1960}, {"loss": 1.0062, "grad_norm": 0.5120641589164734, "learning_rate": 0.0002, "epoch": 2.598944591029024, "step": 1970}, {"loss": 0.9143, "grad_norm": 0.5372339487075806, "learning_rate": 0.0002, "epoch": 2.6121372031662267, "step": 1980}, {"loss": 1.0786, "grad_norm": 0.5519838929176331, "learning_rate": 0.0002, "epoch": 2.62532981530343, "step": 1990}, {"loss": 1.021, "grad_norm": 0.7304037809371948, "learning_rate": 0.0002, "epoch": 2.638522427440633, "step": 2000}, {"loss": 0.8708, "grad_norm": 0.6182340979576111, "learning_rate": 0.0002, "epoch": 2.6517150395778364, "step": 2010}, {"loss": 1.0501, "grad_norm": 0.4874444305896759, "learning_rate": 0.0002, "epoch": 2.6649076517150396, "step": 2020}, {"loss": 0.9612, "grad_norm": 0.5850239396095276, "learning_rate": 0.0002, "epoch": 2.678100263852243, "step": 2030}, {"loss": 0.9808, "grad_norm": 0.6495311856269836, "learning_rate": 0.0002, "epoch": 2.691292875989446, "step": 2040}, {"loss": 1.0187, "grad_norm": 1.002830147743225, "learning_rate": 0.0002, "epoch": 2.7044854881266494, "step": 2050}, {"loss": 1.0386, "grad_norm": 0.49076753854751587, "learning_rate": 0.0002, "epoch": 2.717678100263852, "step": 2060}, {"loss": 0.9416, "grad_norm": 0.4736326336860657, "learning_rate": 0.0002, "epoch": 2.7308707124010554, "step": 2070}, {"loss": 0.8424, "grad_norm": 0.5527601838111877, "learning_rate": 0.0002, "epoch": 2.7440633245382586, "step": 2080}, {"loss": 0.9149, "grad_norm": 0.7295718193054199, "learning_rate": 0.0002, "epoch": 2.757255936675462, "step": 2090}, {"loss": 0.9032, "grad_norm": 0.5437536835670471, "learning_rate": 0.0002, "epoch": 2.7704485488126647, "step": 2100}, {"loss": 0.8254, "grad_norm": 0.5997128486633301, "learning_rate": 0.0002, "epoch": 2.783641160949868, "step": 2110}, {"loss": 0.976, "grad_norm": 0.6498191356658936, "learning_rate": 0.0002, "epoch": 2.796833773087071, "step": 2120}, {"loss": 0.9543, "grad_norm": 0.5237268805503845, "learning_rate": 0.0002, "epoch": 2.8100263852242744, "step": 2130}, {"loss": 0.9302, "grad_norm": 0.6033027172088623, "learning_rate": 0.0002, "epoch": 2.8232189973614776, "step": 2140}, {"loss": 0.9625, "grad_norm": 0.6077138781547546, "learning_rate": 0.0002, "epoch": 2.836411609498681, "step": 2150}, {"loss": 0.9347, "grad_norm": 0.4127797484397888, "learning_rate": 0.0002, "epoch": 2.849604221635884, "step": 2160}, {"loss": 1.0459, "grad_norm": 0.8448635339736938, "learning_rate": 0.0002, "epoch": 2.862796833773087, "step": 2170}, {"loss": 0.8185, "grad_norm": 0.5669729113578796, "learning_rate": 0.0002, "epoch": 2.87598944591029, "step": 2180}, {"loss": 0.8555, "grad_norm": 0.510231077671051, "learning_rate": 0.0002, "epoch": 2.8891820580474934, "step": 2190}, {"loss": 0.9267, "grad_norm": 0.8072245121002197, "learning_rate": 0.0002, "epoch": 2.9023746701846966, "step": 2200}, {"loss": 0.9685, "grad_norm": 0.6055923104286194, "learning_rate": 0.0002, "epoch": 2.9155672823219, "step": 2210}, {"loss": 0.9157, "grad_norm": 0.7384416460990906, "learning_rate": 0.0002, "epoch": 2.9287598944591027, "step": 2220}, {"loss": 0.9712, "grad_norm": 0.4922751784324646, "learning_rate": 0.0002, "epoch": 2.941952506596306, "step": 2230}, {"loss": 0.9487, "grad_norm": 0.6039906740188599, "learning_rate": 0.0002, "epoch": 2.955145118733509, "step": 2240}, {"loss": 0.9776, "grad_norm": 0.4751701354980469, "learning_rate": 0.0002, "epoch": 2.9683377308707124, "step": 2250}, {"loss": 1.0619, "grad_norm": 0.5698353052139282, "learning_rate": 0.0002, "epoch": 2.9815303430079156, "step": 2260}, {"loss": 1.1184, "grad_norm": 0.893563449382782, "learning_rate": 0.0002, "epoch": 2.994722955145119, "step": 2270}, {"eval_loss": 1.2046419382095337, "eval_runtime": 71.5992, "eval_samples_per_second": 6.02, "eval_steps_per_second": 0.754, "epoch": 3.0, "step": 2274}, {"loss": 0.8269, "grad_norm": 0.41119325160980225, "learning_rate": 0.0002, "epoch": 3.007915567282322, "step": 2280}, {"loss": 0.7856, "grad_norm": 0.8169420957565308, "learning_rate": 0.0002, "epoch": 3.021108179419525, "step": 2290}, {"loss": 0.794, "grad_norm": 0.6033818125724792, "learning_rate": 0.0002, "epoch": 3.034300791556728, "step": 2300}, {"loss": 0.7607, "grad_norm": 0.9600058197975159, "learning_rate": 0.0002, "epoch": 3.0474934036939314, "step": 2310}, {"loss": 0.8353, "grad_norm": 0.5859250426292419, "learning_rate": 0.0002, "epoch": 3.0606860158311346, "step": 2320}, {"loss": 0.7598, "grad_norm": 0.6758618950843811, "learning_rate": 0.0002, "epoch": 3.073878627968338, "step": 2330}, {"loss": 0.7631, "grad_norm": 0.8407140970230103, "learning_rate": 0.0002, "epoch": 3.0870712401055407, "step": 2340}, {"loss": 0.7664, "grad_norm": 0.767779529094696, "learning_rate": 0.0002, "epoch": 3.100263852242744, "step": 2350}, {"loss": 0.7121, "grad_norm": 0.5572896599769592, "learning_rate": 0.0002, "epoch": 3.113456464379947, "step": 2360}, {"loss": 0.7419, "grad_norm": 0.5908368825912476, "learning_rate": 0.0002, "epoch": 3.1266490765171504, "step": 2370}, {"loss": 0.8024, "grad_norm": 0.8047826290130615, "learning_rate": 0.0002, "epoch": 3.1398416886543536, "step": 2380}, {"loss": 0.8686, "grad_norm": 0.8041718006134033, "learning_rate": 0.0002, "epoch": 3.153034300791557, "step": 2390}, {"loss": 0.668, "grad_norm": 0.57078617811203, "learning_rate": 0.0002, "epoch": 3.16622691292876, "step": 2400}, {"loss": 0.7976, "grad_norm": 0.5125322937965393, "learning_rate": 0.0002, "epoch": 3.179419525065963, "step": 2410}, {"loss": 0.741, "grad_norm": 0.6356934309005737, "learning_rate": 0.0002, "epoch": 3.192612137203166, "step": 2420}, {"loss": 0.687, "grad_norm": 1.0129680633544922, "learning_rate": 0.0002, "epoch": 3.2058047493403694, "step": 2430}, {"loss": 0.8316, "grad_norm": 0.8104226589202881, "learning_rate": 0.0002, "epoch": 3.2189973614775726, "step": 2440}, {"loss": 0.8343, "grad_norm": 0.7276079058647156, "learning_rate": 0.0002, "epoch": 3.232189973614776, "step": 2450}, {"loss": 0.8183, "grad_norm": 0.9753884077072144, "learning_rate": 0.0002, "epoch": 3.2453825857519787, "step": 2460}, {"loss": 0.7776, "grad_norm": 0.9753183722496033, "learning_rate": 0.0002, "epoch": 3.258575197889182, "step": 2470}, {"loss": 0.8815, "grad_norm": 0.6791225075721741, "learning_rate": 0.0002, "epoch": 3.271767810026385, "step": 2480}, {"loss": 0.7548, "grad_norm": 0.6797150373458862, "learning_rate": 0.0002, "epoch": 3.2849604221635884, "step": 2490}, {"loss": 0.8395, "grad_norm": 0.8107194900512695, "learning_rate": 0.0002, "epoch": 3.2981530343007917, "step": 2500}, {"loss": 0.7869, "grad_norm": 0.5878375172615051, "learning_rate": 0.0002, "epoch": 3.311345646437995, "step": 2510}, {"loss": 0.7992, "grad_norm": 0.5882975459098816, "learning_rate": 0.0002, "epoch": 3.324538258575198, "step": 2520}, {"loss": 0.7472, "grad_norm": 0.6180013418197632, "learning_rate": 0.0002, "epoch": 3.337730870712401, "step": 2530}, {"loss": 0.8033, "grad_norm": 1.0008151531219482, "learning_rate": 0.0002, "epoch": 3.350923482849604, "step": 2540}, {"loss": 0.8464, "grad_norm": 0.6404656767845154, "learning_rate": 0.0002, "epoch": 3.3641160949868074, "step": 2550}, {"loss": 0.7533, "grad_norm": 0.8481354117393494, "learning_rate": 0.0002, "epoch": 3.3773087071240107, "step": 2560}, {"loss": 0.7852, "grad_norm": 0.8068035244941711, "learning_rate": 0.0002, "epoch": 3.390501319261214, "step": 2570}, {"loss": 0.8621, "grad_norm": 0.7477166056632996, "learning_rate": 0.0002, "epoch": 3.4036939313984167, "step": 2580}, {"loss": 0.8352, "grad_norm": 0.6202635765075684, "learning_rate": 0.0002, "epoch": 3.41688654353562, "step": 2590}, {"loss": 0.7572, "grad_norm": 0.6981159448623657, "learning_rate": 0.0002, "epoch": 3.430079155672823, "step": 2600}, {"loss": 0.7846, "grad_norm": 0.6611084342002869, "learning_rate": 0.0002, "epoch": 3.4432717678100264, "step": 2610}, {"loss": 0.7503, "grad_norm": 0.5727696418762207, "learning_rate": 0.0002, "epoch": 3.4564643799472297, "step": 2620}, {"loss": 0.8427, "grad_norm": 1.2354545593261719, "learning_rate": 0.0002, "epoch": 3.469656992084433, "step": 2630}, {"loss": 0.7747, "grad_norm": 0.6347638368606567, "learning_rate": 0.0002, "epoch": 3.4828496042216357, "step": 2640}, {"loss": 0.8426, "grad_norm": 0.6975704431533813, "learning_rate": 0.0002, "epoch": 3.496042216358839, "step": 2650}, {"loss": 0.8773, "grad_norm": 0.6569573879241943, "learning_rate": 0.0002, "epoch": 3.509234828496042, "step": 2660}, {"loss": 0.7908, "grad_norm": 0.6979609131813049, "learning_rate": 0.0002, "epoch": 3.5224274406332454, "step": 2670}, {"loss": 0.8254, "grad_norm": 0.6287988424301147, "learning_rate": 0.0002, "epoch": 3.5356200527704487, "step": 2680}, {"loss": 0.7815, "grad_norm": 0.8682637214660645, "learning_rate": 0.0002, "epoch": 3.5488126649076515, "step": 2690}, {"loss": 0.7566, "grad_norm": 0.7062831521034241, "learning_rate": 0.0002, "epoch": 3.5620052770448547, "step": 2700}, {"loss": 0.713, "grad_norm": 1.0061452388763428, "learning_rate": 0.0002, "epoch": 3.575197889182058, "step": 2710}, {"loss": 0.7738, "grad_norm": 0.719097375869751, "learning_rate": 0.0002, "epoch": 3.588390501319261, "step": 2720}, {"loss": 0.8145, "grad_norm": 0.7583496570587158, "learning_rate": 0.0002, "epoch": 3.6015831134564644, "step": 2730}, {"loss": 0.91, "grad_norm": 0.7543531060218811, "learning_rate": 0.0002, "epoch": 3.6147757255936677, "step": 2740}, {"loss": 0.8325, "grad_norm": 0.8873646855354309, "learning_rate": 0.0002, "epoch": 3.627968337730871, "step": 2750}, {"loss": 0.7116, "grad_norm": 1.0657562017440796, "learning_rate": 0.0002, "epoch": 3.641160949868074, "step": 2760}, {"loss": 0.8291, "grad_norm": 0.8641113638877869, "learning_rate": 0.0002, "epoch": 3.654353562005277, "step": 2770}, {"loss": 0.8302, "grad_norm": 0.6620645523071289, "learning_rate": 0.0002, "epoch": 3.66754617414248, "step": 2780}, {"loss": 0.8261, "grad_norm": 0.6919541954994202, "learning_rate": 0.0002, "epoch": 3.6807387862796834, "step": 2790}, {"loss": 0.8388, "grad_norm": 0.7305743098258972, "learning_rate": 0.0002, "epoch": 3.6939313984168867, "step": 2800}, {"loss": 0.8053, "grad_norm": 0.7464777827262878, "learning_rate": 0.0002, "epoch": 3.7071240105540895, "step": 2810}, {"loss": 0.8019, "grad_norm": 0.8067063093185425, "learning_rate": 0.0002, "epoch": 3.7203166226912927, "step": 2820}, {"loss": 0.8259, "grad_norm": 0.7789416313171387, "learning_rate": 0.0002, "epoch": 3.733509234828496, "step": 2830}, {"loss": 0.774, "grad_norm": 0.507529079914093, "learning_rate": 0.0002, "epoch": 3.746701846965699, "step": 2840}, {"loss": 0.832, "grad_norm": 0.6509260535240173, "learning_rate": 0.0002, "epoch": 3.7598944591029024, "step": 2850}, {"loss": 0.8257, "grad_norm": 0.9141367673873901, "learning_rate": 0.0002, "epoch": 3.7730870712401057, "step": 2860}, {"loss": 0.9436, "grad_norm": 0.7852635979652405, "learning_rate": 0.0002, "epoch": 3.786279683377309, "step": 2870}, {"loss": 0.8842, "grad_norm": 0.5340318083763123, "learning_rate": 0.0002, "epoch": 3.7994722955145117, "step": 2880}, {"loss": 0.7468, "grad_norm": 0.6246042847633362, "learning_rate": 0.0002, "epoch": 3.812664907651715, "step": 2890}, {"loss": 0.8184, "grad_norm": 0.7064066529273987, "learning_rate": 0.0002, "epoch": 3.825857519788918, "step": 2900}, {"loss": 0.8515, "grad_norm": 0.6144065856933594, "learning_rate": 0.0002, "epoch": 3.8390501319261214, "step": 2910}, {"loss": 0.7484, "grad_norm": 0.5268424153327942, "learning_rate": 0.0002, "epoch": 3.8522427440633247, "step": 2920}, {"loss": 0.7594, "grad_norm": 0.9508116841316223, "learning_rate": 0.0002, "epoch": 3.8654353562005275, "step": 2930}, {"loss": 0.8437, "grad_norm": 0.9133715629577637, "learning_rate": 0.0002, "epoch": 3.8786279683377307, "step": 2940}, {"loss": 0.8611, "grad_norm": 1.0144646167755127, "learning_rate": 0.0002, "epoch": 3.891820580474934, "step": 2950}, {"loss": 0.8043, "grad_norm": 0.6397877931594849, "learning_rate": 0.0002, "epoch": 3.905013192612137, "step": 2960}, {"loss": 0.8285, "grad_norm": 0.734835147857666, "learning_rate": 0.0002, "epoch": 3.9182058047493404, "step": 2970}, {"loss": 0.7831, "grad_norm": 0.784853994846344, "learning_rate": 0.0002, "epoch": 3.9313984168865437, "step": 2980}, {"loss": 0.8148, "grad_norm": 0.805831789970398, "learning_rate": 0.0002, "epoch": 3.944591029023747, "step": 2990}, {"loss": 0.8252, "grad_norm": 0.6299595236778259, "learning_rate": 0.0002, "epoch": 3.9577836411609497, "step": 3000}, {"loss": 0.8244, "grad_norm": 0.6264058351516724, "learning_rate": 0.0002, "epoch": 3.970976253298153, "step": 3010}, {"loss": 0.8185, "grad_norm": 0.6419739723205566, "learning_rate": 0.0002, "epoch": 3.984168865435356, "step": 3020}, {"loss": 0.8174, "grad_norm": 0.7737036943435669, "learning_rate": 0.0002, "epoch": 3.9973614775725594, "step": 3030}, {"eval_loss": 1.2454297542572021, "eval_runtime": 71.8558, "eval_samples_per_second": 5.998, "eval_steps_per_second": 0.752, "epoch": 4.0, "step": 3032}, {"loss": 0.6716, "grad_norm": 1.092727541923523, "learning_rate": 0.0002, "epoch": 4.010554089709762, "step": 3040}, {"loss": 0.596, "grad_norm": 0.8087759613990784, "learning_rate": 0.0002, "epoch": 4.0237467018469655, "step": 3050}, {"loss": 0.7055, "grad_norm": 0.8106053471565247, "learning_rate": 0.0002, "epoch": 4.036939313984169, "step": 3060}, {"loss": 0.6846, "grad_norm": 0.8675326704978943, "learning_rate": 0.0002, "epoch": 4.050131926121372, "step": 3070}, {"loss": 0.6064, "grad_norm": 0.9620490074157715, "learning_rate": 0.0002, "epoch": 4.063324538258575, "step": 3080}, {"loss": 0.6047, "grad_norm": 0.8996296525001526, "learning_rate": 0.0002, "epoch": 4.076517150395778, "step": 3090}, {"loss": 0.6111, "grad_norm": 0.8648998737335205, "learning_rate": 0.0002, "epoch": 4.089709762532982, "step": 3100}, {"loss": 0.5853, "grad_norm": 1.0321335792541504, "learning_rate": 0.0002, "epoch": 4.102902374670185, "step": 3110}, {"loss": 0.6161, "grad_norm": 0.7949225306510925, "learning_rate": 0.0002, "epoch": 4.116094986807388, "step": 3120}, {"loss": 0.6354, "grad_norm": 0.9684646129608154, "learning_rate": 0.0002, "epoch": 4.129287598944591, "step": 3130}, {"loss": 0.6198, "grad_norm": 0.8698066473007202, "learning_rate": 0.0002, "epoch": 4.142480211081795, "step": 3140}, {"loss": 0.7185, "grad_norm": 0.7688450813293457, "learning_rate": 0.0002, "epoch": 4.155672823218997, "step": 3150}, {"loss": 0.6053, "grad_norm": 0.9682092070579529, "learning_rate": 0.0002, "epoch": 4.1688654353562, "step": 3160}, {"loss": 0.6827, "grad_norm": 0.961561918258667, "learning_rate": 0.0002, "epoch": 4.1820580474934035, "step": 3170}, {"loss": 0.6403, "grad_norm": 1.3962990045547485, "learning_rate": 0.0002, "epoch": 4.195250659630607, "step": 3180}, {"loss": 0.6319, "grad_norm": 0.9485045075416565, "learning_rate": 0.0002, "epoch": 4.20844327176781, "step": 3190}, {"loss": 0.5908, "grad_norm": 0.7768281698226929, "learning_rate": 0.0002, "epoch": 4.221635883905013, "step": 3200}, {"loss": 0.6365, "grad_norm": 1.2685691118240356, "learning_rate": 0.0002, "epoch": 4.2348284960422165, "step": 3210}, {"loss": 0.6601, "grad_norm": 0.6876471638679504, "learning_rate": 0.0002, "epoch": 4.24802110817942, "step": 3220}, {"loss": 0.6274, "grad_norm": 1.0074554681777954, "learning_rate": 0.0002, "epoch": 4.261213720316623, "step": 3230}, {"loss": 0.6027, "grad_norm": 0.8094777464866638, "learning_rate": 0.0002, "epoch": 4.274406332453826, "step": 3240}, {"loss": 0.643, "grad_norm": 0.7906569242477417, "learning_rate": 0.0002, "epoch": 4.287598944591029, "step": 3250}, {"loss": 0.5909, "grad_norm": 0.840238630771637, "learning_rate": 0.0002, "epoch": 4.300791556728232, "step": 3260}, {"loss": 0.5943, "grad_norm": 1.0119295120239258, "learning_rate": 0.0002, "epoch": 4.313984168865435, "step": 3270}, {"loss": 0.5912, "grad_norm": 0.7943191528320312, "learning_rate": 0.0002, "epoch": 4.327176781002638, "step": 3280}, {"loss": 0.6235, "grad_norm": 0.7691723704338074, "learning_rate": 0.0002, "epoch": 4.3403693931398415, "step": 3290}, {"loss": 0.6173, "grad_norm": 0.7227770686149597, "learning_rate": 0.0002, "epoch": 4.353562005277045, "step": 3300}, {"loss": 0.6047, "grad_norm": 0.8512253165245056, "learning_rate": 0.0002, "epoch": 4.366754617414248, "step": 3310}, {"loss": 0.5849, "grad_norm": 0.7852529287338257, "learning_rate": 0.0002, "epoch": 4.379947229551451, "step": 3320}, {"loss": 0.6416, "grad_norm": 0.8888797163963318, "learning_rate": 0.0002, "epoch": 4.3931398416886545, "step": 3330}, {"loss": 0.6804, "grad_norm": 0.9522430896759033, "learning_rate": 0.0002, "epoch": 4.406332453825858, "step": 3340}, {"loss": 0.6345, "grad_norm": 0.900276780128479, "learning_rate": 0.0002, "epoch": 4.419525065963061, "step": 3350}, {"loss": 0.7055, "grad_norm": 1.181547999382019, "learning_rate": 0.0002, "epoch": 4.432717678100264, "step": 3360}, {"loss": 0.7073, "grad_norm": 0.903142511844635, "learning_rate": 0.0002, "epoch": 4.445910290237467, "step": 3370}, {"loss": 0.7235, "grad_norm": 0.8747565150260925, "learning_rate": 0.0002, "epoch": 4.45910290237467, "step": 3380}, {"loss": 0.7071, "grad_norm": 0.7838051319122314, "learning_rate": 0.0002, "epoch": 4.472295514511873, "step": 3390}, {"loss": 0.5932, "grad_norm": 0.8691313862800598, "learning_rate": 0.0002, "epoch": 4.485488126649076, "step": 3400}, {"loss": 0.7019, "grad_norm": 0.8493868708610535, "learning_rate": 0.0002, "epoch": 4.4986807387862795, "step": 3410}, {"loss": 0.5959, "grad_norm": 1.0104830265045166, "learning_rate": 0.0002, "epoch": 4.511873350923483, "step": 3420}, {"loss": 0.6662, "grad_norm": 1.1716967821121216, "learning_rate": 0.0002, "epoch": 4.525065963060686, "step": 3430}, {"loss": 0.6411, "grad_norm": 0.9122593998908997, "learning_rate": 0.0002, "epoch": 4.538258575197889, "step": 3440}, {"loss": 0.7047, "grad_norm": 0.829090416431427, "learning_rate": 0.0002, "epoch": 4.5514511873350925, "step": 3450}, {"loss": 0.6001, "grad_norm": 1.141662836074829, "learning_rate": 0.0002, "epoch": 4.564643799472296, "step": 3460}, {"loss": 0.6612, "grad_norm": 0.8423182368278503, "learning_rate": 0.0002, "epoch": 4.577836411609499, "step": 3470}, {"loss": 0.6797, "grad_norm": 0.8024184703826904, "learning_rate": 0.0002, "epoch": 4.591029023746702, "step": 3480}, {"loss": 0.7184, "grad_norm": 0.7703381776809692, "learning_rate": 0.0002, "epoch": 4.6042216358839045, "step": 3490}, {"loss": 0.7001, "grad_norm": 0.9883959293365479, "learning_rate": 0.0002, "epoch": 4.617414248021108, "step": 3500}, {"loss": 0.6188, "grad_norm": 0.9554709196090698, "learning_rate": 0.0002, "epoch": 4.630606860158311, "step": 3510}, {"loss": 0.7378, "grad_norm": 1.9949709177017212, "learning_rate": 0.0002, "epoch": 4.643799472295514, "step": 3520}, {"loss": 0.6678, "grad_norm": 0.7762255072593689, "learning_rate": 0.0002, "epoch": 4.6569920844327175, "step": 3530}, {"loss": 0.6298, "grad_norm": 0.9538425803184509, "learning_rate": 0.0002, "epoch": 4.670184696569921, "step": 3540}, {"loss": 0.6352, "grad_norm": 1.0279661417007446, "learning_rate": 0.0002, "epoch": 4.683377308707124, "step": 3550}, {"loss": 0.6641, "grad_norm": 0.7545472979545593, "learning_rate": 0.0002, "epoch": 4.696569920844327, "step": 3560}, {"loss": 0.6887, "grad_norm": 0.8919376730918884, "learning_rate": 0.0002, "epoch": 4.7097625329815305, "step": 3570}, {"loss": 0.6395, "grad_norm": 0.7621569633483887, "learning_rate": 0.0002, "epoch": 4.722955145118734, "step": 3580}, {"loss": 0.6928, "grad_norm": 1.205320119857788, "learning_rate": 0.0002, "epoch": 4.736147757255937, "step": 3590}, {"loss": 0.6612, "grad_norm": 1.0642725229263306, "learning_rate": 0.0002, "epoch": 4.74934036939314, "step": 3600}, {"loss": 0.6541, "grad_norm": 0.9402666687965393, "learning_rate": 0.0002, "epoch": 4.762532981530343, "step": 3610}, {"loss": 0.6395, "grad_norm": 1.254127025604248, "learning_rate": 0.0002, "epoch": 4.775725593667546, "step": 3620}, {"loss": 0.692, "grad_norm": 0.7609598636627197, "learning_rate": 0.0002, "epoch": 4.788918205804749, "step": 3630}, {"loss": 0.6578, "grad_norm": 0.8240329623222351, "learning_rate": 0.0002, "epoch": 4.802110817941952, "step": 3640}, {"loss": 0.7383, "grad_norm": 0.8356260657310486, "learning_rate": 0.0002, "epoch": 4.8153034300791555, "step": 3650}, {"loss": 0.6368, "grad_norm": 0.9130708575248718, "learning_rate": 0.0002, "epoch": 4.828496042216359, "step": 3660}, {"loss": 0.7269, "grad_norm": 0.9384765028953552, "learning_rate": 0.0002, "epoch": 4.841688654353562, "step": 3670}, {"loss": 0.6509, "grad_norm": 0.9829966425895691, "learning_rate": 0.0002, "epoch": 4.854881266490765, "step": 3680}, {"loss": 0.6311, "grad_norm": 1.0488632917404175, "learning_rate": 0.0002, "epoch": 4.8680738786279685, "step": 3690}, {"loss": 0.7005, "grad_norm": 1.2278969287872314, "learning_rate": 0.0002, "epoch": 4.881266490765172, "step": 3700}, {"loss": 0.6869, "grad_norm": 0.8078970313072205, "learning_rate": 0.0002, "epoch": 4.894459102902375, "step": 3710}, {"loss": 0.6588, "grad_norm": 0.8081700205802917, "learning_rate": 0.0002, "epoch": 4.907651715039578, "step": 3720}, {"loss": 0.7189, "grad_norm": 0.9204511046409607, "learning_rate": 0.0002, "epoch": 4.9208443271767806, "step": 3730}, {"loss": 0.6953, "grad_norm": 0.9326391220092773, "learning_rate": 0.0002, "epoch": 4.934036939313984, "step": 3740}, {"loss": 0.68, "grad_norm": 1.0089969635009766, "learning_rate": 0.0002, "epoch": 4.947229551451187, "step": 3750}, {"loss": 0.7031, "grad_norm": 0.7063466906547546, "learning_rate": 0.0002, "epoch": 4.96042216358839, "step": 3760}, {"loss": 0.6568, "grad_norm": 1.2603905200958252, "learning_rate": 0.0002, "epoch": 4.9736147757255935, "step": 3770}, {"loss": 0.7134, "grad_norm": 0.8418653607368469, "learning_rate": 0.0002, "epoch": 4.986807387862797, "step": 3780}, {"loss": 0.6683, "grad_norm": 0.9537181854248047, "learning_rate": 0.0002, "epoch": 5.0, "step": 3790}, {"eval_loss": 1.3319307565689087, "eval_runtime": 71.7836, "eval_samples_per_second": 6.004, "eval_steps_per_second": 0.752, "epoch": 5.0, "step": 3790}, {"loss": 0.489, "grad_norm": 0.8595899343490601, "learning_rate": 0.0002, "epoch": 5.013192612137203, "step": 3800}, {"loss": 0.5155, "grad_norm": 1.0023565292358398, "learning_rate": 0.0002, "epoch": 5.0263852242744065, "step": 3810}, {"loss": 0.5321, "grad_norm": 1.2770460844039917, "learning_rate": 0.0002, "epoch": 5.03957783641161, "step": 3820}, {"loss": 0.5127, "grad_norm": 1.1701956987380981, "learning_rate": 0.0002, "epoch": 5.052770448548813, "step": 3830}, {"loss": 0.5057, "grad_norm": 0.812269926071167, "learning_rate": 0.0002, "epoch": 5.065963060686016, "step": 3840}, {"loss": 0.4292, "grad_norm": 0.8186697363853455, "learning_rate": 0.0002, "epoch": 5.0791556728232194, "step": 3850}, {"loss": 0.4865, "grad_norm": 1.052565097808838, "learning_rate": 0.0002, "epoch": 5.092348284960422, "step": 3860}, {"loss": 0.4947, "grad_norm": 0.9764705300331116, "learning_rate": 0.0002, "epoch": 5.105540897097625, "step": 3870}, {"loss": 0.471, "grad_norm": 0.6973426938056946, "learning_rate": 0.0002, "epoch": 5.118733509234828, "step": 3880}, {"loss": 0.5565, "grad_norm": 1.2127928733825684, "learning_rate": 0.0002, "epoch": 5.1319261213720315, "step": 3890}, {"loss": 0.4122, "grad_norm": 0.682807981967926, "learning_rate": 0.0002, "epoch": 5.145118733509235, "step": 3900}, {"loss": 0.6378, "grad_norm": 1.3575998544692993, "learning_rate": 0.0002, "epoch": 5.158311345646438, "step": 3910}, {"loss": 0.4624, "grad_norm": 1.2581931352615356, "learning_rate": 0.0002, "epoch": 5.171503957783641, "step": 3920}, {"loss": 0.5092, "grad_norm": 1.0493637323379517, "learning_rate": 0.0002, "epoch": 5.1846965699208445, "step": 3930}, {"loss": 0.4563, "grad_norm": 1.3519670963287354, "learning_rate": 0.0002, "epoch": 5.197889182058048, "step": 3940}, {"loss": 0.5414, "grad_norm": 1.0690566301345825, "learning_rate": 0.0002, "epoch": 5.211081794195251, "step": 3950}, {"loss": 0.5038, "grad_norm": 1.1171330213546753, "learning_rate": 0.0002, "epoch": 5.224274406332454, "step": 3960}, {"loss": 0.4397, "grad_norm": 1.055851697921753, "learning_rate": 0.0002, "epoch": 5.237467018469657, "step": 3970}, {"loss": 0.4964, "grad_norm": 0.8870180249214172, "learning_rate": 0.0002, "epoch": 5.25065963060686, "step": 3980}, {"loss": 0.5353, "grad_norm": 0.9688402414321899, "learning_rate": 0.0002, "epoch": 5.263852242744063, "step": 3990}, {"loss": 0.5192, "grad_norm": 0.8458422422409058, "learning_rate": 0.0002, "epoch": 5.277044854881266, "step": 4000}, {"loss": 0.5458, "grad_norm": 0.908256471157074, "learning_rate": 0.0002, "epoch": 5.2902374670184695, "step": 4010}, {"loss": 0.5102, "grad_norm": 1.0058149099349976, "learning_rate": 0.0002, "epoch": 5.303430079155673, "step": 4020}, {"loss": 0.5322, "grad_norm": 1.20364511013031, "learning_rate": 0.0002, "epoch": 5.316622691292876, "step": 4030}, {"loss": 0.5715, "grad_norm": 1.0135732889175415, "learning_rate": 0.0002, "epoch": 5.329815303430079, "step": 4040}, {"loss": 0.4736, "grad_norm": 1.1094907522201538, "learning_rate": 0.0002, "epoch": 5.3430079155672825, "step": 4050}, {"loss": 0.4912, "grad_norm": 1.0373083353042603, "learning_rate": 0.0002, "epoch": 5.356200527704486, "step": 4060}, {"loss": 0.5258, "grad_norm": 1.0952966213226318, "learning_rate": 0.0002, "epoch": 5.369393139841689, "step": 4070}, {"loss": 0.4892, "grad_norm": 1.1734952926635742, "learning_rate": 0.0002, "epoch": 5.382585751978892, "step": 4080}, {"loss": 0.4463, "grad_norm": 0.8217245936393738, "learning_rate": 0.0002, "epoch": 5.395778364116095, "step": 4090}, {"loss": 0.5271, "grad_norm": 1.0936307907104492, "learning_rate": 0.0002, "epoch": 5.408970976253298, "step": 4100}, {"loss": 0.509, "grad_norm": 1.0198720693588257, "learning_rate": 0.0002, "epoch": 5.422163588390501, "step": 4110}, {"loss": 0.5265, "grad_norm": 1.1105809211730957, "learning_rate": 0.0002, "epoch": 5.435356200527704, "step": 4120}, {"loss": 0.4871, "grad_norm": 1.1817213296890259, "learning_rate": 0.0002, "epoch": 5.4485488126649075, "step": 4130}, {"loss": 0.4987, "grad_norm": 1.126339077949524, "learning_rate": 0.0002, "epoch": 5.461741424802111, "step": 4140}, {"loss": 0.5743, "grad_norm": 0.9467914700508118, "learning_rate": 0.0002, "epoch": 5.474934036939314, "step": 4150}, {"loss": 0.5386, "grad_norm": 1.0335774421691895, "learning_rate": 0.0002, "epoch": 5.488126649076517, "step": 4160}, {"loss": 0.5122, "grad_norm": 0.866211473941803, "learning_rate": 0.0002, "epoch": 5.5013192612137205, "step": 4170}, {"loss": 0.5697, "grad_norm": 0.7422948479652405, "learning_rate": 0.0002, "epoch": 5.514511873350924, "step": 4180}, {"loss": 0.586, "grad_norm": 1.2211135625839233, "learning_rate": 0.0002, "epoch": 5.527704485488127, "step": 4190}, {"loss": 0.5476, "grad_norm": 1.0371766090393066, "learning_rate": 0.0002, "epoch": 5.540897097625329, "step": 4200}, {"loss": 0.5941, "grad_norm": 0.9460630416870117, "learning_rate": 0.0002, "epoch": 5.554089709762533, "step": 4210}, {"loss": 0.4645, "grad_norm": 0.7972197532653809, "learning_rate": 0.0002, "epoch": 5.567282321899736, "step": 4220}, {"loss": 0.5087, "grad_norm": 1.0654675960540771, "learning_rate": 0.0002, "epoch": 5.580474934036939, "step": 4230}, {"loss": 0.5957, "grad_norm": 1.0776735544204712, "learning_rate": 0.0002, "epoch": 5.593667546174142, "step": 4240}, {"loss": 0.53, "grad_norm": 1.498723030090332, "learning_rate": 0.0002, "epoch": 5.6068601583113455, "step": 4250}, {"loss": 0.4788, "grad_norm": 1.006768822669983, "learning_rate": 0.0002, "epoch": 5.620052770448549, "step": 4260}, {"loss": 0.5571, "grad_norm": 0.9194242358207703, "learning_rate": 0.0002, "epoch": 5.633245382585752, "step": 4270}, {"loss": 0.5722, "grad_norm": 1.1028380393981934, "learning_rate": 0.0002, "epoch": 5.646437994722955, "step": 4280}, {"loss": 0.5319, "grad_norm": 0.9972755312919617, "learning_rate": 0.0002, "epoch": 5.6596306068601585, "step": 4290}, {"loss": 0.53, "grad_norm": 1.0509438514709473, "learning_rate": 0.0002, "epoch": 5.672823218997362, "step": 4300}, {"loss": 0.4738, "grad_norm": 1.064039945602417, "learning_rate": 0.0002, "epoch": 5.686015831134565, "step": 4310}, {"loss": 0.5401, "grad_norm": 0.9572229981422424, "learning_rate": 0.0002, "epoch": 5.699208443271768, "step": 4320}, {"loss": 0.5173, "grad_norm": 0.9956564903259277, "learning_rate": 0.0002, "epoch": 5.7124010554089715, "step": 4330}, {"loss": 0.6008, "grad_norm": 1.01974618434906, "learning_rate": 0.0002, "epoch": 5.725593667546174, "step": 4340}, {"loss": 0.5111, "grad_norm": 1.101328730583191, "learning_rate": 0.0002, "epoch": 5.738786279683377, "step": 4350}, {"loss": 0.5921, "grad_norm": 0.9971756935119629, "learning_rate": 0.0002, "epoch": 5.75197889182058, "step": 4360}, {"loss": 0.5262, "grad_norm": 0.8579474687576294, "learning_rate": 0.0002, "epoch": 5.7651715039577835, "step": 4370}, {"loss": 0.5106, "grad_norm": 0.9927367568016052, "learning_rate": 0.0002, "epoch": 5.778364116094987, "step": 4380}, {"loss": 0.5354, "grad_norm": 1.1183884143829346, "learning_rate": 0.0002, "epoch": 5.79155672823219, "step": 4390}, {"loss": 0.5658, "grad_norm": 0.7695905566215515, "learning_rate": 0.0002, "epoch": 5.804749340369393, "step": 4400}, {"loss": 0.5137, "grad_norm": 1.1102122068405151, "learning_rate": 0.0002, "epoch": 5.8179419525065965, "step": 4410}, {"loss": 0.5634, "grad_norm": 1.3201336860656738, "learning_rate": 0.0002, "epoch": 5.8311345646438, "step": 4420}, {"loss": 0.5773, "grad_norm": 1.1934558153152466, "learning_rate": 0.0002, "epoch": 5.844327176781003, "step": 4430}, {"loss": 0.6338, "grad_norm": 1.390870451927185, "learning_rate": 0.0002, "epoch": 5.857519788918205, "step": 4440}, {"loss": 0.5625, "grad_norm": 1.056314468383789, "learning_rate": 0.0002, "epoch": 5.870712401055409, "step": 4450}, {"loss": 0.6456, "grad_norm": 0.9797437191009521, "learning_rate": 0.0002, "epoch": 5.883905013192612, "step": 4460}, {"loss": 0.5479, "grad_norm": 1.2368146181106567, "learning_rate": 0.0002, "epoch": 5.897097625329815, "step": 4470}, {"loss": 0.5453, "grad_norm": 0.9062654376029968, "learning_rate": 0.0002, "epoch": 5.910290237467018, "step": 4480}, {"loss": 0.5857, "grad_norm": 1.8643536567687988, "learning_rate": 0.0002, "epoch": 5.923482849604222, "step": 4490}, {"loss": 0.5858, "grad_norm": 1.2977997064590454, "learning_rate": 0.0002, "epoch": 5.936675461741425, "step": 4500}, {"loss": 0.4815, "grad_norm": 0.8366201519966125, "learning_rate": 0.0002, "epoch": 5.949868073878628, "step": 4510}, {"loss": 0.5126, "grad_norm": 1.0210131406784058, "learning_rate": 0.0002, "epoch": 5.963060686015831, "step": 4520}, {"loss": 0.5577, "grad_norm": 1.1287827491760254, "learning_rate": 0.0002, "epoch": 5.9762532981530345, "step": 4530}, {"loss": 0.5053, "grad_norm": 1.0480493307113647, "learning_rate": 0.0002, "epoch": 5.989445910290238, "step": 4540}, {"eval_loss": 1.450880765914917, "eval_runtime": 71.8135, "eval_samples_per_second": 6.002, "eval_steps_per_second": 0.752, "epoch": 6.0, "step": 4548}, {"loss": 0.5072, "grad_norm": 0.8589069247245789, "learning_rate": 0.0002, "epoch": 6.002638522427441, "step": 4550}, {"loss": 0.4129, "grad_norm": 1.467134714126587, "learning_rate": 0.0002, "epoch": 6.015831134564644, "step": 4560}, {"loss": 0.3739, "grad_norm": 1.1477625370025635, "learning_rate": 0.0002, "epoch": 6.029023746701847, "step": 4570}, {"loss": 0.3958, "grad_norm": 1.4254094362258911, "learning_rate": 0.0002, "epoch": 6.04221635883905, "step": 4580}, {"loss": 0.356, "grad_norm": 1.3656290769577026, "learning_rate": 0.0002, "epoch": 6.055408970976253, "step": 4590}, {"loss": 0.3626, "grad_norm": 0.9638674855232239, "learning_rate": 0.0002, "epoch": 6.068601583113456, "step": 4600}, {"loss": 0.3884, "grad_norm": 1.2654615640640259, "learning_rate": 0.0002, "epoch": 6.08179419525066, "step": 4610}, {"loss": 0.4659, "grad_norm": 1.4506969451904297, "learning_rate": 0.0002, "epoch": 6.094986807387863, "step": 4620}, {"loss": 0.3096, "grad_norm": 1.6596732139587402, "learning_rate": 0.0002, "epoch": 6.108179419525066, "step": 4630}, {"loss": 0.4005, "grad_norm": 1.5335280895233154, "learning_rate": 0.0002, "epoch": 6.121372031662269, "step": 4640}, {"loss": 0.3999, "grad_norm": 1.0815565586090088, "learning_rate": 0.0002, "epoch": 6.1345646437994725, "step": 4650}, {"loss": 0.4026, "grad_norm": 0.9995638132095337, "learning_rate": 0.0002, "epoch": 6.147757255936676, "step": 4660}, {"loss": 0.3548, "grad_norm": 0.8809106349945068, "learning_rate": 0.0002, "epoch": 6.160949868073879, "step": 4670}, {"loss": 0.4505, "grad_norm": 1.2946726083755493, "learning_rate": 0.0002, "epoch": 6.174142480211081, "step": 4680}, {"loss": 0.4447, "grad_norm": 1.311298131942749, "learning_rate": 0.0002, "epoch": 6.187335092348285, "step": 4690}, {"loss": 0.4108, "grad_norm": 1.229204535484314, "learning_rate": 0.0002, "epoch": 6.200527704485488, "step": 4700}, {"loss": 0.3764, "grad_norm": 1.0193822383880615, "learning_rate": 0.0002, "epoch": 6.213720316622691, "step": 4710}, {"loss": 0.3696, "grad_norm": 1.4438618421554565, "learning_rate": 0.0002, "epoch": 6.226912928759894, "step": 4720}, {"loss": 0.3979, "grad_norm": 1.4315637350082397, "learning_rate": 0.0002, "epoch": 6.240105540897098, "step": 4730}, {"loss": 0.4124, "grad_norm": 1.1291239261627197, "learning_rate": 0.0002, "epoch": 6.253298153034301, "step": 4740}, {"loss": 0.4337, "grad_norm": 0.9358022809028625, "learning_rate": 0.0002, "epoch": 6.266490765171504, "step": 4750}, {"loss": 0.3758, "grad_norm": 1.1260714530944824, "learning_rate": 0.0002, "epoch": 6.279683377308707, "step": 4760}, {"loss": 0.4262, "grad_norm": 1.5400320291519165, "learning_rate": 0.0002, "epoch": 6.2928759894459105, "step": 4770}, {"loss": 0.4105, "grad_norm": 1.6820714473724365, "learning_rate": 0.0002, "epoch": 6.306068601583114, "step": 4780}, {"loss": 0.4192, "grad_norm": 1.1937718391418457, "learning_rate": 0.0002, "epoch": 6.319261213720317, "step": 4790}, {"loss": 0.4519, "grad_norm": 1.4330145120620728, "learning_rate": 0.0002, "epoch": 6.33245382585752, "step": 4800}, {"loss": 0.4173, "grad_norm": 1.083373785018921, "learning_rate": 0.0002, "epoch": 6.345646437994723, "step": 4810}, {"loss": 0.4054, "grad_norm": 1.3013869524002075, "learning_rate": 0.0002, "epoch": 6.358839050131926, "step": 4820}, {"loss": 0.4177, "grad_norm": 1.1075547933578491, "learning_rate": 0.0002, "epoch": 6.372031662269129, "step": 4830}, {"loss": 0.3846, "grad_norm": 1.0480214357376099, "learning_rate": 0.0002, "epoch": 6.385224274406332, "step": 4840}, {"loss": 0.3924, "grad_norm": 1.3625658750534058, "learning_rate": 0.0002, "epoch": 6.398416886543536, "step": 4850}, {"loss": 0.3964, "grad_norm": 1.16606605052948, "learning_rate": 0.0002, "epoch": 6.411609498680739, "step": 4860}, {"loss": 0.4845, "grad_norm": 1.2435568571090698, "learning_rate": 0.0002, "epoch": 6.424802110817942, "step": 4870}, {"loss": 0.3847, "grad_norm": 1.4471954107284546, "learning_rate": 0.0002, "epoch": 6.437994722955145, "step": 4880}, {"loss": 0.443, "grad_norm": 1.2302275896072388, "learning_rate": 0.0002, "epoch": 6.4511873350923485, "step": 4890}, {"loss": 0.4458, "grad_norm": 1.2392226457595825, "learning_rate": 0.0002, "epoch": 6.464379947229552, "step": 4900}, {"loss": 0.4114, "grad_norm": 1.0497277975082397, "learning_rate": 0.0002, "epoch": 6.477572559366755, "step": 4910}, {"loss": 0.426, "grad_norm": 1.3509557247161865, "learning_rate": 0.0002, "epoch": 6.490765171503957, "step": 4920}, {"loss": 0.4089, "grad_norm": 1.340214729309082, "learning_rate": 0.0002, "epoch": 6.503957783641161, "step": 4930}, {"loss": 0.4655, "grad_norm": 1.283220648765564, "learning_rate": 0.0002, "epoch": 6.517150395778364, "step": 4940}, {"loss": 0.4205, "grad_norm": 1.0693278312683105, "learning_rate": 0.0002, "epoch": 6.530343007915567, "step": 4950}, {"loss": 0.398, "grad_norm": 1.307997226715088, "learning_rate": 0.0002, "epoch": 6.54353562005277, "step": 4960}, {"loss": 0.3844, "grad_norm": 1.1739027500152588, "learning_rate": 0.0002, "epoch": 6.556728232189974, "step": 4970}, {"loss": 0.4494, "grad_norm": 1.5694327354431152, "learning_rate": 0.0002, "epoch": 6.569920844327177, "step": 4980}, {"loss": 0.4535, "grad_norm": 0.9978346824645996, "learning_rate": 0.0002, "epoch": 6.58311345646438, "step": 4990}, {"loss": 0.4755, "grad_norm": 1.183057427406311, "learning_rate": 0.0002, "epoch": 6.596306068601583, "step": 5000}, {"loss": 0.4688, "grad_norm": 1.1033718585968018, "learning_rate": 0.0002, "epoch": 6.6094986807387865, "step": 5010}, {"loss": 0.4233, "grad_norm": 1.0699188709259033, "learning_rate": 0.0002, "epoch": 6.62269129287599, "step": 5020}, {"loss": 0.4049, "grad_norm": 1.491031289100647, "learning_rate": 0.0002, "epoch": 6.635883905013193, "step": 5030}, {"loss": 0.4257, "grad_norm": 0.7939618825912476, "learning_rate": 0.0002, "epoch": 6.649076517150396, "step": 5040}, {"loss": 0.4273, "grad_norm": 1.2883116006851196, "learning_rate": 0.0002, "epoch": 6.662269129287599, "step": 5050}, {"loss": 0.4376, "grad_norm": 1.3844388723373413, "learning_rate": 0.0002, "epoch": 6.675461741424802, "step": 5060}, {"loss": 0.4078, "grad_norm": 1.1823489665985107, "learning_rate": 0.0002, "epoch": 6.688654353562005, "step": 5070}, {"loss": 0.4811, "grad_norm": 1.310214638710022, "learning_rate": 0.0002, "epoch": 6.701846965699208, "step": 5080}, {"loss": 0.4675, "grad_norm": 1.6253955364227295, "learning_rate": 0.0002, "epoch": 6.715039577836412, "step": 5090}, {"loss": 0.4749, "grad_norm": 1.3344792127609253, "learning_rate": 0.0002, "epoch": 6.728232189973615, "step": 5100}, {"loss": 0.4051, "grad_norm": 1.3900614976882935, "learning_rate": 0.0002, "epoch": 6.741424802110818, "step": 5110}, {"loss": 0.3782, "grad_norm": 1.5122374296188354, "learning_rate": 0.0002, "epoch": 6.754617414248021, "step": 5120}, {"loss": 0.4439, "grad_norm": 1.4738229513168335, "learning_rate": 0.0002, "epoch": 6.7678100263852246, "step": 5130}, {"loss": 0.4237, "grad_norm": 1.0417664051055908, "learning_rate": 0.0002, "epoch": 6.781002638522428, "step": 5140}, {"loss": 0.486, "grad_norm": 1.1339401006698608, "learning_rate": 0.0002, "epoch": 6.79419525065963, "step": 5150}, {"loss": 0.4387, "grad_norm": 1.4377150535583496, "learning_rate": 0.0002, "epoch": 6.807387862796833, "step": 5160}, {"loss": 0.4375, "grad_norm": 1.3321975469589233, "learning_rate": 0.0002, "epoch": 6.820580474934037, "step": 5170}, {"loss": 0.4369, "grad_norm": 1.3799545764923096, "learning_rate": 0.0002, "epoch": 6.83377308707124, "step": 5180}, {"loss": 0.4266, "grad_norm": 0.864224374294281, "learning_rate": 0.0002, "epoch": 6.846965699208443, "step": 5190}, {"loss": 0.4455, "grad_norm": 1.0666139125823975, "learning_rate": 0.0002, "epoch": 6.860158311345646, "step": 5200}, {"loss": 0.4545, "grad_norm": 1.2926141023635864, "learning_rate": 0.0002, "epoch": 6.87335092348285, "step": 5210}, {"loss": 0.4441, "grad_norm": 1.2046207189559937, "learning_rate": 0.0002, "epoch": 6.886543535620053, "step": 5220}, {"loss": 0.4458, "grad_norm": 1.3961530923843384, "learning_rate": 0.0002, "epoch": 6.899736147757256, "step": 5230}, {"loss": 0.4343, "grad_norm": 1.1340336799621582, "learning_rate": 0.0002, "epoch": 6.912928759894459, "step": 5240}, {"loss": 0.4491, "grad_norm": 1.1756815910339355, "learning_rate": 0.0002, "epoch": 6.926121372031663, "step": 5250}, {"loss": 0.4077, "grad_norm": 1.146964192390442, "learning_rate": 0.0002, "epoch": 6.939313984168866, "step": 5260}, {"loss": 0.4232, "grad_norm": 1.2974623441696167, "learning_rate": 0.0002, "epoch": 6.952506596306069, "step": 5270}, {"loss": 0.4126, "grad_norm": 1.342126727104187, "learning_rate": 0.0002, "epoch": 6.965699208443271, "step": 5280}, {"loss": 0.4537, "grad_norm": 1.2475614547729492, "learning_rate": 0.0002, "epoch": 6.978891820580475, "step": 5290}, {"loss": 0.456, "grad_norm": 1.254935622215271, "learning_rate": 0.0002, "epoch": 6.992084432717678, "step": 5300}]} +{"epoch": 8.0, "step": 6064, "epoch_duration": 2068.3929238319397, "total_accumulated_duration": 17302.134539842606, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-42/checkpoint-1516", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.9466, "grad_norm": 0.7545632123947144, "learning_rate": 0.0002, "epoch": 0.013192612137203167, "step": 10}, {"loss": 1.4909, "grad_norm": 0.5787661075592041, "learning_rate": 0.0002, "epoch": 0.026385224274406333, "step": 20}, {"loss": 1.3906, "grad_norm": 0.8616093993186951, "learning_rate": 0.0002, "epoch": 0.0395778364116095, "step": 30}, {"loss": 1.4116, "grad_norm": 0.42088547348976135, "learning_rate": 0.0002, "epoch": 0.052770448548812667, "step": 40}, {"loss": 1.3446, "grad_norm": 0.47704678773880005, "learning_rate": 0.0002, "epoch": 0.06596306068601583, "step": 50}, {"loss": 1.2476, "grad_norm": 0.5763994455337524, "learning_rate": 0.0002, "epoch": 0.079155672823219, "step": 60}, {"loss": 1.2268, "grad_norm": 0.4579846262931824, "learning_rate": 0.0002, "epoch": 0.09234828496042216, "step": 70}, {"loss": 1.415, "grad_norm": 0.46623846888542175, "learning_rate": 0.0002, "epoch": 0.10554089709762533, "step": 80}, {"loss": 1.2849, "grad_norm": 0.4206956624984741, "learning_rate": 0.0002, "epoch": 0.11873350923482849, "step": 90}, {"loss": 1.1608, "grad_norm": 0.41896629333496094, "learning_rate": 0.0002, "epoch": 0.13192612137203166, "step": 100}, {"loss": 1.2617, "grad_norm": 0.3459089398384094, "learning_rate": 0.0002, "epoch": 0.14511873350923482, "step": 110}, {"loss": 1.2858, "grad_norm": 0.4587327837944031, "learning_rate": 0.0002, "epoch": 0.158311345646438, "step": 120}, {"loss": 1.2241, "grad_norm": 0.433525413274765, "learning_rate": 0.0002, "epoch": 0.17150395778364116, "step": 130}, {"loss": 1.2269, "grad_norm": 0.39253175258636475, "learning_rate": 0.0002, "epoch": 0.18469656992084432, "step": 140}, {"loss": 1.2224, "grad_norm": 0.3602290749549866, "learning_rate": 0.0002, "epoch": 0.19788918205804748, "step": 150}, {"loss": 1.2834, "grad_norm": 0.41160839796066284, "learning_rate": 0.0002, "epoch": 0.21108179419525067, "step": 160}, {"loss": 1.1986, "grad_norm": 0.7213630080223083, "learning_rate": 0.0002, "epoch": 0.22427440633245382, "step": 170}, {"loss": 1.2215, "grad_norm": 0.39086055755615234, "learning_rate": 0.0002, "epoch": 0.23746701846965698, "step": 180}, {"loss": 1.3315, "grad_norm": 0.4465520977973938, "learning_rate": 0.0002, "epoch": 0.25065963060686014, "step": 190}, {"loss": 1.3798, "grad_norm": 1.814679741859436, "learning_rate": 0.0002, "epoch": 0.2638522427440633, "step": 200}, {"loss": 1.2259, "grad_norm": 0.5026423931121826, "learning_rate": 0.0002, "epoch": 0.2770448548812665, "step": 210}, {"loss": 1.3306, "grad_norm": 0.4156292974948883, "learning_rate": 0.0002, "epoch": 0.29023746701846964, "step": 220}, {"loss": 1.266, "grad_norm": 0.40813493728637695, "learning_rate": 0.0002, "epoch": 0.3034300791556728, "step": 230}, {"loss": 1.1533, "grad_norm": 0.3304787874221802, "learning_rate": 0.0002, "epoch": 0.316622691292876, "step": 240}, {"loss": 1.3154, "grad_norm": 0.46139976382255554, "learning_rate": 0.0002, "epoch": 0.32981530343007914, "step": 250}, {"loss": 1.1365, "grad_norm": 0.37518271803855896, "learning_rate": 0.0002, "epoch": 0.34300791556728233, "step": 260}, {"loss": 1.2004, "grad_norm": 0.35586467385292053, "learning_rate": 0.0002, "epoch": 0.3562005277044855, "step": 270}, {"loss": 1.2609, "grad_norm": 0.32441186904907227, "learning_rate": 0.0002, "epoch": 0.36939313984168864, "step": 280}, {"loss": 1.212, "grad_norm": 0.3198683261871338, "learning_rate": 0.0002, "epoch": 0.38258575197889183, "step": 290}, {"loss": 1.2465, "grad_norm": 0.33663108944892883, "learning_rate": 0.0002, "epoch": 0.39577836411609496, "step": 300}, {"loss": 1.1702, "grad_norm": 0.3711244761943817, "learning_rate": 0.0002, "epoch": 0.40897097625329815, "step": 310}, {"loss": 1.0871, "grad_norm": 0.3209651708602905, "learning_rate": 0.0002, "epoch": 0.42216358839050133, "step": 320}, {"loss": 1.2728, "grad_norm": 0.5152716040611267, "learning_rate": 0.0002, "epoch": 0.43535620052770446, "step": 330}, {"loss": 1.1833, "grad_norm": 0.5431376695632935, "learning_rate": 0.0002, "epoch": 0.44854881266490765, "step": 340}, {"loss": 1.1344, "grad_norm": 0.3069997727870941, "learning_rate": 0.0002, "epoch": 0.46174142480211083, "step": 350}, {"loss": 1.2742, "grad_norm": 0.34260064363479614, "learning_rate": 0.0002, "epoch": 0.47493403693931396, "step": 360}, {"loss": 1.3161, "grad_norm": 0.345653235912323, "learning_rate": 0.0002, "epoch": 0.48812664907651715, "step": 370}, {"loss": 1.1552, "grad_norm": 0.46222734451293945, "learning_rate": 0.0002, "epoch": 0.5013192612137203, "step": 380}, {"loss": 1.0616, "grad_norm": 0.27301734685897827, "learning_rate": 0.0002, "epoch": 0.5145118733509235, "step": 390}, {"loss": 1.1944, "grad_norm": 0.29048439860343933, "learning_rate": 0.0002, "epoch": 0.5277044854881267, "step": 400}, {"loss": 1.0956, "grad_norm": 0.32927802205085754, "learning_rate": 0.0002, "epoch": 0.5408970976253298, "step": 410}, {"loss": 1.2362, "grad_norm": 0.3336397409439087, "learning_rate": 0.0002, "epoch": 0.554089709762533, "step": 420}, {"loss": 1.1445, "grad_norm": 0.4007597267627716, "learning_rate": 0.0002, "epoch": 0.5672823218997362, "step": 430}, {"loss": 1.2731, "grad_norm": 0.36144956946372986, "learning_rate": 0.0002, "epoch": 0.5804749340369393, "step": 440}, {"loss": 1.1604, "grad_norm": 0.6331009864807129, "learning_rate": 0.0002, "epoch": 0.5936675461741425, "step": 450}, {"loss": 1.1692, "grad_norm": 0.41469088196754456, "learning_rate": 0.0002, "epoch": 0.6068601583113457, "step": 460}, {"loss": 1.2372, "grad_norm": 0.4388185143470764, "learning_rate": 0.0002, "epoch": 0.6200527704485488, "step": 470}, {"loss": 1.2541, "grad_norm": 0.3738141655921936, "learning_rate": 0.0002, "epoch": 0.633245382585752, "step": 480}, {"loss": 1.1265, "grad_norm": 0.7212023138999939, "learning_rate": 0.0002, "epoch": 0.6464379947229552, "step": 490}, {"loss": 1.2786, "grad_norm": 0.2972351014614105, "learning_rate": 0.0002, "epoch": 0.6596306068601583, "step": 500}, {"loss": 1.1739, "grad_norm": 0.45293179154396057, "learning_rate": 0.0002, "epoch": 0.6728232189973615, "step": 510}, {"loss": 1.255, "grad_norm": 0.4319860637187958, "learning_rate": 0.0002, "epoch": 0.6860158311345647, "step": 520}, {"loss": 1.0959, "grad_norm": 0.3050215542316437, "learning_rate": 0.0002, "epoch": 0.6992084432717678, "step": 530}, {"loss": 1.1608, "grad_norm": 0.3552611172199249, "learning_rate": 0.0002, "epoch": 0.712401055408971, "step": 540}, {"loss": 1.2546, "grad_norm": 0.3631151020526886, "learning_rate": 0.0002, "epoch": 0.7255936675461742, "step": 550}, {"loss": 1.1371, "grad_norm": 0.28177931904792786, "learning_rate": 0.0002, "epoch": 0.7387862796833773, "step": 560}, {"loss": 1.2081, "grad_norm": 0.359764039516449, "learning_rate": 0.0002, "epoch": 0.7519788918205804, "step": 570}, {"loss": 1.1356, "grad_norm": 0.3970327377319336, "learning_rate": 0.0002, "epoch": 0.7651715039577837, "step": 580}, {"loss": 1.309, "grad_norm": 0.3541001081466675, "learning_rate": 0.0002, "epoch": 0.7783641160949868, "step": 590}, {"loss": 1.2061, "grad_norm": 0.3478573262691498, "learning_rate": 0.0002, "epoch": 0.7915567282321899, "step": 600}, {"loss": 1.1864, "grad_norm": 0.3900321424007416, "learning_rate": 0.0002, "epoch": 0.8047493403693932, "step": 610}, {"loss": 1.1358, "grad_norm": 0.3443238437175751, "learning_rate": 0.0002, "epoch": 0.8179419525065963, "step": 620}, {"loss": 1.3232, "grad_norm": 0.44238781929016113, "learning_rate": 0.0002, "epoch": 0.8311345646437994, "step": 630}, {"loss": 1.1247, "grad_norm": 0.36339467763900757, "learning_rate": 0.0002, "epoch": 0.8443271767810027, "step": 640}, {"loss": 1.1049, "grad_norm": 0.6243070363998413, "learning_rate": 0.0002, "epoch": 0.8575197889182058, "step": 650}, {"loss": 1.1943, "grad_norm": 0.3209173381328583, "learning_rate": 0.0002, "epoch": 0.8707124010554089, "step": 660}, {"loss": 1.1927, "grad_norm": 0.35017991065979004, "learning_rate": 0.0002, "epoch": 0.8839050131926122, "step": 670}, {"loss": 1.1603, "grad_norm": 0.3247159421443939, "learning_rate": 0.0002, "epoch": 0.8970976253298153, "step": 680}, {"loss": 1.2298, "grad_norm": 0.4091894030570984, "learning_rate": 0.0002, "epoch": 0.9102902374670184, "step": 690}, {"loss": 1.2756, "grad_norm": 0.3975585997104645, "learning_rate": 0.0002, "epoch": 0.9234828496042217, "step": 700}, {"loss": 1.281, "grad_norm": 0.3666245937347412, "learning_rate": 0.0002, "epoch": 0.9366754617414248, "step": 710}, {"loss": 1.2855, "grad_norm": 0.45216917991638184, "learning_rate": 0.0002, "epoch": 0.9498680738786279, "step": 720}, {"loss": 1.2217, "grad_norm": 0.36108118295669556, "learning_rate": 0.0002, "epoch": 0.9630606860158312, "step": 730}, {"loss": 1.17, "grad_norm": 0.44550251960754395, "learning_rate": 0.0002, "epoch": 0.9762532981530343, "step": 740}, {"loss": 1.0672, "grad_norm": 0.29801255464553833, "learning_rate": 0.0002, "epoch": 0.9894459102902374, "step": 750}, {"eval_loss": 1.203244686126709, "eval_runtime": 76.0457, "eval_samples_per_second": 5.668, "eval_steps_per_second": 0.71, "epoch": 1.0, "step": 758}, {"loss": 1.0748, "grad_norm": 0.4096551239490509, "learning_rate": 0.0002, "epoch": 1.0026385224274406, "step": 760}, {"loss": 1.1537, "grad_norm": 0.2649582326412201, "learning_rate": 0.0002, "epoch": 1.0158311345646438, "step": 770}, {"loss": 1.1676, "grad_norm": 0.3100722134113312, "learning_rate": 0.0002, "epoch": 1.029023746701847, "step": 780}, {"loss": 1.1736, "grad_norm": 0.3911755383014679, "learning_rate": 0.0002, "epoch": 1.04221635883905, "step": 790}, {"loss": 1.0354, "grad_norm": 0.4600953757762909, "learning_rate": 0.0002, "epoch": 1.0554089709762533, "step": 800}, {"loss": 1.2869, "grad_norm": 0.28671619296073914, "learning_rate": 0.0002, "epoch": 1.0686015831134565, "step": 810}, {"loss": 1.1441, "grad_norm": 0.47282642126083374, "learning_rate": 0.0002, "epoch": 1.0817941952506596, "step": 820}, {"loss": 1.1368, "grad_norm": 0.690073549747467, "learning_rate": 0.0002, "epoch": 1.0949868073878628, "step": 830}, {"loss": 0.9944, "grad_norm": 0.7317902445793152, "learning_rate": 0.0002, "epoch": 1.108179419525066, "step": 840}, {"loss": 1.108, "grad_norm": 0.44215938448905945, "learning_rate": 0.0002, "epoch": 1.121372031662269, "step": 850}, {"loss": 1.0558, "grad_norm": 0.33875149488449097, "learning_rate": 0.0002, "epoch": 1.1345646437994723, "step": 860}, {"loss": 1.0471, "grad_norm": 0.3700002431869507, "learning_rate": 0.0002, "epoch": 1.1477572559366755, "step": 870}, {"loss": 1.0121, "grad_norm": 0.41173291206359863, "learning_rate": 0.0002, "epoch": 1.1609498680738786, "step": 880}, {"loss": 1.0312, "grad_norm": 0.5253589749336243, "learning_rate": 0.0002, "epoch": 1.1741424802110818, "step": 890}, {"loss": 1.2238, "grad_norm": 0.3912237286567688, "learning_rate": 0.0002, "epoch": 1.187335092348285, "step": 900}, {"loss": 0.9434, "grad_norm": 0.40990331768989563, "learning_rate": 0.0002, "epoch": 1.200527704485488, "step": 910}, {"loss": 1.0493, "grad_norm": 0.40377968549728394, "learning_rate": 0.0002, "epoch": 1.2137203166226913, "step": 920}, {"loss": 1.1496, "grad_norm": 0.4605846405029297, "learning_rate": 0.0002, "epoch": 1.2269129287598945, "step": 930}, {"loss": 1.1173, "grad_norm": 0.31564897298812866, "learning_rate": 0.0002, "epoch": 1.2401055408970976, "step": 940}, {"loss": 1.0547, "grad_norm": 0.39808550477027893, "learning_rate": 0.0002, "epoch": 1.2532981530343008, "step": 950}, {"loss": 1.1367, "grad_norm": 0.3762115240097046, "learning_rate": 0.0002, "epoch": 1.266490765171504, "step": 960}, {"loss": 1.1596, "grad_norm": 0.4174984097480774, "learning_rate": 0.0002, "epoch": 1.279683377308707, "step": 970}, {"loss": 1.1327, "grad_norm": 0.5263054966926575, "learning_rate": 0.0002, "epoch": 1.2928759894459103, "step": 980}, {"loss": 1.0339, "grad_norm": 0.41673699021339417, "learning_rate": 0.0002, "epoch": 1.3060686015831133, "step": 990}, {"loss": 1.1198, "grad_norm": 0.9613684415817261, "learning_rate": 0.0002, "epoch": 1.3192612137203166, "step": 1000}, {"loss": 1.0444, "grad_norm": 0.3690216839313507, "learning_rate": 0.0002, "epoch": 1.3324538258575198, "step": 1010}, {"loss": 1.0473, "grad_norm": 0.521821141242981, "learning_rate": 0.0002, "epoch": 1.345646437994723, "step": 1020}, {"loss": 1.1065, "grad_norm": 0.3353094160556793, "learning_rate": 0.0002, "epoch": 1.358839050131926, "step": 1030}, {"loss": 1.1286, "grad_norm": 0.3843843936920166, "learning_rate": 0.0002, "epoch": 1.3720316622691293, "step": 1040}, {"loss": 1.1369, "grad_norm": 0.372514545917511, "learning_rate": 0.0002, "epoch": 1.3852242744063323, "step": 1050}, {"loss": 1.0041, "grad_norm": 0.34537771344184875, "learning_rate": 0.0002, "epoch": 1.3984168865435356, "step": 1060}, {"loss": 1.0251, "grad_norm": 0.45349085330963135, "learning_rate": 0.0002, "epoch": 1.4116094986807388, "step": 1070}, {"loss": 1.1873, "grad_norm": 0.5120177268981934, "learning_rate": 0.0002, "epoch": 1.424802110817942, "step": 1080}, {"loss": 1.0872, "grad_norm": 0.42800238728523254, "learning_rate": 0.0002, "epoch": 1.437994722955145, "step": 1090}, {"loss": 1.0734, "grad_norm": 0.343832790851593, "learning_rate": 0.0002, "epoch": 1.4511873350923483, "step": 1100}, {"loss": 1.1286, "grad_norm": 0.3829841911792755, "learning_rate": 0.0002, "epoch": 1.4643799472295513, "step": 1110}, {"loss": 1.1268, "grad_norm": 0.4289931058883667, "learning_rate": 0.0002, "epoch": 1.4775725593667546, "step": 1120}, {"loss": 1.0676, "grad_norm": 0.42750850319862366, "learning_rate": 0.0002, "epoch": 1.4907651715039578, "step": 1130}, {"loss": 1.072, "grad_norm": 0.34328413009643555, "learning_rate": 0.0002, "epoch": 1.503957783641161, "step": 1140}, {"loss": 1.0863, "grad_norm": 0.349096417427063, "learning_rate": 0.0002, "epoch": 1.517150395778364, "step": 1150}, {"loss": 1.1462, "grad_norm": 0.7700717449188232, "learning_rate": 0.0002, "epoch": 1.5303430079155673, "step": 1160}, {"loss": 0.9528, "grad_norm": 0.39294949173927307, "learning_rate": 0.0002, "epoch": 1.5435356200527703, "step": 1170}, {"loss": 1.1603, "grad_norm": 0.36173608899116516, "learning_rate": 0.0002, "epoch": 1.5567282321899736, "step": 1180}, {"loss": 1.1508, "grad_norm": 0.6034277677536011, "learning_rate": 0.0002, "epoch": 1.5699208443271768, "step": 1190}, {"loss": 1.0105, "grad_norm": 0.36694103479385376, "learning_rate": 0.0002, "epoch": 1.58311345646438, "step": 1200}, {"loss": 1.1479, "grad_norm": 0.4727209508419037, "learning_rate": 0.0002, "epoch": 1.596306068601583, "step": 1210}, {"loss": 1.0689, "grad_norm": 0.6482883095741272, "learning_rate": 0.0002, "epoch": 1.6094986807387863, "step": 1220}, {"loss": 1.1405, "grad_norm": 0.5238035917282104, "learning_rate": 0.0002, "epoch": 1.6226912928759893, "step": 1230}, {"loss": 1.0596, "grad_norm": 0.4812222421169281, "learning_rate": 0.0002, "epoch": 1.6358839050131926, "step": 1240}, {"loss": 1.2729, "grad_norm": 0.7131702303886414, "learning_rate": 0.0002, "epoch": 1.6490765171503958, "step": 1250}, {"loss": 0.9832, "grad_norm": 0.3803327977657318, "learning_rate": 0.0002, "epoch": 1.662269129287599, "step": 1260}, {"loss": 1.1433, "grad_norm": 0.3745088577270508, "learning_rate": 0.0002, "epoch": 1.675461741424802, "step": 1270}, {"loss": 1.1018, "grad_norm": 0.4427378475666046, "learning_rate": 0.0002, "epoch": 1.6886543535620053, "step": 1280}, {"loss": 1.0619, "grad_norm": 0.797478973865509, "learning_rate": 0.0002, "epoch": 1.7018469656992083, "step": 1290}, {"loss": 1.0853, "grad_norm": 0.503620982170105, "learning_rate": 0.0002, "epoch": 1.7150395778364116, "step": 1300}, {"loss": 1.1324, "grad_norm": 0.4132426381111145, "learning_rate": 0.0002, "epoch": 1.7282321899736148, "step": 1310}, {"loss": 1.172, "grad_norm": 0.41811656951904297, "learning_rate": 0.0002, "epoch": 1.741424802110818, "step": 1320}, {"loss": 1.0903, "grad_norm": 0.40647849440574646, "learning_rate": 0.0002, "epoch": 1.754617414248021, "step": 1330}, {"loss": 1.1316, "grad_norm": 0.42138347029685974, "learning_rate": 0.0002, "epoch": 1.767810026385224, "step": 1340}, {"loss": 0.9955, "grad_norm": 0.46523579955101013, "learning_rate": 0.0002, "epoch": 1.7810026385224274, "step": 1350}, {"loss": 1.0307, "grad_norm": 0.39760419726371765, "learning_rate": 0.0002, "epoch": 1.7941952506596306, "step": 1360}, {"loss": 1.1218, "grad_norm": 0.37993717193603516, "learning_rate": 0.0002, "epoch": 1.8073878627968338, "step": 1370}, {"loss": 1.0921, "grad_norm": 0.5404181480407715, "learning_rate": 0.0002, "epoch": 1.820580474934037, "step": 1380}, {"loss": 1.0903, "grad_norm": 0.4385245740413666, "learning_rate": 0.0002, "epoch": 1.83377308707124, "step": 1390}, {"loss": 1.1175, "grad_norm": 0.529797375202179, "learning_rate": 0.0002, "epoch": 1.8469656992084431, "step": 1400}, {"loss": 1.0577, "grad_norm": 0.481567919254303, "learning_rate": 0.0002, "epoch": 1.8601583113456464, "step": 1410}, {"loss": 1.0581, "grad_norm": 0.34787362813949585, "learning_rate": 0.0002, "epoch": 1.8733509234828496, "step": 1420}, {"loss": 1.0536, "grad_norm": 0.6402362585067749, "learning_rate": 0.0002, "epoch": 1.8865435356200528, "step": 1430}, {"loss": 1.0787, "grad_norm": 0.3461322784423828, "learning_rate": 0.0002, "epoch": 1.899736147757256, "step": 1440}, {"loss": 1.0925, "grad_norm": 0.44005653262138367, "learning_rate": 0.0002, "epoch": 1.912928759894459, "step": 1450}, {"loss": 1.0414, "grad_norm": 0.4064280688762665, "learning_rate": 0.0002, "epoch": 1.9261213720316621, "step": 1460}, {"loss": 1.0608, "grad_norm": 0.5236523151397705, "learning_rate": 0.0002, "epoch": 1.9393139841688654, "step": 1470}, {"loss": 1.0572, "grad_norm": 0.41030219197273254, "learning_rate": 0.0002, "epoch": 1.9525065963060686, "step": 1480}, {"loss": 1.1204, "grad_norm": 0.39805835485458374, "learning_rate": 0.0002, "epoch": 1.9656992084432718, "step": 1490}, {"loss": 1.0364, "grad_norm": 0.42974501848220825, "learning_rate": 0.0002, "epoch": 1.978891820580475, "step": 1500}, {"loss": 1.1169, "grad_norm": 0.4688243865966797, "learning_rate": 0.0002, "epoch": 1.992084432717678, "step": 1510}, {"eval_loss": 1.1874967813491821, "eval_runtime": 71.9523, "eval_samples_per_second": 5.99, "eval_steps_per_second": 0.75, "epoch": 2.0, "step": 1516}, {"loss": 1.0401, "grad_norm": 0.4121631383895874, "learning_rate": 0.0002, "epoch": 2.005277044854881, "step": 1520}, {"loss": 0.9384, "grad_norm": 0.4844197928905487, "learning_rate": 0.0002, "epoch": 2.0184696569920844, "step": 1530}, {"loss": 0.8686, "grad_norm": 0.45408546924591064, "learning_rate": 0.0002, "epoch": 2.0316622691292876, "step": 1540}, {"loss": 1.0399, "grad_norm": 0.48662951588630676, "learning_rate": 0.0002, "epoch": 2.044854881266491, "step": 1550}, {"loss": 0.936, "grad_norm": 0.7195899486541748, "learning_rate": 0.0002, "epoch": 2.058047493403694, "step": 1560}, {"loss": 0.9486, "grad_norm": 0.5071077346801758, "learning_rate": 0.0002, "epoch": 2.0712401055408973, "step": 1570}, {"loss": 1.0055, "grad_norm": 0.7473958730697632, "learning_rate": 0.0002, "epoch": 2.0844327176781, "step": 1580}, {"loss": 0.8309, "grad_norm": 0.5509232878684998, "learning_rate": 0.0002, "epoch": 2.0976253298153034, "step": 1590}, {"loss": 0.9181, "grad_norm": 0.5108042359352112, "learning_rate": 0.0002, "epoch": 2.1108179419525066, "step": 1600}, {"loss": 0.9499, "grad_norm": 0.42331448197364807, "learning_rate": 0.0002, "epoch": 2.12401055408971, "step": 1610}, {"loss": 0.9359, "grad_norm": 0.46621623635292053, "learning_rate": 0.0002, "epoch": 2.137203166226913, "step": 1620}, {"loss": 0.9065, "grad_norm": 0.43802836537361145, "learning_rate": 0.0002, "epoch": 2.150395778364116, "step": 1630}, {"loss": 0.9375, "grad_norm": 0.49908021092414856, "learning_rate": 0.0002, "epoch": 2.163588390501319, "step": 1640}, {"loss": 1.0389, "grad_norm": 0.4195636808872223, "learning_rate": 0.0002, "epoch": 2.1767810026385224, "step": 1650}, {"loss": 0.8501, "grad_norm": 0.49515822529792786, "learning_rate": 0.0002, "epoch": 2.1899736147757256, "step": 1660}, {"loss": 1.0557, "grad_norm": 0.4607589542865753, "learning_rate": 0.0002, "epoch": 2.203166226912929, "step": 1670}, {"loss": 1.0206, "grad_norm": 0.4489196836948395, "learning_rate": 0.0002, "epoch": 2.216358839050132, "step": 1680}, {"loss": 0.9657, "grad_norm": 0.49300864338874817, "learning_rate": 0.0002, "epoch": 2.229551451187335, "step": 1690}, {"loss": 0.8371, "grad_norm": 0.6624954342842102, "learning_rate": 0.0002, "epoch": 2.242744063324538, "step": 1700}, {"loss": 0.8555, "grad_norm": 0.8391500115394592, "learning_rate": 0.0002, "epoch": 2.2559366754617414, "step": 1710}, {"loss": 1.0113, "grad_norm": 0.5193073749542236, "learning_rate": 0.0002, "epoch": 2.2691292875989446, "step": 1720}, {"loss": 0.9979, "grad_norm": 0.6180613040924072, "learning_rate": 0.0002, "epoch": 2.282321899736148, "step": 1730}, {"loss": 0.9579, "grad_norm": 0.591191291809082, "learning_rate": 0.0002, "epoch": 2.295514511873351, "step": 1740}, {"loss": 0.8879, "grad_norm": 0.546897828578949, "learning_rate": 0.0002, "epoch": 2.308707124010554, "step": 1750}, {"loss": 0.9321, "grad_norm": 0.5470401644706726, "learning_rate": 0.0002, "epoch": 2.321899736147757, "step": 1760}, {"loss": 0.9104, "grad_norm": 0.4590282738208771, "learning_rate": 0.0002, "epoch": 2.3350923482849604, "step": 1770}, {"loss": 1.0384, "grad_norm": 0.622164785861969, "learning_rate": 0.0002, "epoch": 2.3482849604221636, "step": 1780}, {"loss": 1.0297, "grad_norm": 0.5753812193870544, "learning_rate": 0.0002, "epoch": 2.361477572559367, "step": 1790}, {"loss": 0.9569, "grad_norm": 0.47958624362945557, "learning_rate": 0.0002, "epoch": 2.37467018469657, "step": 1800}, {"loss": 1.0519, "grad_norm": 0.48042672872543335, "learning_rate": 0.0002, "epoch": 2.387862796833773, "step": 1810}, {"loss": 0.9042, "grad_norm": 0.44586366415023804, "learning_rate": 0.0002, "epoch": 2.401055408970976, "step": 1820}, {"loss": 0.9783, "grad_norm": 0.7239416837692261, "learning_rate": 0.0002, "epoch": 2.4142480211081794, "step": 1830}, {"loss": 0.8818, "grad_norm": 0.5515341758728027, "learning_rate": 0.0002, "epoch": 2.4274406332453826, "step": 1840}, {"loss": 0.9503, "grad_norm": 0.6280064582824707, "learning_rate": 0.0002, "epoch": 2.440633245382586, "step": 1850}, {"loss": 0.8943, "grad_norm": 0.4832057058811188, "learning_rate": 0.0002, "epoch": 2.453825857519789, "step": 1860}, {"loss": 0.8744, "grad_norm": 0.5789321064949036, "learning_rate": 0.0002, "epoch": 2.467018469656992, "step": 1870}, {"loss": 0.9332, "grad_norm": 0.48491886258125305, "learning_rate": 0.0002, "epoch": 2.480211081794195, "step": 1880}, {"loss": 0.943, "grad_norm": 0.532365620136261, "learning_rate": 0.0002, "epoch": 2.4934036939313984, "step": 1890}, {"loss": 0.9734, "grad_norm": 0.7087852954864502, "learning_rate": 0.0002, "epoch": 2.5065963060686016, "step": 1900}, {"loss": 0.9767, "grad_norm": 0.48157402873039246, "learning_rate": 0.0002, "epoch": 2.519788918205805, "step": 1910}, {"loss": 0.9851, "grad_norm": 0.5886041522026062, "learning_rate": 0.0002, "epoch": 2.532981530343008, "step": 1920}, {"loss": 1.0144, "grad_norm": 0.6332622766494751, "learning_rate": 0.0002, "epoch": 2.5461741424802113, "step": 1930}, {"loss": 0.9516, "grad_norm": 0.5463117957115173, "learning_rate": 0.0002, "epoch": 2.559366754617414, "step": 1940}, {"loss": 0.9373, "grad_norm": 0.5432228446006775, "learning_rate": 0.0002, "epoch": 2.5725593667546174, "step": 1950}, {"loss": 0.8974, "grad_norm": 0.5929186940193176, "learning_rate": 0.0002, "epoch": 2.5857519788918206, "step": 1960}, {"loss": 1.0062, "grad_norm": 0.5120641589164734, "learning_rate": 0.0002, "epoch": 2.598944591029024, "step": 1970}, {"loss": 0.9143, "grad_norm": 0.5372339487075806, "learning_rate": 0.0002, "epoch": 2.6121372031662267, "step": 1980}, {"loss": 1.0786, "grad_norm": 0.5519838929176331, "learning_rate": 0.0002, "epoch": 2.62532981530343, "step": 1990}, {"loss": 1.021, "grad_norm": 0.7304037809371948, "learning_rate": 0.0002, "epoch": 2.638522427440633, "step": 2000}, {"loss": 0.8708, "grad_norm": 0.6182340979576111, "learning_rate": 0.0002, "epoch": 2.6517150395778364, "step": 2010}, {"loss": 1.0501, "grad_norm": 0.4874444305896759, "learning_rate": 0.0002, "epoch": 2.6649076517150396, "step": 2020}, {"loss": 0.9612, "grad_norm": 0.5850239396095276, "learning_rate": 0.0002, "epoch": 2.678100263852243, "step": 2030}, {"loss": 0.9808, "grad_norm": 0.6495311856269836, "learning_rate": 0.0002, "epoch": 2.691292875989446, "step": 2040}, {"loss": 1.0187, "grad_norm": 1.002830147743225, "learning_rate": 0.0002, "epoch": 2.7044854881266494, "step": 2050}, {"loss": 1.0386, "grad_norm": 0.49076753854751587, "learning_rate": 0.0002, "epoch": 2.717678100263852, "step": 2060}, {"loss": 0.9416, "grad_norm": 0.4736326336860657, "learning_rate": 0.0002, "epoch": 2.7308707124010554, "step": 2070}, {"loss": 0.8424, "grad_norm": 0.5527601838111877, "learning_rate": 0.0002, "epoch": 2.7440633245382586, "step": 2080}, {"loss": 0.9149, "grad_norm": 0.7295718193054199, "learning_rate": 0.0002, "epoch": 2.757255936675462, "step": 2090}, {"loss": 0.9032, "grad_norm": 0.5437536835670471, "learning_rate": 0.0002, "epoch": 2.7704485488126647, "step": 2100}, {"loss": 0.8254, "grad_norm": 0.5997128486633301, "learning_rate": 0.0002, "epoch": 2.783641160949868, "step": 2110}, {"loss": 0.976, "grad_norm": 0.6498191356658936, "learning_rate": 0.0002, "epoch": 2.796833773087071, "step": 2120}, {"loss": 0.9543, "grad_norm": 0.5237268805503845, "learning_rate": 0.0002, "epoch": 2.8100263852242744, "step": 2130}, {"loss": 0.9302, "grad_norm": 0.6033027172088623, "learning_rate": 0.0002, "epoch": 2.8232189973614776, "step": 2140}, {"loss": 0.9625, "grad_norm": 0.6077138781547546, "learning_rate": 0.0002, "epoch": 2.836411609498681, "step": 2150}, {"loss": 0.9347, "grad_norm": 0.4127797484397888, "learning_rate": 0.0002, "epoch": 2.849604221635884, "step": 2160}, {"loss": 1.0459, "grad_norm": 0.8448635339736938, "learning_rate": 0.0002, "epoch": 2.862796833773087, "step": 2170}, {"loss": 0.8185, "grad_norm": 0.5669729113578796, "learning_rate": 0.0002, "epoch": 2.87598944591029, "step": 2180}, {"loss": 0.8555, "grad_norm": 0.510231077671051, "learning_rate": 0.0002, "epoch": 2.8891820580474934, "step": 2190}, {"loss": 0.9267, "grad_norm": 0.8072245121002197, "learning_rate": 0.0002, "epoch": 2.9023746701846966, "step": 2200}, {"loss": 0.9685, "grad_norm": 0.6055923104286194, "learning_rate": 0.0002, "epoch": 2.9155672823219, "step": 2210}, {"loss": 0.9157, "grad_norm": 0.7384416460990906, "learning_rate": 0.0002, "epoch": 2.9287598944591027, "step": 2220}, {"loss": 0.9712, "grad_norm": 0.4922751784324646, "learning_rate": 0.0002, "epoch": 2.941952506596306, "step": 2230}, {"loss": 0.9487, "grad_norm": 0.6039906740188599, "learning_rate": 0.0002, "epoch": 2.955145118733509, "step": 2240}, {"loss": 0.9776, "grad_norm": 0.4751701354980469, "learning_rate": 0.0002, "epoch": 2.9683377308707124, "step": 2250}, {"loss": 1.0619, "grad_norm": 0.5698353052139282, "learning_rate": 0.0002, "epoch": 2.9815303430079156, "step": 2260}, {"loss": 1.1184, "grad_norm": 0.893563449382782, "learning_rate": 0.0002, "epoch": 2.994722955145119, "step": 2270}, {"eval_loss": 1.2046419382095337, "eval_runtime": 71.5992, "eval_samples_per_second": 6.02, "eval_steps_per_second": 0.754, "epoch": 3.0, "step": 2274}, {"loss": 0.8269, "grad_norm": 0.41119325160980225, "learning_rate": 0.0002, "epoch": 3.007915567282322, "step": 2280}, {"loss": 0.7856, "grad_norm": 0.8169420957565308, "learning_rate": 0.0002, "epoch": 3.021108179419525, "step": 2290}, {"loss": 0.794, "grad_norm": 0.6033818125724792, "learning_rate": 0.0002, "epoch": 3.034300791556728, "step": 2300}, {"loss": 0.7607, "grad_norm": 0.9600058197975159, "learning_rate": 0.0002, "epoch": 3.0474934036939314, "step": 2310}, {"loss": 0.8353, "grad_norm": 0.5859250426292419, "learning_rate": 0.0002, "epoch": 3.0606860158311346, "step": 2320}, {"loss": 0.7598, "grad_norm": 0.6758618950843811, "learning_rate": 0.0002, "epoch": 3.073878627968338, "step": 2330}, {"loss": 0.7631, "grad_norm": 0.8407140970230103, "learning_rate": 0.0002, "epoch": 3.0870712401055407, "step": 2340}, {"loss": 0.7664, "grad_norm": 0.767779529094696, "learning_rate": 0.0002, "epoch": 3.100263852242744, "step": 2350}, {"loss": 0.7121, "grad_norm": 0.5572896599769592, "learning_rate": 0.0002, "epoch": 3.113456464379947, "step": 2360}, {"loss": 0.7419, "grad_norm": 0.5908368825912476, "learning_rate": 0.0002, "epoch": 3.1266490765171504, "step": 2370}, {"loss": 0.8024, "grad_norm": 0.8047826290130615, "learning_rate": 0.0002, "epoch": 3.1398416886543536, "step": 2380}, {"loss": 0.8686, "grad_norm": 0.8041718006134033, "learning_rate": 0.0002, "epoch": 3.153034300791557, "step": 2390}, {"loss": 0.668, "grad_norm": 0.57078617811203, "learning_rate": 0.0002, "epoch": 3.16622691292876, "step": 2400}, {"loss": 0.7976, "grad_norm": 0.5125322937965393, "learning_rate": 0.0002, "epoch": 3.179419525065963, "step": 2410}, {"loss": 0.741, "grad_norm": 0.6356934309005737, "learning_rate": 0.0002, "epoch": 3.192612137203166, "step": 2420}, {"loss": 0.687, "grad_norm": 1.0129680633544922, "learning_rate": 0.0002, "epoch": 3.2058047493403694, "step": 2430}, {"loss": 0.8316, "grad_norm": 0.8104226589202881, "learning_rate": 0.0002, "epoch": 3.2189973614775726, "step": 2440}, {"loss": 0.8343, "grad_norm": 0.7276079058647156, "learning_rate": 0.0002, "epoch": 3.232189973614776, "step": 2450}, {"loss": 0.8183, "grad_norm": 0.9753884077072144, "learning_rate": 0.0002, "epoch": 3.2453825857519787, "step": 2460}, {"loss": 0.7776, "grad_norm": 0.9753183722496033, "learning_rate": 0.0002, "epoch": 3.258575197889182, "step": 2470}, {"loss": 0.8815, "grad_norm": 0.6791225075721741, "learning_rate": 0.0002, "epoch": 3.271767810026385, "step": 2480}, {"loss": 0.7548, "grad_norm": 0.6797150373458862, "learning_rate": 0.0002, "epoch": 3.2849604221635884, "step": 2490}, {"loss": 0.8395, "grad_norm": 0.8107194900512695, "learning_rate": 0.0002, "epoch": 3.2981530343007917, "step": 2500}, {"loss": 0.7869, "grad_norm": 0.5878375172615051, "learning_rate": 0.0002, "epoch": 3.311345646437995, "step": 2510}, {"loss": 0.7992, "grad_norm": 0.5882975459098816, "learning_rate": 0.0002, "epoch": 3.324538258575198, "step": 2520}, {"loss": 0.7472, "grad_norm": 0.6180013418197632, "learning_rate": 0.0002, "epoch": 3.337730870712401, "step": 2530}, {"loss": 0.8033, "grad_norm": 1.0008151531219482, "learning_rate": 0.0002, "epoch": 3.350923482849604, "step": 2540}, {"loss": 0.8464, "grad_norm": 0.6404656767845154, "learning_rate": 0.0002, "epoch": 3.3641160949868074, "step": 2550}, {"loss": 0.7533, "grad_norm": 0.8481354117393494, "learning_rate": 0.0002, "epoch": 3.3773087071240107, "step": 2560}, {"loss": 0.7852, "grad_norm": 0.8068035244941711, "learning_rate": 0.0002, "epoch": 3.390501319261214, "step": 2570}, {"loss": 0.8621, "grad_norm": 0.7477166056632996, "learning_rate": 0.0002, "epoch": 3.4036939313984167, "step": 2580}, {"loss": 0.8352, "grad_norm": 0.6202635765075684, "learning_rate": 0.0002, "epoch": 3.41688654353562, "step": 2590}, {"loss": 0.7572, "grad_norm": 0.6981159448623657, "learning_rate": 0.0002, "epoch": 3.430079155672823, "step": 2600}, {"loss": 0.7846, "grad_norm": 0.6611084342002869, "learning_rate": 0.0002, "epoch": 3.4432717678100264, "step": 2610}, {"loss": 0.7503, "grad_norm": 0.5727696418762207, "learning_rate": 0.0002, "epoch": 3.4564643799472297, "step": 2620}, {"loss": 0.8427, "grad_norm": 1.2354545593261719, "learning_rate": 0.0002, "epoch": 3.469656992084433, "step": 2630}, {"loss": 0.7747, "grad_norm": 0.6347638368606567, "learning_rate": 0.0002, "epoch": 3.4828496042216357, "step": 2640}, {"loss": 0.8426, "grad_norm": 0.6975704431533813, "learning_rate": 0.0002, "epoch": 3.496042216358839, "step": 2650}, {"loss": 0.8773, "grad_norm": 0.6569573879241943, "learning_rate": 0.0002, "epoch": 3.509234828496042, "step": 2660}, {"loss": 0.7908, "grad_norm": 0.6979609131813049, "learning_rate": 0.0002, "epoch": 3.5224274406332454, "step": 2670}, {"loss": 0.8254, "grad_norm": 0.6287988424301147, "learning_rate": 0.0002, "epoch": 3.5356200527704487, "step": 2680}, {"loss": 0.7815, "grad_norm": 0.8682637214660645, "learning_rate": 0.0002, "epoch": 3.5488126649076515, "step": 2690}, {"loss": 0.7566, "grad_norm": 0.7062831521034241, "learning_rate": 0.0002, "epoch": 3.5620052770448547, "step": 2700}, {"loss": 0.713, "grad_norm": 1.0061452388763428, "learning_rate": 0.0002, "epoch": 3.575197889182058, "step": 2710}, {"loss": 0.7738, "grad_norm": 0.719097375869751, "learning_rate": 0.0002, "epoch": 3.588390501319261, "step": 2720}, {"loss": 0.8145, "grad_norm": 0.7583496570587158, "learning_rate": 0.0002, "epoch": 3.6015831134564644, "step": 2730}, {"loss": 0.91, "grad_norm": 0.7543531060218811, "learning_rate": 0.0002, "epoch": 3.6147757255936677, "step": 2740}, {"loss": 0.8325, "grad_norm": 0.8873646855354309, "learning_rate": 0.0002, "epoch": 3.627968337730871, "step": 2750}, {"loss": 0.7116, "grad_norm": 1.0657562017440796, "learning_rate": 0.0002, "epoch": 3.641160949868074, "step": 2760}, {"loss": 0.8291, "grad_norm": 0.8641113638877869, "learning_rate": 0.0002, "epoch": 3.654353562005277, "step": 2770}, {"loss": 0.8302, "grad_norm": 0.6620645523071289, "learning_rate": 0.0002, "epoch": 3.66754617414248, "step": 2780}, {"loss": 0.8261, "grad_norm": 0.6919541954994202, "learning_rate": 0.0002, "epoch": 3.6807387862796834, "step": 2790}, {"loss": 0.8388, "grad_norm": 0.7305743098258972, "learning_rate": 0.0002, "epoch": 3.6939313984168867, "step": 2800}, {"loss": 0.8053, "grad_norm": 0.7464777827262878, "learning_rate": 0.0002, "epoch": 3.7071240105540895, "step": 2810}, {"loss": 0.8019, "grad_norm": 0.8067063093185425, "learning_rate": 0.0002, "epoch": 3.7203166226912927, "step": 2820}, {"loss": 0.8259, "grad_norm": 0.7789416313171387, "learning_rate": 0.0002, "epoch": 3.733509234828496, "step": 2830}, {"loss": 0.774, "grad_norm": 0.507529079914093, "learning_rate": 0.0002, "epoch": 3.746701846965699, "step": 2840}, {"loss": 0.832, "grad_norm": 0.6509260535240173, "learning_rate": 0.0002, "epoch": 3.7598944591029024, "step": 2850}, {"loss": 0.8257, "grad_norm": 0.9141367673873901, "learning_rate": 0.0002, "epoch": 3.7730870712401057, "step": 2860}, {"loss": 0.9436, "grad_norm": 0.7852635979652405, "learning_rate": 0.0002, "epoch": 3.786279683377309, "step": 2870}, {"loss": 0.8842, "grad_norm": 0.5340318083763123, "learning_rate": 0.0002, "epoch": 3.7994722955145117, "step": 2880}, {"loss": 0.7468, "grad_norm": 0.6246042847633362, "learning_rate": 0.0002, "epoch": 3.812664907651715, "step": 2890}, {"loss": 0.8184, "grad_norm": 0.7064066529273987, "learning_rate": 0.0002, "epoch": 3.825857519788918, "step": 2900}, {"loss": 0.8515, "grad_norm": 0.6144065856933594, "learning_rate": 0.0002, "epoch": 3.8390501319261214, "step": 2910}, {"loss": 0.7484, "grad_norm": 0.5268424153327942, "learning_rate": 0.0002, "epoch": 3.8522427440633247, "step": 2920}, {"loss": 0.7594, "grad_norm": 0.9508116841316223, "learning_rate": 0.0002, "epoch": 3.8654353562005275, "step": 2930}, {"loss": 0.8437, "grad_norm": 0.9133715629577637, "learning_rate": 0.0002, "epoch": 3.8786279683377307, "step": 2940}, {"loss": 0.8611, "grad_norm": 1.0144646167755127, "learning_rate": 0.0002, "epoch": 3.891820580474934, "step": 2950}, {"loss": 0.8043, "grad_norm": 0.6397877931594849, "learning_rate": 0.0002, "epoch": 3.905013192612137, "step": 2960}, {"loss": 0.8285, "grad_norm": 0.734835147857666, "learning_rate": 0.0002, "epoch": 3.9182058047493404, "step": 2970}, {"loss": 0.7831, "grad_norm": 0.784853994846344, "learning_rate": 0.0002, "epoch": 3.9313984168865437, "step": 2980}, {"loss": 0.8148, "grad_norm": 0.805831789970398, "learning_rate": 0.0002, "epoch": 3.944591029023747, "step": 2990}, {"loss": 0.8252, "grad_norm": 0.6299595236778259, "learning_rate": 0.0002, "epoch": 3.9577836411609497, "step": 3000}, {"loss": 0.8244, "grad_norm": 0.6264058351516724, "learning_rate": 0.0002, "epoch": 3.970976253298153, "step": 3010}, {"loss": 0.8185, "grad_norm": 0.6419739723205566, "learning_rate": 0.0002, "epoch": 3.984168865435356, "step": 3020}, {"loss": 0.8174, "grad_norm": 0.7737036943435669, "learning_rate": 0.0002, "epoch": 3.9973614775725594, "step": 3030}, {"eval_loss": 1.2454297542572021, "eval_runtime": 71.8558, "eval_samples_per_second": 5.998, "eval_steps_per_second": 0.752, "epoch": 4.0, "step": 3032}, {"loss": 0.6716, "grad_norm": 1.092727541923523, "learning_rate": 0.0002, "epoch": 4.010554089709762, "step": 3040}, {"loss": 0.596, "grad_norm": 0.8087759613990784, "learning_rate": 0.0002, "epoch": 4.0237467018469655, "step": 3050}, {"loss": 0.7055, "grad_norm": 0.8106053471565247, "learning_rate": 0.0002, "epoch": 4.036939313984169, "step": 3060}, {"loss": 0.6846, "grad_norm": 0.8675326704978943, "learning_rate": 0.0002, "epoch": 4.050131926121372, "step": 3070}, {"loss": 0.6064, "grad_norm": 0.9620490074157715, "learning_rate": 0.0002, "epoch": 4.063324538258575, "step": 3080}, {"loss": 0.6047, "grad_norm": 0.8996296525001526, "learning_rate": 0.0002, "epoch": 4.076517150395778, "step": 3090}, {"loss": 0.6111, "grad_norm": 0.8648998737335205, "learning_rate": 0.0002, "epoch": 4.089709762532982, "step": 3100}, {"loss": 0.5853, "grad_norm": 1.0321335792541504, "learning_rate": 0.0002, "epoch": 4.102902374670185, "step": 3110}, {"loss": 0.6161, "grad_norm": 0.7949225306510925, "learning_rate": 0.0002, "epoch": 4.116094986807388, "step": 3120}, {"loss": 0.6354, "grad_norm": 0.9684646129608154, "learning_rate": 0.0002, "epoch": 4.129287598944591, "step": 3130}, {"loss": 0.6198, "grad_norm": 0.8698066473007202, "learning_rate": 0.0002, "epoch": 4.142480211081795, "step": 3140}, {"loss": 0.7185, "grad_norm": 0.7688450813293457, "learning_rate": 0.0002, "epoch": 4.155672823218997, "step": 3150}, {"loss": 0.6053, "grad_norm": 0.9682092070579529, "learning_rate": 0.0002, "epoch": 4.1688654353562, "step": 3160}, {"loss": 0.6827, "grad_norm": 0.961561918258667, "learning_rate": 0.0002, "epoch": 4.1820580474934035, "step": 3170}, {"loss": 0.6403, "grad_norm": 1.3962990045547485, "learning_rate": 0.0002, "epoch": 4.195250659630607, "step": 3180}, {"loss": 0.6319, "grad_norm": 0.9485045075416565, "learning_rate": 0.0002, "epoch": 4.20844327176781, "step": 3190}, {"loss": 0.5908, "grad_norm": 0.7768281698226929, "learning_rate": 0.0002, "epoch": 4.221635883905013, "step": 3200}, {"loss": 0.6365, "grad_norm": 1.2685691118240356, "learning_rate": 0.0002, "epoch": 4.2348284960422165, "step": 3210}, {"loss": 0.6601, "grad_norm": 0.6876471638679504, "learning_rate": 0.0002, "epoch": 4.24802110817942, "step": 3220}, {"loss": 0.6274, "grad_norm": 1.0074554681777954, "learning_rate": 0.0002, "epoch": 4.261213720316623, "step": 3230}, {"loss": 0.6027, "grad_norm": 0.8094777464866638, "learning_rate": 0.0002, "epoch": 4.274406332453826, "step": 3240}, {"loss": 0.643, "grad_norm": 0.7906569242477417, "learning_rate": 0.0002, "epoch": 4.287598944591029, "step": 3250}, {"loss": 0.5909, "grad_norm": 0.840238630771637, "learning_rate": 0.0002, "epoch": 4.300791556728232, "step": 3260}, {"loss": 0.5943, "grad_norm": 1.0119295120239258, "learning_rate": 0.0002, "epoch": 4.313984168865435, "step": 3270}, {"loss": 0.5912, "grad_norm": 0.7943191528320312, "learning_rate": 0.0002, "epoch": 4.327176781002638, "step": 3280}, {"loss": 0.6235, "grad_norm": 0.7691723704338074, "learning_rate": 0.0002, "epoch": 4.3403693931398415, "step": 3290}, {"loss": 0.6173, "grad_norm": 0.7227770686149597, "learning_rate": 0.0002, "epoch": 4.353562005277045, "step": 3300}, {"loss": 0.6047, "grad_norm": 0.8512253165245056, "learning_rate": 0.0002, "epoch": 4.366754617414248, "step": 3310}, {"loss": 0.5849, "grad_norm": 0.7852529287338257, "learning_rate": 0.0002, "epoch": 4.379947229551451, "step": 3320}, {"loss": 0.6416, "grad_norm": 0.8888797163963318, "learning_rate": 0.0002, "epoch": 4.3931398416886545, "step": 3330}, {"loss": 0.6804, "grad_norm": 0.9522430896759033, "learning_rate": 0.0002, "epoch": 4.406332453825858, "step": 3340}, {"loss": 0.6345, "grad_norm": 0.900276780128479, "learning_rate": 0.0002, "epoch": 4.419525065963061, "step": 3350}, {"loss": 0.7055, "grad_norm": 1.181547999382019, "learning_rate": 0.0002, "epoch": 4.432717678100264, "step": 3360}, {"loss": 0.7073, "grad_norm": 0.903142511844635, "learning_rate": 0.0002, "epoch": 4.445910290237467, "step": 3370}, {"loss": 0.7235, "grad_norm": 0.8747565150260925, "learning_rate": 0.0002, "epoch": 4.45910290237467, "step": 3380}, {"loss": 0.7071, "grad_norm": 0.7838051319122314, "learning_rate": 0.0002, "epoch": 4.472295514511873, "step": 3390}, {"loss": 0.5932, "grad_norm": 0.8691313862800598, "learning_rate": 0.0002, "epoch": 4.485488126649076, "step": 3400}, {"loss": 0.7019, "grad_norm": 0.8493868708610535, "learning_rate": 0.0002, "epoch": 4.4986807387862795, "step": 3410}, {"loss": 0.5959, "grad_norm": 1.0104830265045166, "learning_rate": 0.0002, "epoch": 4.511873350923483, "step": 3420}, {"loss": 0.6662, "grad_norm": 1.1716967821121216, "learning_rate": 0.0002, "epoch": 4.525065963060686, "step": 3430}, {"loss": 0.6411, "grad_norm": 0.9122593998908997, "learning_rate": 0.0002, "epoch": 4.538258575197889, "step": 3440}, {"loss": 0.7047, "grad_norm": 0.829090416431427, "learning_rate": 0.0002, "epoch": 4.5514511873350925, "step": 3450}, {"loss": 0.6001, "grad_norm": 1.141662836074829, "learning_rate": 0.0002, "epoch": 4.564643799472296, "step": 3460}, {"loss": 0.6612, "grad_norm": 0.8423182368278503, "learning_rate": 0.0002, "epoch": 4.577836411609499, "step": 3470}, {"loss": 0.6797, "grad_norm": 0.8024184703826904, "learning_rate": 0.0002, "epoch": 4.591029023746702, "step": 3480}, {"loss": 0.7184, "grad_norm": 0.7703381776809692, "learning_rate": 0.0002, "epoch": 4.6042216358839045, "step": 3490}, {"loss": 0.7001, "grad_norm": 0.9883959293365479, "learning_rate": 0.0002, "epoch": 4.617414248021108, "step": 3500}, {"loss": 0.6188, "grad_norm": 0.9554709196090698, "learning_rate": 0.0002, "epoch": 4.630606860158311, "step": 3510}, {"loss": 0.7378, "grad_norm": 1.9949709177017212, "learning_rate": 0.0002, "epoch": 4.643799472295514, "step": 3520}, {"loss": 0.6678, "grad_norm": 0.7762255072593689, "learning_rate": 0.0002, "epoch": 4.6569920844327175, "step": 3530}, {"loss": 0.6298, "grad_norm": 0.9538425803184509, "learning_rate": 0.0002, "epoch": 4.670184696569921, "step": 3540}, {"loss": 0.6352, "grad_norm": 1.0279661417007446, "learning_rate": 0.0002, "epoch": 4.683377308707124, "step": 3550}, {"loss": 0.6641, "grad_norm": 0.7545472979545593, "learning_rate": 0.0002, "epoch": 4.696569920844327, "step": 3560}, {"loss": 0.6887, "grad_norm": 0.8919376730918884, "learning_rate": 0.0002, "epoch": 4.7097625329815305, "step": 3570}, {"loss": 0.6395, "grad_norm": 0.7621569633483887, "learning_rate": 0.0002, "epoch": 4.722955145118734, "step": 3580}, {"loss": 0.6928, "grad_norm": 1.205320119857788, "learning_rate": 0.0002, "epoch": 4.736147757255937, "step": 3590}, {"loss": 0.6612, "grad_norm": 1.0642725229263306, "learning_rate": 0.0002, "epoch": 4.74934036939314, "step": 3600}, {"loss": 0.6541, "grad_norm": 0.9402666687965393, "learning_rate": 0.0002, "epoch": 4.762532981530343, "step": 3610}, {"loss": 0.6395, "grad_norm": 1.254127025604248, "learning_rate": 0.0002, "epoch": 4.775725593667546, "step": 3620}, {"loss": 0.692, "grad_norm": 0.7609598636627197, "learning_rate": 0.0002, "epoch": 4.788918205804749, "step": 3630}, {"loss": 0.6578, "grad_norm": 0.8240329623222351, "learning_rate": 0.0002, "epoch": 4.802110817941952, "step": 3640}, {"loss": 0.7383, "grad_norm": 0.8356260657310486, "learning_rate": 0.0002, "epoch": 4.8153034300791555, "step": 3650}, {"loss": 0.6368, "grad_norm": 0.9130708575248718, "learning_rate": 0.0002, "epoch": 4.828496042216359, "step": 3660}, {"loss": 0.7269, "grad_norm": 0.9384765028953552, "learning_rate": 0.0002, "epoch": 4.841688654353562, "step": 3670}, {"loss": 0.6509, "grad_norm": 0.9829966425895691, "learning_rate": 0.0002, "epoch": 4.854881266490765, "step": 3680}, {"loss": 0.6311, "grad_norm": 1.0488632917404175, "learning_rate": 0.0002, "epoch": 4.8680738786279685, "step": 3690}, {"loss": 0.7005, "grad_norm": 1.2278969287872314, "learning_rate": 0.0002, "epoch": 4.881266490765172, "step": 3700}, {"loss": 0.6869, "grad_norm": 0.8078970313072205, "learning_rate": 0.0002, "epoch": 4.894459102902375, "step": 3710}, {"loss": 0.6588, "grad_norm": 0.8081700205802917, "learning_rate": 0.0002, "epoch": 4.907651715039578, "step": 3720}, {"loss": 0.7189, "grad_norm": 0.9204511046409607, "learning_rate": 0.0002, "epoch": 4.9208443271767806, "step": 3730}, {"loss": 0.6953, "grad_norm": 0.9326391220092773, "learning_rate": 0.0002, "epoch": 4.934036939313984, "step": 3740}, {"loss": 0.68, "grad_norm": 1.0089969635009766, "learning_rate": 0.0002, "epoch": 4.947229551451187, "step": 3750}, {"loss": 0.7031, "grad_norm": 0.7063466906547546, "learning_rate": 0.0002, "epoch": 4.96042216358839, "step": 3760}, {"loss": 0.6568, "grad_norm": 1.2603905200958252, "learning_rate": 0.0002, "epoch": 4.9736147757255935, "step": 3770}, {"loss": 0.7134, "grad_norm": 0.8418653607368469, "learning_rate": 0.0002, "epoch": 4.986807387862797, "step": 3780}, {"loss": 0.6683, "grad_norm": 0.9537181854248047, "learning_rate": 0.0002, "epoch": 5.0, "step": 3790}, {"eval_loss": 1.3319307565689087, "eval_runtime": 71.7836, "eval_samples_per_second": 6.004, "eval_steps_per_second": 0.752, "epoch": 5.0, "step": 3790}, {"loss": 0.489, "grad_norm": 0.8595899343490601, "learning_rate": 0.0002, "epoch": 5.013192612137203, "step": 3800}, {"loss": 0.5155, "grad_norm": 1.0023565292358398, "learning_rate": 0.0002, "epoch": 5.0263852242744065, "step": 3810}, {"loss": 0.5321, "grad_norm": 1.2770460844039917, "learning_rate": 0.0002, "epoch": 5.03957783641161, "step": 3820}, {"loss": 0.5127, "grad_norm": 1.1701956987380981, "learning_rate": 0.0002, "epoch": 5.052770448548813, "step": 3830}, {"loss": 0.5057, "grad_norm": 0.812269926071167, "learning_rate": 0.0002, "epoch": 5.065963060686016, "step": 3840}, {"loss": 0.4292, "grad_norm": 0.8186697363853455, "learning_rate": 0.0002, "epoch": 5.0791556728232194, "step": 3850}, {"loss": 0.4865, "grad_norm": 1.052565097808838, "learning_rate": 0.0002, "epoch": 5.092348284960422, "step": 3860}, {"loss": 0.4947, "grad_norm": 0.9764705300331116, "learning_rate": 0.0002, "epoch": 5.105540897097625, "step": 3870}, {"loss": 0.471, "grad_norm": 0.6973426938056946, "learning_rate": 0.0002, "epoch": 5.118733509234828, "step": 3880}, {"loss": 0.5565, "grad_norm": 1.2127928733825684, "learning_rate": 0.0002, "epoch": 5.1319261213720315, "step": 3890}, {"loss": 0.4122, "grad_norm": 0.682807981967926, "learning_rate": 0.0002, "epoch": 5.145118733509235, "step": 3900}, {"loss": 0.6378, "grad_norm": 1.3575998544692993, "learning_rate": 0.0002, "epoch": 5.158311345646438, "step": 3910}, {"loss": 0.4624, "grad_norm": 1.2581931352615356, "learning_rate": 0.0002, "epoch": 5.171503957783641, "step": 3920}, {"loss": 0.5092, "grad_norm": 1.0493637323379517, "learning_rate": 0.0002, "epoch": 5.1846965699208445, "step": 3930}, {"loss": 0.4563, "grad_norm": 1.3519670963287354, "learning_rate": 0.0002, "epoch": 5.197889182058048, "step": 3940}, {"loss": 0.5414, "grad_norm": 1.0690566301345825, "learning_rate": 0.0002, "epoch": 5.211081794195251, "step": 3950}, {"loss": 0.5038, "grad_norm": 1.1171330213546753, "learning_rate": 0.0002, "epoch": 5.224274406332454, "step": 3960}, {"loss": 0.4397, "grad_norm": 1.055851697921753, "learning_rate": 0.0002, "epoch": 5.237467018469657, "step": 3970}, {"loss": 0.4964, "grad_norm": 0.8870180249214172, "learning_rate": 0.0002, "epoch": 5.25065963060686, "step": 3980}, {"loss": 0.5353, "grad_norm": 0.9688402414321899, "learning_rate": 0.0002, "epoch": 5.263852242744063, "step": 3990}, {"loss": 0.5192, "grad_norm": 0.8458422422409058, "learning_rate": 0.0002, "epoch": 5.277044854881266, "step": 4000}, {"loss": 0.5458, "grad_norm": 0.908256471157074, "learning_rate": 0.0002, "epoch": 5.2902374670184695, "step": 4010}, {"loss": 0.5102, "grad_norm": 1.0058149099349976, "learning_rate": 0.0002, "epoch": 5.303430079155673, "step": 4020}, {"loss": 0.5322, "grad_norm": 1.20364511013031, "learning_rate": 0.0002, "epoch": 5.316622691292876, "step": 4030}, {"loss": 0.5715, "grad_norm": 1.0135732889175415, "learning_rate": 0.0002, "epoch": 5.329815303430079, "step": 4040}, {"loss": 0.4736, "grad_norm": 1.1094907522201538, "learning_rate": 0.0002, "epoch": 5.3430079155672825, "step": 4050}, {"loss": 0.4912, "grad_norm": 1.0373083353042603, "learning_rate": 0.0002, "epoch": 5.356200527704486, "step": 4060}, {"loss": 0.5258, "grad_norm": 1.0952966213226318, "learning_rate": 0.0002, "epoch": 5.369393139841689, "step": 4070}, {"loss": 0.4892, "grad_norm": 1.1734952926635742, "learning_rate": 0.0002, "epoch": 5.382585751978892, "step": 4080}, {"loss": 0.4463, "grad_norm": 0.8217245936393738, "learning_rate": 0.0002, "epoch": 5.395778364116095, "step": 4090}, {"loss": 0.5271, "grad_norm": 1.0936307907104492, "learning_rate": 0.0002, "epoch": 5.408970976253298, "step": 4100}, {"loss": 0.509, "grad_norm": 1.0198720693588257, "learning_rate": 0.0002, "epoch": 5.422163588390501, "step": 4110}, {"loss": 0.5265, "grad_norm": 1.1105809211730957, "learning_rate": 0.0002, "epoch": 5.435356200527704, "step": 4120}, {"loss": 0.4871, "grad_norm": 1.1817213296890259, "learning_rate": 0.0002, "epoch": 5.4485488126649075, "step": 4130}, {"loss": 0.4987, "grad_norm": 1.126339077949524, "learning_rate": 0.0002, "epoch": 5.461741424802111, "step": 4140}, {"loss": 0.5743, "grad_norm": 0.9467914700508118, "learning_rate": 0.0002, "epoch": 5.474934036939314, "step": 4150}, {"loss": 0.5386, "grad_norm": 1.0335774421691895, "learning_rate": 0.0002, "epoch": 5.488126649076517, "step": 4160}, {"loss": 0.5122, "grad_norm": 0.866211473941803, "learning_rate": 0.0002, "epoch": 5.5013192612137205, "step": 4170}, {"loss": 0.5697, "grad_norm": 0.7422948479652405, "learning_rate": 0.0002, "epoch": 5.514511873350924, "step": 4180}, {"loss": 0.586, "grad_norm": 1.2211135625839233, "learning_rate": 0.0002, "epoch": 5.527704485488127, "step": 4190}, {"loss": 0.5476, "grad_norm": 1.0371766090393066, "learning_rate": 0.0002, "epoch": 5.540897097625329, "step": 4200}, {"loss": 0.5941, "grad_norm": 0.9460630416870117, "learning_rate": 0.0002, "epoch": 5.554089709762533, "step": 4210}, {"loss": 0.4645, "grad_norm": 0.7972197532653809, "learning_rate": 0.0002, "epoch": 5.567282321899736, "step": 4220}, {"loss": 0.5087, "grad_norm": 1.0654675960540771, "learning_rate": 0.0002, "epoch": 5.580474934036939, "step": 4230}, {"loss": 0.5957, "grad_norm": 1.0776735544204712, "learning_rate": 0.0002, "epoch": 5.593667546174142, "step": 4240}, {"loss": 0.53, "grad_norm": 1.498723030090332, "learning_rate": 0.0002, "epoch": 5.6068601583113455, "step": 4250}, {"loss": 0.4788, "grad_norm": 1.006768822669983, "learning_rate": 0.0002, "epoch": 5.620052770448549, "step": 4260}, {"loss": 0.5571, "grad_norm": 0.9194242358207703, "learning_rate": 0.0002, "epoch": 5.633245382585752, "step": 4270}, {"loss": 0.5722, "grad_norm": 1.1028380393981934, "learning_rate": 0.0002, "epoch": 5.646437994722955, "step": 4280}, {"loss": 0.5319, "grad_norm": 0.9972755312919617, "learning_rate": 0.0002, "epoch": 5.6596306068601585, "step": 4290}, {"loss": 0.53, "grad_norm": 1.0509438514709473, "learning_rate": 0.0002, "epoch": 5.672823218997362, "step": 4300}, {"loss": 0.4738, "grad_norm": 1.064039945602417, "learning_rate": 0.0002, "epoch": 5.686015831134565, "step": 4310}, {"loss": 0.5401, "grad_norm": 0.9572229981422424, "learning_rate": 0.0002, "epoch": 5.699208443271768, "step": 4320}, {"loss": 0.5173, "grad_norm": 0.9956564903259277, "learning_rate": 0.0002, "epoch": 5.7124010554089715, "step": 4330}, {"loss": 0.6008, "grad_norm": 1.01974618434906, "learning_rate": 0.0002, "epoch": 5.725593667546174, "step": 4340}, {"loss": 0.5111, "grad_norm": 1.101328730583191, "learning_rate": 0.0002, "epoch": 5.738786279683377, "step": 4350}, {"loss": 0.5921, "grad_norm": 0.9971756935119629, "learning_rate": 0.0002, "epoch": 5.75197889182058, "step": 4360}, {"loss": 0.5262, "grad_norm": 0.8579474687576294, "learning_rate": 0.0002, "epoch": 5.7651715039577835, "step": 4370}, {"loss": 0.5106, "grad_norm": 0.9927367568016052, "learning_rate": 0.0002, "epoch": 5.778364116094987, "step": 4380}, {"loss": 0.5354, "grad_norm": 1.1183884143829346, "learning_rate": 0.0002, "epoch": 5.79155672823219, "step": 4390}, {"loss": 0.5658, "grad_norm": 0.7695905566215515, "learning_rate": 0.0002, "epoch": 5.804749340369393, "step": 4400}, {"loss": 0.5137, "grad_norm": 1.1102122068405151, "learning_rate": 0.0002, "epoch": 5.8179419525065965, "step": 4410}, {"loss": 0.5634, "grad_norm": 1.3201336860656738, "learning_rate": 0.0002, "epoch": 5.8311345646438, "step": 4420}, {"loss": 0.5773, "grad_norm": 1.1934558153152466, "learning_rate": 0.0002, "epoch": 5.844327176781003, "step": 4430}, {"loss": 0.6338, "grad_norm": 1.390870451927185, "learning_rate": 0.0002, "epoch": 5.857519788918205, "step": 4440}, {"loss": 0.5625, "grad_norm": 1.056314468383789, "learning_rate": 0.0002, "epoch": 5.870712401055409, "step": 4450}, {"loss": 0.6456, "grad_norm": 0.9797437191009521, "learning_rate": 0.0002, "epoch": 5.883905013192612, "step": 4460}, {"loss": 0.5479, "grad_norm": 1.2368146181106567, "learning_rate": 0.0002, "epoch": 5.897097625329815, "step": 4470}, {"loss": 0.5453, "grad_norm": 0.9062654376029968, "learning_rate": 0.0002, "epoch": 5.910290237467018, "step": 4480}, {"loss": 0.5857, "grad_norm": 1.8643536567687988, "learning_rate": 0.0002, "epoch": 5.923482849604222, "step": 4490}, {"loss": 0.5858, "grad_norm": 1.2977997064590454, "learning_rate": 0.0002, "epoch": 5.936675461741425, "step": 4500}, {"loss": 0.4815, "grad_norm": 0.8366201519966125, "learning_rate": 0.0002, "epoch": 5.949868073878628, "step": 4510}, {"loss": 0.5126, "grad_norm": 1.0210131406784058, "learning_rate": 0.0002, "epoch": 5.963060686015831, "step": 4520}, {"loss": 0.5577, "grad_norm": 1.1287827491760254, "learning_rate": 0.0002, "epoch": 5.9762532981530345, "step": 4530}, {"loss": 0.5053, "grad_norm": 1.0480493307113647, "learning_rate": 0.0002, "epoch": 5.989445910290238, "step": 4540}, {"eval_loss": 1.450880765914917, "eval_runtime": 71.8135, "eval_samples_per_second": 6.002, "eval_steps_per_second": 0.752, "epoch": 6.0, "step": 4548}, {"loss": 0.5072, "grad_norm": 0.8589069247245789, "learning_rate": 0.0002, "epoch": 6.002638522427441, "step": 4550}, {"loss": 0.4129, "grad_norm": 1.467134714126587, "learning_rate": 0.0002, "epoch": 6.015831134564644, "step": 4560}, {"loss": 0.3739, "grad_norm": 1.1477625370025635, "learning_rate": 0.0002, "epoch": 6.029023746701847, "step": 4570}, {"loss": 0.3958, "grad_norm": 1.4254094362258911, "learning_rate": 0.0002, "epoch": 6.04221635883905, "step": 4580}, {"loss": 0.356, "grad_norm": 1.3656290769577026, "learning_rate": 0.0002, "epoch": 6.055408970976253, "step": 4590}, {"loss": 0.3626, "grad_norm": 0.9638674855232239, "learning_rate": 0.0002, "epoch": 6.068601583113456, "step": 4600}, {"loss": 0.3884, "grad_norm": 1.2654615640640259, "learning_rate": 0.0002, "epoch": 6.08179419525066, "step": 4610}, {"loss": 0.4659, "grad_norm": 1.4506969451904297, "learning_rate": 0.0002, "epoch": 6.094986807387863, "step": 4620}, {"loss": 0.3096, "grad_norm": 1.6596732139587402, "learning_rate": 0.0002, "epoch": 6.108179419525066, "step": 4630}, {"loss": 0.4005, "grad_norm": 1.5335280895233154, "learning_rate": 0.0002, "epoch": 6.121372031662269, "step": 4640}, {"loss": 0.3999, "grad_norm": 1.0815565586090088, "learning_rate": 0.0002, "epoch": 6.1345646437994725, "step": 4650}, {"loss": 0.4026, "grad_norm": 0.9995638132095337, "learning_rate": 0.0002, "epoch": 6.147757255936676, "step": 4660}, {"loss": 0.3548, "grad_norm": 0.8809106349945068, "learning_rate": 0.0002, "epoch": 6.160949868073879, "step": 4670}, {"loss": 0.4505, "grad_norm": 1.2946726083755493, "learning_rate": 0.0002, "epoch": 6.174142480211081, "step": 4680}, {"loss": 0.4447, "grad_norm": 1.311298131942749, "learning_rate": 0.0002, "epoch": 6.187335092348285, "step": 4690}, {"loss": 0.4108, "grad_norm": 1.229204535484314, "learning_rate": 0.0002, "epoch": 6.200527704485488, "step": 4700}, {"loss": 0.3764, "grad_norm": 1.0193822383880615, "learning_rate": 0.0002, "epoch": 6.213720316622691, "step": 4710}, {"loss": 0.3696, "grad_norm": 1.4438618421554565, "learning_rate": 0.0002, "epoch": 6.226912928759894, "step": 4720}, {"loss": 0.3979, "grad_norm": 1.4315637350082397, "learning_rate": 0.0002, "epoch": 6.240105540897098, "step": 4730}, {"loss": 0.4124, "grad_norm": 1.1291239261627197, "learning_rate": 0.0002, "epoch": 6.253298153034301, "step": 4740}, {"loss": 0.4337, "grad_norm": 0.9358022809028625, "learning_rate": 0.0002, "epoch": 6.266490765171504, "step": 4750}, {"loss": 0.3758, "grad_norm": 1.1260714530944824, "learning_rate": 0.0002, "epoch": 6.279683377308707, "step": 4760}, {"loss": 0.4262, "grad_norm": 1.5400320291519165, "learning_rate": 0.0002, "epoch": 6.2928759894459105, "step": 4770}, {"loss": 0.4105, "grad_norm": 1.6820714473724365, "learning_rate": 0.0002, "epoch": 6.306068601583114, "step": 4780}, {"loss": 0.4192, "grad_norm": 1.1937718391418457, "learning_rate": 0.0002, "epoch": 6.319261213720317, "step": 4790}, {"loss": 0.4519, "grad_norm": 1.4330145120620728, "learning_rate": 0.0002, "epoch": 6.33245382585752, "step": 4800}, {"loss": 0.4173, "grad_norm": 1.083373785018921, "learning_rate": 0.0002, "epoch": 6.345646437994723, "step": 4810}, {"loss": 0.4054, "grad_norm": 1.3013869524002075, "learning_rate": 0.0002, "epoch": 6.358839050131926, "step": 4820}, {"loss": 0.4177, "grad_norm": 1.1075547933578491, "learning_rate": 0.0002, "epoch": 6.372031662269129, "step": 4830}, {"loss": 0.3846, "grad_norm": 1.0480214357376099, "learning_rate": 0.0002, "epoch": 6.385224274406332, "step": 4840}, {"loss": 0.3924, "grad_norm": 1.3625658750534058, "learning_rate": 0.0002, "epoch": 6.398416886543536, "step": 4850}, {"loss": 0.3964, "grad_norm": 1.16606605052948, "learning_rate": 0.0002, "epoch": 6.411609498680739, "step": 4860}, {"loss": 0.4845, "grad_norm": 1.2435568571090698, "learning_rate": 0.0002, "epoch": 6.424802110817942, "step": 4870}, {"loss": 0.3847, "grad_norm": 1.4471954107284546, "learning_rate": 0.0002, "epoch": 6.437994722955145, "step": 4880}, {"loss": 0.443, "grad_norm": 1.2302275896072388, "learning_rate": 0.0002, "epoch": 6.4511873350923485, "step": 4890}, {"loss": 0.4458, "grad_norm": 1.2392226457595825, "learning_rate": 0.0002, "epoch": 6.464379947229552, "step": 4900}, {"loss": 0.4114, "grad_norm": 1.0497277975082397, "learning_rate": 0.0002, "epoch": 6.477572559366755, "step": 4910}, {"loss": 0.426, "grad_norm": 1.3509557247161865, "learning_rate": 0.0002, "epoch": 6.490765171503957, "step": 4920}, {"loss": 0.4089, "grad_norm": 1.340214729309082, "learning_rate": 0.0002, "epoch": 6.503957783641161, "step": 4930}, {"loss": 0.4655, "grad_norm": 1.283220648765564, "learning_rate": 0.0002, "epoch": 6.517150395778364, "step": 4940}, {"loss": 0.4205, "grad_norm": 1.0693278312683105, "learning_rate": 0.0002, "epoch": 6.530343007915567, "step": 4950}, {"loss": 0.398, "grad_norm": 1.307997226715088, "learning_rate": 0.0002, "epoch": 6.54353562005277, "step": 4960}, {"loss": 0.3844, "grad_norm": 1.1739027500152588, "learning_rate": 0.0002, "epoch": 6.556728232189974, "step": 4970}, {"loss": 0.4494, "grad_norm": 1.5694327354431152, "learning_rate": 0.0002, "epoch": 6.569920844327177, "step": 4980}, {"loss": 0.4535, "grad_norm": 0.9978346824645996, "learning_rate": 0.0002, "epoch": 6.58311345646438, "step": 4990}, {"loss": 0.4755, "grad_norm": 1.183057427406311, "learning_rate": 0.0002, "epoch": 6.596306068601583, "step": 5000}, {"loss": 0.4688, "grad_norm": 1.1033718585968018, "learning_rate": 0.0002, "epoch": 6.6094986807387865, "step": 5010}, {"loss": 0.4233, "grad_norm": 1.0699188709259033, "learning_rate": 0.0002, "epoch": 6.62269129287599, "step": 5020}, {"loss": 0.4049, "grad_norm": 1.491031289100647, "learning_rate": 0.0002, "epoch": 6.635883905013193, "step": 5030}, {"loss": 0.4257, "grad_norm": 0.7939618825912476, "learning_rate": 0.0002, "epoch": 6.649076517150396, "step": 5040}, {"loss": 0.4273, "grad_norm": 1.2883116006851196, "learning_rate": 0.0002, "epoch": 6.662269129287599, "step": 5050}, {"loss": 0.4376, "grad_norm": 1.3844388723373413, "learning_rate": 0.0002, "epoch": 6.675461741424802, "step": 5060}, {"loss": 0.4078, "grad_norm": 1.1823489665985107, "learning_rate": 0.0002, "epoch": 6.688654353562005, "step": 5070}, {"loss": 0.4811, "grad_norm": 1.310214638710022, "learning_rate": 0.0002, "epoch": 6.701846965699208, "step": 5080}, {"loss": 0.4675, "grad_norm": 1.6253955364227295, "learning_rate": 0.0002, "epoch": 6.715039577836412, "step": 5090}, {"loss": 0.4749, "grad_norm": 1.3344792127609253, "learning_rate": 0.0002, "epoch": 6.728232189973615, "step": 5100}, {"loss": 0.4051, "grad_norm": 1.3900614976882935, "learning_rate": 0.0002, "epoch": 6.741424802110818, "step": 5110}, {"loss": 0.3782, "grad_norm": 1.5122374296188354, "learning_rate": 0.0002, "epoch": 6.754617414248021, "step": 5120}, {"loss": 0.4439, "grad_norm": 1.4738229513168335, "learning_rate": 0.0002, "epoch": 6.7678100263852246, "step": 5130}, {"loss": 0.4237, "grad_norm": 1.0417664051055908, "learning_rate": 0.0002, "epoch": 6.781002638522428, "step": 5140}, {"loss": 0.486, "grad_norm": 1.1339401006698608, "learning_rate": 0.0002, "epoch": 6.79419525065963, "step": 5150}, {"loss": 0.4387, "grad_norm": 1.4377150535583496, "learning_rate": 0.0002, "epoch": 6.807387862796833, "step": 5160}, {"loss": 0.4375, "grad_norm": 1.3321975469589233, "learning_rate": 0.0002, "epoch": 6.820580474934037, "step": 5170}, {"loss": 0.4369, "grad_norm": 1.3799545764923096, "learning_rate": 0.0002, "epoch": 6.83377308707124, "step": 5180}, {"loss": 0.4266, "grad_norm": 0.864224374294281, "learning_rate": 0.0002, "epoch": 6.846965699208443, "step": 5190}, {"loss": 0.4455, "grad_norm": 1.0666139125823975, "learning_rate": 0.0002, "epoch": 6.860158311345646, "step": 5200}, {"loss": 0.4545, "grad_norm": 1.2926141023635864, "learning_rate": 0.0002, "epoch": 6.87335092348285, "step": 5210}, {"loss": 0.4441, "grad_norm": 1.2046207189559937, "learning_rate": 0.0002, "epoch": 6.886543535620053, "step": 5220}, {"loss": 0.4458, "grad_norm": 1.3961530923843384, "learning_rate": 0.0002, "epoch": 6.899736147757256, "step": 5230}, {"loss": 0.4343, "grad_norm": 1.1340336799621582, "learning_rate": 0.0002, "epoch": 6.912928759894459, "step": 5240}, {"loss": 0.4491, "grad_norm": 1.1756815910339355, "learning_rate": 0.0002, "epoch": 6.926121372031663, "step": 5250}, {"loss": 0.4077, "grad_norm": 1.146964192390442, "learning_rate": 0.0002, "epoch": 6.939313984168866, "step": 5260}, {"loss": 0.4232, "grad_norm": 1.2974623441696167, "learning_rate": 0.0002, "epoch": 6.952506596306069, "step": 5270}, {"loss": 0.4126, "grad_norm": 1.342126727104187, "learning_rate": 0.0002, "epoch": 6.965699208443271, "step": 5280}, {"loss": 0.4537, "grad_norm": 1.2475614547729492, "learning_rate": 0.0002, "epoch": 6.978891820580475, "step": 5290}, {"loss": 0.456, "grad_norm": 1.254935622215271, "learning_rate": 0.0002, "epoch": 6.992084432717678, "step": 5300}, {"eval_loss": 1.5579944849014282, "eval_runtime": 71.7131, "eval_samples_per_second": 6.01, "eval_steps_per_second": 0.753, "epoch": 7.0, "step": 5306}, {"loss": 0.3784, "grad_norm": 0.7949880361557007, "learning_rate": 0.0002, "epoch": 7.005277044854881, "step": 5310}, {"loss": 0.3216, "grad_norm": 2.0586414337158203, "learning_rate": 0.0002, "epoch": 7.018469656992084, "step": 5320}, {"loss": 0.3071, "grad_norm": 1.0757979154586792, "learning_rate": 0.0002, "epoch": 7.031662269129288, "step": 5330}, {"loss": 0.2836, "grad_norm": 0.9700984358787537, "learning_rate": 0.0002, "epoch": 7.044854881266491, "step": 5340}, {"loss": 0.2536, "grad_norm": 1.016965389251709, "learning_rate": 0.0002, "epoch": 7.058047493403694, "step": 5350}, {"loss": 0.3233, "grad_norm": 1.223994493484497, "learning_rate": 0.0002, "epoch": 7.071240105540897, "step": 5360}, {"loss": 0.3012, "grad_norm": 2.044800043106079, "learning_rate": 0.0002, "epoch": 7.084432717678101, "step": 5370}, {"loss": 0.304, "grad_norm": 1.1677180528640747, "learning_rate": 0.0002, "epoch": 7.097625329815304, "step": 5380}, {"loss": 0.3193, "grad_norm": 1.8017300367355347, "learning_rate": 0.0002, "epoch": 7.110817941952506, "step": 5390}, {"loss": 0.3322, "grad_norm": 1.1814491748809814, "learning_rate": 0.0002, "epoch": 7.124010554089709, "step": 5400}, {"loss": 0.3423, "grad_norm": 1.835221767425537, "learning_rate": 0.0002, "epoch": 7.137203166226913, "step": 5410}, {"loss": 0.3179, "grad_norm": 1.7413564920425415, "learning_rate": 0.0002, "epoch": 7.150395778364116, "step": 5420}, {"loss": 0.2946, "grad_norm": 1.4341952800750732, "learning_rate": 0.0002, "epoch": 7.163588390501319, "step": 5430}, {"loss": 0.3111, "grad_norm": 1.1618049144744873, "learning_rate": 0.0002, "epoch": 7.176781002638522, "step": 5440}, {"loss": 0.3326, "grad_norm": 1.2117347717285156, "learning_rate": 0.0002, "epoch": 7.189973614775726, "step": 5450}, {"loss": 0.3403, "grad_norm": 1.4826463460922241, "learning_rate": 0.0002, "epoch": 7.203166226912929, "step": 5460}, {"loss": 0.3087, "grad_norm": 1.112357497215271, "learning_rate": 0.0002, "epoch": 7.216358839050132, "step": 5470}, {"loss": 0.3162, "grad_norm": 1.1144609451293945, "learning_rate": 0.0002, "epoch": 7.229551451187335, "step": 5480}, {"loss": 0.3446, "grad_norm": 1.2441258430480957, "learning_rate": 0.0002, "epoch": 7.242744063324539, "step": 5490}, {"loss": 0.341, "grad_norm": 1.0532526969909668, "learning_rate": 0.0002, "epoch": 7.255936675461742, "step": 5500}, {"loss": 0.3251, "grad_norm": 1.4295402765274048, "learning_rate": 0.0002, "epoch": 7.269129287598945, "step": 5510}, {"loss": 0.3254, "grad_norm": 1.3890503644943237, "learning_rate": 0.0002, "epoch": 7.282321899736147, "step": 5520}, {"loss": 0.3459, "grad_norm": 0.919006884098053, "learning_rate": 0.0002, "epoch": 7.295514511873351, "step": 5530}, {"loss": 0.3313, "grad_norm": 1.2184085845947266, "learning_rate": 0.0002, "epoch": 7.308707124010554, "step": 5540}, {"loss": 0.3581, "grad_norm": 1.0661242008209229, "learning_rate": 0.0002, "epoch": 7.321899736147757, "step": 5550}, {"loss": 0.3211, "grad_norm": 1.331189751625061, "learning_rate": 0.0002, "epoch": 7.33509234828496, "step": 5560}, {"loss": 0.3303, "grad_norm": 1.1899065971374512, "learning_rate": 0.0002, "epoch": 7.348284960422164, "step": 5570}, {"loss": 0.3345, "grad_norm": 0.9958152174949646, "learning_rate": 0.0002, "epoch": 7.361477572559367, "step": 5580}, {"loss": 0.311, "grad_norm": 1.2326462268829346, "learning_rate": 0.0002, "epoch": 7.37467018469657, "step": 5590}, {"loss": 0.3459, "grad_norm": 1.4610025882720947, "learning_rate": 0.0002, "epoch": 7.387862796833773, "step": 5600}, {"loss": 0.3343, "grad_norm": 1.0228832960128784, "learning_rate": 0.0002, "epoch": 7.401055408970977, "step": 5610}, {"loss": 0.331, "grad_norm": 1.2726085186004639, "learning_rate": 0.0002, "epoch": 7.41424802110818, "step": 5620}, {"loss": 0.3156, "grad_norm": 1.1658830642700195, "learning_rate": 0.0002, "epoch": 7.427440633245382, "step": 5630}, {"loss": 0.3463, "grad_norm": 1.0791388750076294, "learning_rate": 0.0002, "epoch": 7.440633245382585, "step": 5640}, {"loss": 0.3457, "grad_norm": 1.4051549434661865, "learning_rate": 0.0002, "epoch": 7.453825857519789, "step": 5650}, {"loss": 0.3251, "grad_norm": 1.7039124965667725, "learning_rate": 0.0002, "epoch": 7.467018469656992, "step": 5660}, {"loss": 0.3655, "grad_norm": 1.5712453126907349, "learning_rate": 0.0002, "epoch": 7.480211081794195, "step": 5670}, {"loss": 0.3759, "grad_norm": 1.1755692958831787, "learning_rate": 0.0002, "epoch": 7.493403693931398, "step": 5680}, {"loss": 0.3212, "grad_norm": 0.7768910527229309, "learning_rate": 0.0002, "epoch": 7.506596306068602, "step": 5690}, {"loss": 0.3953, "grad_norm": 1.34855318069458, "learning_rate": 0.0002, "epoch": 7.519788918205805, "step": 5700}, {"loss": 0.3139, "grad_norm": 1.326443076133728, "learning_rate": 0.0002, "epoch": 7.532981530343008, "step": 5710}, {"loss": 0.3536, "grad_norm": 1.2597885131835938, "learning_rate": 0.0002, "epoch": 7.546174142480211, "step": 5720}, {"loss": 0.334, "grad_norm": 1.0863240957260132, "learning_rate": 0.0002, "epoch": 7.559366754617415, "step": 5730}, {"loss": 0.3408, "grad_norm": 1.2254612445831299, "learning_rate": 0.0002, "epoch": 7.572559366754618, "step": 5740}, {"loss": 0.3675, "grad_norm": 1.4157414436340332, "learning_rate": 0.0002, "epoch": 7.585751978891821, "step": 5750}, {"loss": 0.3843, "grad_norm": 1.1378470659255981, "learning_rate": 0.0002, "epoch": 7.598944591029023, "step": 5760}, {"loss": 0.3812, "grad_norm": 1.1139744520187378, "learning_rate": 0.0002, "epoch": 7.612137203166227, "step": 5770}, {"loss": 0.3238, "grad_norm": 1.3163728713989258, "learning_rate": 0.0002, "epoch": 7.62532981530343, "step": 5780}, {"loss": 0.3459, "grad_norm": 1.0113680362701416, "learning_rate": 0.0002, "epoch": 7.638522427440633, "step": 5790}, {"loss": 0.3554, "grad_norm": 0.918424665927887, "learning_rate": 0.0002, "epoch": 7.651715039577836, "step": 5800}, {"loss": 0.3949, "grad_norm": 1.1702263355255127, "learning_rate": 0.0002, "epoch": 7.66490765171504, "step": 5810}, {"loss": 0.3378, "grad_norm": 1.4807580709457397, "learning_rate": 0.0002, "epoch": 7.678100263852243, "step": 5820}, {"loss": 0.3677, "grad_norm": 1.0703623294830322, "learning_rate": 0.0002, "epoch": 7.691292875989446, "step": 5830}, {"loss": 0.3524, "grad_norm": 1.2308809757232666, "learning_rate": 0.0002, "epoch": 7.704485488126649, "step": 5840}, {"loss": 0.3326, "grad_norm": 1.212863564491272, "learning_rate": 0.0002, "epoch": 7.717678100263853, "step": 5850}, {"loss": 0.3909, "grad_norm": 1.0400227308273315, "learning_rate": 0.0002, "epoch": 7.730870712401055, "step": 5860}, {"loss": 0.3312, "grad_norm": 1.2876183986663818, "learning_rate": 0.0002, "epoch": 7.744063324538258, "step": 5870}, {"loss": 0.3149, "grad_norm": 1.0517319440841675, "learning_rate": 0.0002, "epoch": 7.757255936675461, "step": 5880}, {"loss": 0.3777, "grad_norm": 1.091901183128357, "learning_rate": 0.0002, "epoch": 7.770448548812665, "step": 5890}, {"loss": 0.3959, "grad_norm": 1.3892148733139038, "learning_rate": 0.0002, "epoch": 7.783641160949868, "step": 5900}, {"loss": 0.3991, "grad_norm": 1.4618996381759644, "learning_rate": 0.0002, "epoch": 7.796833773087071, "step": 5910}, {"loss": 0.3611, "grad_norm": 1.3962730169296265, "learning_rate": 0.0002, "epoch": 7.810026385224274, "step": 5920}, {"loss": 0.3597, "grad_norm": 1.249474048614502, "learning_rate": 0.0002, "epoch": 7.823218997361478, "step": 5930}, {"loss": 0.3604, "grad_norm": 1.3841967582702637, "learning_rate": 0.0002, "epoch": 7.836411609498681, "step": 5940}, {"loss": 0.3417, "grad_norm": 1.2477777004241943, "learning_rate": 0.0002, "epoch": 7.849604221635884, "step": 5950}, {"loss": 0.3568, "grad_norm": 1.3400548696517944, "learning_rate": 0.0002, "epoch": 7.862796833773087, "step": 5960}, {"loss": 0.372, "grad_norm": 1.383649468421936, "learning_rate": 0.0002, "epoch": 7.875989445910291, "step": 5970}, {"loss": 0.3554, "grad_norm": 1.124591588973999, "learning_rate": 0.0002, "epoch": 7.889182058047494, "step": 5980}, {"loss": 0.3458, "grad_norm": 1.2731496095657349, "learning_rate": 0.0002, "epoch": 7.902374670184696, "step": 5990}, {"loss": 0.3558, "grad_norm": 1.61614990234375, "learning_rate": 0.0002, "epoch": 7.915567282321899, "step": 6000}, {"loss": 0.35, "grad_norm": 1.0083316564559937, "learning_rate": 0.0002, "epoch": 7.928759894459103, "step": 6010}, {"loss": 0.3657, "grad_norm": 1.3074530363082886, "learning_rate": 0.0002, "epoch": 7.941952506596306, "step": 6020}, {"loss": 0.3872, "grad_norm": 1.3631811141967773, "learning_rate": 0.0002, "epoch": 7.955145118733509, "step": 6030}, {"loss": 0.3371, "grad_norm": 1.3127434253692627, "learning_rate": 0.0002, "epoch": 7.968337730870712, "step": 6040}, {"loss": 0.3446, "grad_norm": 1.6356911659240723, "learning_rate": 0.0002, "epoch": 7.981530343007916, "step": 6050}, {"loss": 0.3513, "grad_norm": 1.2134562730789185, "learning_rate": 0.0002, "epoch": 7.994722955145119, "step": 6060}]}