diff --git a/.gitattributes b/.gitattributes index 8ae69a34631e9707590877326f7b10eb92b77fff..35d8c15d98613015b1d6248be28832d8a5c1edeb 100644 --- a/.gitattributes +++ b/.gitattributes @@ -2362,3 +2362,12 @@ gemma-2-9b-it_int4_flare-en-fpb_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1 gemma-2-9b-it_int4_flare-en-fpb_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-1860-sd-42/checkpoint-812/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2-9b-it_int4_flare-en-fpb_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-1860-sd-42/checkpoint-928/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2-9b-it_int4_flare-en-fpb_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-1860-sd-42/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5cde1a718c487f3aed9dc296351828322488db0b --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79a867fb634fd8e3b066d1e5dba2fcd82385843220a23a334f7884c4c27c8b96 +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5cde1a718c487f3aed9dc296351828322488db0b --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79a867fb634fd8e3b066d1e5dba2fcd82385843220a23a334f7884c4c27c8b96 +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/optimizer.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e3939ef6796641269f6ace9087d94bbd3646c41a --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4c65999d237dae4424b1944a70de5680c46f42713e3f2c8537a7c2ab85c5104 +size 72886650 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/rng_state.pth b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1d1cb00e182f171d46f445cd099268e812deda8e --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49e211764da55bdbda0c95564a513ca390fea4afa7b89981b7c84330cfeb75ce +size 14244 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/scheduler.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3d5fc807a13a3f6eccccd8ac1c5a837c97f3ef2 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38841519e925109736ce493a81a32baf57374f07cc4493ecbc110955926f5fa2 +size 1064 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/trainer_state.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..dd4f57b1018a848506eaaaf6dc64a8c5cadaffce --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/trainer_state.json @@ -0,0 +1,1323 @@ +{ + "best_metric": 1.14472496509552, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 1827, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.010946907498631636, + "grad_norm": 0.7611560821533203, + "learning_rate": 0.0002, + "loss": 2.4039, + "step": 10 + }, + { + "epoch": 0.021893814997263273, + "grad_norm": 0.4633193612098694, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 20 + }, + { + "epoch": 0.03284072249589491, + "grad_norm": 0.49326154589653015, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 30 + }, + { + "epoch": 0.043787629994526546, + "grad_norm": 0.3943138122558594, + "learning_rate": 0.0002, + "loss": 1.3859, + "step": 40 + }, + { + "epoch": 0.05473453749315818, + "grad_norm": 0.43292930722236633, + "learning_rate": 0.0002, + "loss": 1.3036, + "step": 50 + }, + { + "epoch": 0.06568144499178982, + "grad_norm": 0.3431817591190338, + "learning_rate": 0.0002, + "loss": 1.2427, + "step": 60 + }, + { + "epoch": 0.07662835249042145, + "grad_norm": 0.38011446595191956, + "learning_rate": 0.0002, + "loss": 1.3076, + "step": 70 + }, + { + "epoch": 0.08757525998905309, + "grad_norm": 0.7441071271896362, + "learning_rate": 0.0002, + "loss": 1.1641, + "step": 80 + }, + { + "epoch": 0.09852216748768473, + "grad_norm": 0.3359833061695099, + "learning_rate": 0.0002, + "loss": 1.1721, + "step": 90 + }, + { + "epoch": 0.10946907498631636, + "grad_norm": 0.3724392354488373, + "learning_rate": 0.0002, + "loss": 1.2774, + "step": 100 + }, + { + "epoch": 0.120415982484948, + "grad_norm": 0.40673762559890747, + "learning_rate": 0.0002, + "loss": 1.216, + "step": 110 + }, + { + "epoch": 0.13136288998357964, + "grad_norm": 0.40036800503730774, + "learning_rate": 0.0002, + "loss": 1.2611, + "step": 120 + }, + { + "epoch": 0.1423097974822113, + "grad_norm": 2.844191312789917, + "learning_rate": 0.0002, + "loss": 1.2436, + "step": 130 + }, + { + "epoch": 0.1532567049808429, + "grad_norm": 0.3104734420776367, + "learning_rate": 0.0002, + "loss": 1.2254, + "step": 140 + }, + { + "epoch": 0.16420361247947454, + "grad_norm": 0.3266797959804535, + "learning_rate": 0.0002, + "loss": 1.0498, + "step": 150 + }, + { + "epoch": 0.17515051997810618, + "grad_norm": 0.3079199194908142, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 160 + }, + { + "epoch": 0.18609742747673783, + "grad_norm": 0.3872479498386383, + "learning_rate": 0.0002, + "loss": 1.2923, + "step": 170 + }, + { + "epoch": 0.19704433497536947, + "grad_norm": 0.38654500246047974, + "learning_rate": 0.0002, + "loss": 1.1809, + "step": 180 + }, + { + "epoch": 0.20799124247400108, + "grad_norm": 0.2913552522659302, + "learning_rate": 0.0002, + "loss": 1.066, + "step": 190 + }, + { + "epoch": 0.21893814997263272, + "grad_norm": 0.2960572838783264, + "learning_rate": 0.0002, + "loss": 1.0868, + "step": 200 + }, + { + "epoch": 0.22988505747126436, + "grad_norm": 0.5175501108169556, + "learning_rate": 0.0002, + "loss": 1.136, + "step": 210 + }, + { + "epoch": 0.240831964969896, + "grad_norm": 1.2921574115753174, + "learning_rate": 0.0002, + "loss": 1.1556, + "step": 220 + }, + { + "epoch": 0.25177887246852765, + "grad_norm": 0.2675512135028839, + "learning_rate": 0.0002, + "loss": 1.1488, + "step": 230 + }, + { + "epoch": 0.2627257799671593, + "grad_norm": 0.3956190049648285, + "learning_rate": 0.0002, + "loss": 1.2764, + "step": 240 + }, + { + "epoch": 0.27367268746579093, + "grad_norm": 0.6022581458091736, + "learning_rate": 0.0002, + "loss": 1.1889, + "step": 250 + }, + { + "epoch": 0.2846195949644226, + "grad_norm": 1.1949563026428223, + "learning_rate": 0.0002, + "loss": 1.1981, + "step": 260 + }, + { + "epoch": 0.2955665024630542, + "grad_norm": 0.31173548102378845, + "learning_rate": 0.0002, + "loss": 1.1877, + "step": 270 + }, + { + "epoch": 0.3065134099616858, + "grad_norm": 0.2808472812175751, + "learning_rate": 0.0002, + "loss": 1.06, + "step": 280 + }, + { + "epoch": 0.31746031746031744, + "grad_norm": 0.3042023777961731, + "learning_rate": 0.0002, + "loss": 1.1752, + "step": 290 + }, + { + "epoch": 0.3284072249589491, + "grad_norm": 0.39915043115615845, + "learning_rate": 0.0002, + "loss": 1.3147, + "step": 300 + }, + { + "epoch": 0.3393541324575807, + "grad_norm": 0.39118197560310364, + "learning_rate": 0.0002, + "loss": 1.2425, + "step": 310 + }, + { + "epoch": 0.35030103995621237, + "grad_norm": 0.355010986328125, + "learning_rate": 0.0002, + "loss": 1.1363, + "step": 320 + }, + { + "epoch": 0.361247947454844, + "grad_norm": 0.29734086990356445, + "learning_rate": 0.0002, + "loss": 1.1925, + "step": 330 + }, + { + "epoch": 0.37219485495347565, + "grad_norm": 0.346096009016037, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 340 + }, + { + "epoch": 0.3831417624521073, + "grad_norm": 0.4829643666744232, + "learning_rate": 0.0002, + "loss": 1.1641, + "step": 350 + }, + { + "epoch": 0.39408866995073893, + "grad_norm": 0.4726872742176056, + "learning_rate": 0.0002, + "loss": 1.2808, + "step": 360 + }, + { + "epoch": 0.4050355774493706, + "grad_norm": 0.3130153715610504, + "learning_rate": 0.0002, + "loss": 1.1532, + "step": 370 + }, + { + "epoch": 0.41598248494800216, + "grad_norm": 0.5123590230941772, + "learning_rate": 0.0002, + "loss": 1.1842, + "step": 380 + }, + { + "epoch": 0.4269293924466338, + "grad_norm": 0.3444574773311615, + "learning_rate": 0.0002, + "loss": 1.1539, + "step": 390 + }, + { + "epoch": 0.43787629994526545, + "grad_norm": 0.5302175283432007, + "learning_rate": 0.0002, + "loss": 1.1756, + "step": 400 + }, + { + "epoch": 0.4488232074438971, + "grad_norm": 0.2713572680950165, + "learning_rate": 0.0002, + "loss": 1.1138, + "step": 410 + }, + { + "epoch": 0.45977011494252873, + "grad_norm": 0.29530611634254456, + "learning_rate": 0.0002, + "loss": 1.1281, + "step": 420 + }, + { + "epoch": 0.47071702244116037, + "grad_norm": 0.27282455563545227, + "learning_rate": 0.0002, + "loss": 1.1721, + "step": 430 + }, + { + "epoch": 0.481663929939792, + "grad_norm": 0.2647949755191803, + "learning_rate": 0.0002, + "loss": 1.1213, + "step": 440 + }, + { + "epoch": 0.49261083743842365, + "grad_norm": 0.35509347915649414, + "learning_rate": 0.0002, + "loss": 1.1656, + "step": 450 + }, + { + "epoch": 0.5035577449370553, + "grad_norm": 0.1959609091281891, + "learning_rate": 0.0002, + "loss": 1.1251, + "step": 460 + }, + { + "epoch": 0.5145046524356869, + "grad_norm": 0.40090155601501465, + "learning_rate": 0.0002, + "loss": 1.1889, + "step": 470 + }, + { + "epoch": 0.5254515599343186, + "grad_norm": 0.3354604244232178, + "learning_rate": 0.0002, + "loss": 1.156, + "step": 480 + }, + { + "epoch": 0.5363984674329502, + "grad_norm": 0.2758506238460541, + "learning_rate": 0.0002, + "loss": 1.1024, + "step": 490 + }, + { + "epoch": 0.5473453749315819, + "grad_norm": 0.3629051744937897, + "learning_rate": 0.0002, + "loss": 1.1108, + "step": 500 + }, + { + "epoch": 0.5582922824302134, + "grad_norm": 0.30802229046821594, + "learning_rate": 0.0002, + "loss": 1.2236, + "step": 510 + }, + { + "epoch": 0.5692391899288451, + "grad_norm": 0.3099463880062103, + "learning_rate": 0.0002, + "loss": 1.0424, + "step": 520 + }, + { + "epoch": 0.5801860974274767, + "grad_norm": 0.42299067974090576, + "learning_rate": 0.0002, + "loss": 1.255, + "step": 530 + }, + { + "epoch": 0.5911330049261084, + "grad_norm": 0.5392252802848816, + "learning_rate": 0.0002, + "loss": 1.1698, + "step": 540 + }, + { + "epoch": 0.60207991242474, + "grad_norm": 0.34768250584602356, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 550 + }, + { + "epoch": 0.6130268199233716, + "grad_norm": 0.28490015864372253, + "learning_rate": 0.0002, + "loss": 1.102, + "step": 560 + }, + { + "epoch": 0.6239737274220033, + "grad_norm": 0.34787994623184204, + "learning_rate": 0.0002, + "loss": 1.1111, + "step": 570 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 0.29058772325515747, + "learning_rate": 0.0002, + "loss": 1.0759, + "step": 580 + }, + { + "epoch": 0.6458675424192666, + "grad_norm": 0.4063778817653656, + "learning_rate": 0.0002, + "loss": 1.1157, + "step": 590 + }, + { + "epoch": 0.6568144499178982, + "grad_norm": 0.9244267344474792, + "learning_rate": 0.0002, + "loss": 1.1432, + "step": 600 + }, + { + "epoch": 0.6677613574165299, + "grad_norm": 0.27605190873146057, + "learning_rate": 0.0002, + "loss": 1.0591, + "step": 610 + }, + { + "epoch": 0.6787082649151615, + "grad_norm": 0.34346821904182434, + "learning_rate": 0.0002, + "loss": 1.2123, + "step": 620 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 0.3093279302120209, + "learning_rate": 0.0002, + "loss": 1.2195, + "step": 630 + }, + { + "epoch": 0.7006020799124247, + "grad_norm": 1.0069009065628052, + "learning_rate": 0.0002, + "loss": 1.2461, + "step": 640 + }, + { + "epoch": 0.7115489874110563, + "grad_norm": 0.5049130916595459, + "learning_rate": 0.0002, + "loss": 1.0892, + "step": 650 + }, + { + "epoch": 0.722495894909688, + "grad_norm": 0.3748924732208252, + "learning_rate": 0.0002, + "loss": 1.1404, + "step": 660 + }, + { + "epoch": 0.7334428024083196, + "grad_norm": 0.2964959144592285, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 670 + }, + { + "epoch": 0.7443897099069513, + "grad_norm": 0.4599986970424652, + "learning_rate": 0.0002, + "loss": 1.2617, + "step": 680 + }, + { + "epoch": 0.7553366174055829, + "grad_norm": 0.27292951941490173, + "learning_rate": 0.0002, + "loss": 1.088, + "step": 690 + }, + { + "epoch": 0.7662835249042146, + "grad_norm": 0.3123566806316376, + "learning_rate": 0.0002, + "loss": 1.2047, + "step": 700 + }, + { + "epoch": 0.7772304324028462, + "grad_norm": 0.28310710191726685, + "learning_rate": 0.0002, + "loss": 1.0021, + "step": 710 + }, + { + "epoch": 0.7881773399014779, + "grad_norm": 0.3279992341995239, + "learning_rate": 0.0002, + "loss": 1.2281, + "step": 720 + }, + { + "epoch": 0.7991242474001095, + "grad_norm": 0.28179168701171875, + "learning_rate": 0.0002, + "loss": 1.25, + "step": 730 + }, + { + "epoch": 0.8100711548987412, + "grad_norm": 0.31492987275123596, + "learning_rate": 0.0002, + "loss": 1.0602, + "step": 740 + }, + { + "epoch": 0.8210180623973727, + "grad_norm": 0.41821011900901794, + "learning_rate": 0.0002, + "loss": 1.2518, + "step": 750 + }, + { + "epoch": 0.8319649698960043, + "grad_norm": 0.325235515832901, + "learning_rate": 0.0002, + "loss": 1.1612, + "step": 760 + }, + { + "epoch": 0.842911877394636, + "grad_norm": 0.5366070866584778, + "learning_rate": 0.0002, + "loss": 1.27, + "step": 770 + }, + { + "epoch": 0.8538587848932676, + "grad_norm": 0.32570579648017883, + "learning_rate": 0.0002, + "loss": 1.0921, + "step": 780 + }, + { + "epoch": 0.8648056923918993, + "grad_norm": 0.3642968237400055, + "learning_rate": 0.0002, + "loss": 1.1032, + "step": 790 + }, + { + "epoch": 0.8757525998905309, + "grad_norm": 0.29713448882102966, + "learning_rate": 0.0002, + "loss": 1.1234, + "step": 800 + }, + { + "epoch": 0.8866995073891626, + "grad_norm": 0.23599444329738617, + "learning_rate": 0.0002, + "loss": 1.0978, + "step": 810 + }, + { + "epoch": 0.8976464148877942, + "grad_norm": 0.31522464752197266, + "learning_rate": 0.0002, + "loss": 1.1867, + "step": 820 + }, + { + "epoch": 0.9085933223864259, + "grad_norm": 0.32754790782928467, + "learning_rate": 0.0002, + "loss": 1.0208, + "step": 830 + }, + { + "epoch": 0.9195402298850575, + "grad_norm": 0.22741089761257172, + "learning_rate": 0.0002, + "loss": 0.9786, + "step": 840 + }, + { + "epoch": 0.9304871373836892, + "grad_norm": 0.3089679777622223, + "learning_rate": 0.0002, + "loss": 1.0689, + "step": 850 + }, + { + "epoch": 0.9414340448823207, + "grad_norm": 0.27440521121025085, + "learning_rate": 0.0002, + "loss": 1.0354, + "step": 860 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.3498363792896271, + "learning_rate": 0.0002, + "loss": 1.0417, + "step": 870 + }, + { + "epoch": 0.963327859879584, + "grad_norm": 0.47151854634284973, + "learning_rate": 0.0002, + "loss": 1.269, + "step": 880 + }, + { + "epoch": 0.9742747673782156, + "grad_norm": 0.24756591022014618, + "learning_rate": 0.0002, + "loss": 1.1174, + "step": 890 + }, + { + "epoch": 0.9852216748768473, + "grad_norm": 0.2600938677787781, + "learning_rate": 0.0002, + "loss": 1.0622, + "step": 900 + }, + { + "epoch": 0.9961685823754789, + "grad_norm": 0.2934586703777313, + "learning_rate": 0.0002, + "loss": 1.0968, + "step": 910 + }, + { + "epoch": 0.9994526546250684, + "eval_loss": 1.158464789390564, + "eval_runtime": 46.0774, + "eval_samples_per_second": 9.462, + "eval_steps_per_second": 1.194, + "step": 913 + }, + { + "epoch": 1.0071154898741106, + "grad_norm": 0.37776654958724976, + "learning_rate": 0.0002, + "loss": 1.0322, + "step": 920 + }, + { + "epoch": 1.0180623973727423, + "grad_norm": 0.31784629821777344, + "learning_rate": 0.0002, + "loss": 1.1346, + "step": 930 + }, + { + "epoch": 1.0290093048713738, + "grad_norm": 0.24244336783885956, + "learning_rate": 0.0002, + "loss": 0.8976, + "step": 940 + }, + { + "epoch": 1.0399562123700055, + "grad_norm": 0.3185454308986664, + "learning_rate": 0.0002, + "loss": 1.0172, + "step": 950 + }, + { + "epoch": 1.0509031198686372, + "grad_norm": 0.3589441478252411, + "learning_rate": 0.0002, + "loss": 1.1645, + "step": 960 + }, + { + "epoch": 1.0618500273672686, + "grad_norm": 0.38593578338623047, + "learning_rate": 0.0002, + "loss": 1.0317, + "step": 970 + }, + { + "epoch": 1.0727969348659003, + "grad_norm": 0.39694955945014954, + "learning_rate": 0.0002, + "loss": 0.9913, + "step": 980 + }, + { + "epoch": 1.083743842364532, + "grad_norm": 0.469817191362381, + "learning_rate": 0.0002, + "loss": 1.0893, + "step": 990 + }, + { + "epoch": 1.0946907498631637, + "grad_norm": 0.2634755074977875, + "learning_rate": 0.0002, + "loss": 0.983, + "step": 1000 + }, + { + "epoch": 1.1056376573617952, + "grad_norm": 0.43189436197280884, + "learning_rate": 0.0002, + "loss": 1.0144, + "step": 1010 + }, + { + "epoch": 1.116584564860427, + "grad_norm": 0.5559977889060974, + "learning_rate": 0.0002, + "loss": 0.9663, + "step": 1020 + }, + { + "epoch": 1.1275314723590586, + "grad_norm": 0.32100191712379456, + "learning_rate": 0.0002, + "loss": 1.0481, + "step": 1030 + }, + { + "epoch": 1.1384783798576903, + "grad_norm": 0.40179768204689026, + "learning_rate": 0.0002, + "loss": 1.1012, + "step": 1040 + }, + { + "epoch": 1.1494252873563218, + "grad_norm": 0.3659493029117584, + "learning_rate": 0.0002, + "loss": 1.0029, + "step": 1050 + }, + { + "epoch": 1.1603721948549535, + "grad_norm": 0.701704204082489, + "learning_rate": 0.0002, + "loss": 0.9597, + "step": 1060 + }, + { + "epoch": 1.1713191023535852, + "grad_norm": 0.3650563359260559, + "learning_rate": 0.0002, + "loss": 1.0204, + "step": 1070 + }, + { + "epoch": 1.1822660098522166, + "grad_norm": 0.3191976249217987, + "learning_rate": 0.0002, + "loss": 0.907, + "step": 1080 + }, + { + "epoch": 1.1932129173508483, + "grad_norm": 0.3615441918373108, + "learning_rate": 0.0002, + "loss": 1.0648, + "step": 1090 + }, + { + "epoch": 1.20415982484948, + "grad_norm": 0.39474231004714966, + "learning_rate": 0.0002, + "loss": 1.0067, + "step": 1100 + }, + { + "epoch": 1.2151067323481117, + "grad_norm": 0.3752822279930115, + "learning_rate": 0.0002, + "loss": 0.9852, + "step": 1110 + }, + { + "epoch": 1.2260536398467432, + "grad_norm": 0.4165991246700287, + "learning_rate": 0.0002, + "loss": 1.0067, + "step": 1120 + }, + { + "epoch": 1.237000547345375, + "grad_norm": 0.5326506495475769, + "learning_rate": 0.0002, + "loss": 1.0244, + "step": 1130 + }, + { + "epoch": 1.2479474548440066, + "grad_norm": 0.48845794796943665, + "learning_rate": 0.0002, + "loss": 1.0542, + "step": 1140 + }, + { + "epoch": 1.2588943623426383, + "grad_norm": 0.29910150170326233, + "learning_rate": 0.0002, + "loss": 1.0885, + "step": 1150 + }, + { + "epoch": 1.2698412698412698, + "grad_norm": 0.5069725513458252, + "learning_rate": 0.0002, + "loss": 1.2233, + "step": 1160 + }, + { + "epoch": 1.2807881773399015, + "grad_norm": 0.29500406980514526, + "learning_rate": 0.0002, + "loss": 1.0992, + "step": 1170 + }, + { + "epoch": 1.2917350848385332, + "grad_norm": 0.4711538851261139, + "learning_rate": 0.0002, + "loss": 1.0291, + "step": 1180 + }, + { + "epoch": 1.3026819923371646, + "grad_norm": 0.4203340709209442, + "learning_rate": 0.0002, + "loss": 1.0513, + "step": 1190 + }, + { + "epoch": 1.3136288998357963, + "grad_norm": 0.36101874709129333, + "learning_rate": 0.0002, + "loss": 1.1274, + "step": 1200 + }, + { + "epoch": 1.324575807334428, + "grad_norm": 0.4608800411224365, + "learning_rate": 0.0002, + "loss": 1.085, + "step": 1210 + }, + { + "epoch": 1.3355227148330597, + "grad_norm": 0.6570906639099121, + "learning_rate": 0.0002, + "loss": 1.0695, + "step": 1220 + }, + { + "epoch": 1.3464696223316914, + "grad_norm": 0.5352164506912231, + "learning_rate": 0.0002, + "loss": 0.9786, + "step": 1230 + }, + { + "epoch": 1.357416529830323, + "grad_norm": 0.3885001242160797, + "learning_rate": 0.0002, + "loss": 1.0513, + "step": 1240 + }, + { + "epoch": 1.3683634373289546, + "grad_norm": 0.2987913489341736, + "learning_rate": 0.0002, + "loss": 1.0611, + "step": 1250 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 0.42070427536964417, + "learning_rate": 0.0002, + "loss": 1.109, + "step": 1260 + }, + { + "epoch": 1.3902572523262178, + "grad_norm": 0.5957782864570618, + "learning_rate": 0.0002, + "loss": 1.1092, + "step": 1270 + }, + { + "epoch": 1.4012041598248495, + "grad_norm": 0.32898882031440735, + "learning_rate": 0.0002, + "loss": 1.0554, + "step": 1280 + }, + { + "epoch": 1.4121510673234812, + "grad_norm": 0.27624452114105225, + "learning_rate": 0.0002, + "loss": 0.909, + "step": 1290 + }, + { + "epoch": 1.4230979748221126, + "grad_norm": 0.49570828676223755, + "learning_rate": 0.0002, + "loss": 1.024, + "step": 1300 + }, + { + "epoch": 1.4340448823207443, + "grad_norm": 0.26191383600234985, + "learning_rate": 0.0002, + "loss": 1.0471, + "step": 1310 + }, + { + "epoch": 1.444991789819376, + "grad_norm": 0.35664042830467224, + "learning_rate": 0.0002, + "loss": 1.0788, + "step": 1320 + }, + { + "epoch": 1.4559386973180077, + "grad_norm": 0.45126354694366455, + "learning_rate": 0.0002, + "loss": 1.0108, + "step": 1330 + }, + { + "epoch": 1.4668856048166394, + "grad_norm": 0.37318357825279236, + "learning_rate": 0.0002, + "loss": 0.9571, + "step": 1340 + }, + { + "epoch": 1.477832512315271, + "grad_norm": 0.6428970098495483, + "learning_rate": 0.0002, + "loss": 1.0507, + "step": 1350 + }, + { + "epoch": 1.4887794198139026, + "grad_norm": 0.43256187438964844, + "learning_rate": 0.0002, + "loss": 1.0367, + "step": 1360 + }, + { + "epoch": 1.4997263273125343, + "grad_norm": 0.5343793630599976, + "learning_rate": 0.0002, + "loss": 1.1321, + "step": 1370 + }, + { + "epoch": 1.5106732348111658, + "grad_norm": 0.315437376499176, + "learning_rate": 0.0002, + "loss": 1.1054, + "step": 1380 + }, + { + "epoch": 1.5216201423097975, + "grad_norm": 0.41561153531074524, + "learning_rate": 0.0002, + "loss": 0.8916, + "step": 1390 + }, + { + "epoch": 1.5325670498084292, + "grad_norm": 0.3201070725917816, + "learning_rate": 0.0002, + "loss": 1.0391, + "step": 1400 + }, + { + "epoch": 1.5435139573070606, + "grad_norm": 0.505537211894989, + "learning_rate": 0.0002, + "loss": 1.14, + "step": 1410 + }, + { + "epoch": 1.5544608648056923, + "grad_norm": 0.3747410178184509, + "learning_rate": 0.0002, + "loss": 1.0775, + "step": 1420 + }, + { + "epoch": 1.565407772304324, + "grad_norm": 0.49385908246040344, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 1430 + }, + { + "epoch": 1.5763546798029555, + "grad_norm": 0.49831628799438477, + "learning_rate": 0.0002, + "loss": 1.1182, + "step": 1440 + }, + { + "epoch": 1.5873015873015874, + "grad_norm": 0.372127890586853, + "learning_rate": 0.0002, + "loss": 1.0079, + "step": 1450 + }, + { + "epoch": 1.598248494800219, + "grad_norm": 0.40070840716362, + "learning_rate": 0.0002, + "loss": 0.9931, + "step": 1460 + }, + { + "epoch": 1.6091954022988506, + "grad_norm": 0.34907400608062744, + "learning_rate": 0.0002, + "loss": 0.8954, + "step": 1470 + }, + { + "epoch": 1.6201423097974823, + "grad_norm": 0.4632418751716614, + "learning_rate": 0.0002, + "loss": 0.9743, + "step": 1480 + }, + { + "epoch": 1.6310892172961138, + "grad_norm": 0.40164515376091003, + "learning_rate": 0.0002, + "loss": 1.0103, + "step": 1490 + }, + { + "epoch": 1.6420361247947455, + "grad_norm": 0.3214994966983795, + "learning_rate": 0.0002, + "loss": 0.9523, + "step": 1500 + }, + { + "epoch": 1.6529830322933772, + "grad_norm": 0.3727897107601166, + "learning_rate": 0.0002, + "loss": 1.0161, + "step": 1510 + }, + { + "epoch": 1.6639299397920086, + "grad_norm": 0.3817640542984009, + "learning_rate": 0.0002, + "loss": 1.0443, + "step": 1520 + }, + { + "epoch": 1.6748768472906403, + "grad_norm": 0.5592136979103088, + "learning_rate": 0.0002, + "loss": 1.0511, + "step": 1530 + }, + { + "epoch": 1.685823754789272, + "grad_norm": 0.44636598229408264, + "learning_rate": 0.0002, + "loss": 0.9682, + "step": 1540 + }, + { + "epoch": 1.6967706622879035, + "grad_norm": 0.40441709756851196, + "learning_rate": 0.0002, + "loss": 1.033, + "step": 1550 + }, + { + "epoch": 1.7077175697865354, + "grad_norm": 0.3243522644042969, + "learning_rate": 0.0002, + "loss": 1.0857, + "step": 1560 + }, + { + "epoch": 1.718664477285167, + "grad_norm": 0.34277570247650146, + "learning_rate": 0.0002, + "loss": 0.9482, + "step": 1570 + }, + { + "epoch": 1.7296113847837986, + "grad_norm": 0.3279995024204254, + "learning_rate": 0.0002, + "loss": 0.974, + "step": 1580 + }, + { + "epoch": 1.7405582922824303, + "grad_norm": 0.41968777775764465, + "learning_rate": 0.0002, + "loss": 0.9414, + "step": 1590 + }, + { + "epoch": 1.7515051997810618, + "grad_norm": 0.39464613795280457, + "learning_rate": 0.0002, + "loss": 0.9768, + "step": 1600 + }, + { + "epoch": 1.7624521072796935, + "grad_norm": 0.3839009404182434, + "learning_rate": 0.0002, + "loss": 1.0347, + "step": 1610 + }, + { + "epoch": 1.7733990147783252, + "grad_norm": 0.3250715434551239, + "learning_rate": 0.0002, + "loss": 0.9195, + "step": 1620 + }, + { + "epoch": 1.7843459222769567, + "grad_norm": 0.5166561007499695, + "learning_rate": 0.0002, + "loss": 1.0049, + "step": 1630 + }, + { + "epoch": 1.7952928297755884, + "grad_norm": 0.4115183353424072, + "learning_rate": 0.0002, + "loss": 1.0364, + "step": 1640 + }, + { + "epoch": 1.80623973727422, + "grad_norm": 0.373780220746994, + "learning_rate": 0.0002, + "loss": 1.0248, + "step": 1650 + }, + { + "epoch": 1.8171866447728515, + "grad_norm": 0.49697014689445496, + "learning_rate": 0.0002, + "loss": 1.0984, + "step": 1660 + }, + { + "epoch": 1.8281335522714834, + "grad_norm": 1.0308938026428223, + "learning_rate": 0.0002, + "loss": 1.0089, + "step": 1670 + }, + { + "epoch": 1.839080459770115, + "grad_norm": 0.4851366877555847, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 1680 + }, + { + "epoch": 1.8500273672687466, + "grad_norm": 0.3262481391429901, + "learning_rate": 0.0002, + "loss": 0.9533, + "step": 1690 + }, + { + "epoch": 1.8609742747673783, + "grad_norm": 0.6904496550559998, + "learning_rate": 0.0002, + "loss": 1.048, + "step": 1700 + }, + { + "epoch": 1.8719211822660098, + "grad_norm": 0.49789851903915405, + "learning_rate": 0.0002, + "loss": 1.0577, + "step": 1710 + }, + { + "epoch": 1.8828680897646415, + "grad_norm": 0.3035794198513031, + "learning_rate": 0.0002, + "loss": 1.0258, + "step": 1720 + }, + { + "epoch": 1.8938149972632732, + "grad_norm": 0.4588414430618286, + "learning_rate": 0.0002, + "loss": 0.9916, + "step": 1730 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 0.4313034117221832, + "learning_rate": 0.0002, + "loss": 0.9526, + "step": 1740 + }, + { + "epoch": 1.9157088122605364, + "grad_norm": 0.38562044501304626, + "learning_rate": 0.0002, + "loss": 1.0857, + "step": 1750 + }, + { + "epoch": 1.926655719759168, + "grad_norm": 0.46947410702705383, + "learning_rate": 0.0002, + "loss": 0.9763, + "step": 1760 + }, + { + "epoch": 1.9376026272577995, + "grad_norm": 0.3848404884338379, + "learning_rate": 0.0002, + "loss": 0.9967, + "step": 1770 + }, + { + "epoch": 1.9485495347564314, + "grad_norm": 0.30422744154930115, + "learning_rate": 0.0002, + "loss": 1.0474, + "step": 1780 + }, + { + "epoch": 1.959496442255063, + "grad_norm": 0.41100990772247314, + "learning_rate": 0.0002, + "loss": 1.022, + "step": 1790 + }, + { + "epoch": 1.9704433497536946, + "grad_norm": 0.3492335081100464, + "learning_rate": 0.0002, + "loss": 1.003, + "step": 1800 + }, + { + "epoch": 1.9813902572523263, + "grad_norm": 0.364577978849411, + "learning_rate": 0.0002, + "loss": 0.9891, + "step": 1810 + }, + { + "epoch": 1.9923371647509578, + "grad_norm": 0.4312075674533844, + "learning_rate": 0.0002, + "loss": 1.0699, + "step": 1820 + }, + { + "epoch": 2.0, + "eval_loss": 1.14472496509552, + "eval_runtime": 46.0786, + "eval_samples_per_second": 9.462, + "eval_steps_per_second": 1.194, + "step": 1827 + } + ], + "logging_steps": 10, + "max_steps": 7304, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.384156326264832e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..895406585c61ed7f5a6760976fc06a40a6fd8b3b --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a158bca77bc6d58bfffd2acd2e9e641d481e1857fcbdd68fee875b74797abbc4 +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6f2d144ec6a452a0f1a79ed3c4f07edef4497c12 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:035921963b7ba3426c7b25b29c3e4e62d3764d0ef3b24abb0b53606c314dc350 +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/optimizer.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..444b1b7b10d84d1c897d3a25b29cd96d5c716d38 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d210241b2fbe25b1e819460216e4143f1484c8e6fc3636c95d1498e9b095ca51 +size 72886650 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/rng_state.pth b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1acbae1b01538922e3bad83dee13f59af4330f7e --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf917d2e89ee39a3e9cdfb5c9be783c7806b020f29415fe37710fbbf43833346 +size 14244 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/scheduler.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f0dc91e31a4046330b836b2990c20d6b3f2f25e --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c054e94e5454455e09f8cd2be3390222c38c11f62f45d2b0a3f8280bdbc1c254 +size 1064 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/trainer_state.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9a29bf43ac8715abdcc5aa8f9336e317817624c7 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/trainer_state.json @@ -0,0 +1,1975 @@ +{ + "best_metric": 1.14472496509552, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827", + "epoch": 2.9994526546250686, + "eval_steps": 10, + "global_step": 2740, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.010946907498631636, + "grad_norm": 0.7611560821533203, + "learning_rate": 0.0002, + "loss": 2.4039, + "step": 10 + }, + { + "epoch": 0.021893814997263273, + "grad_norm": 0.4633193612098694, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 20 + }, + { + "epoch": 0.03284072249589491, + "grad_norm": 0.49326154589653015, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 30 + }, + { + "epoch": 0.043787629994526546, + "grad_norm": 0.3943138122558594, + "learning_rate": 0.0002, + "loss": 1.3859, + "step": 40 + }, + { + "epoch": 0.05473453749315818, + "grad_norm": 0.43292930722236633, + "learning_rate": 0.0002, + "loss": 1.3036, + "step": 50 + }, + { + "epoch": 0.06568144499178982, + "grad_norm": 0.3431817591190338, + "learning_rate": 0.0002, + "loss": 1.2427, + "step": 60 + }, + { + "epoch": 0.07662835249042145, + "grad_norm": 0.38011446595191956, + "learning_rate": 0.0002, + "loss": 1.3076, + "step": 70 + }, + { + "epoch": 0.08757525998905309, + "grad_norm": 0.7441071271896362, + "learning_rate": 0.0002, + "loss": 1.1641, + "step": 80 + }, + { + "epoch": 0.09852216748768473, + "grad_norm": 0.3359833061695099, + "learning_rate": 0.0002, + "loss": 1.1721, + "step": 90 + }, + { + "epoch": 0.10946907498631636, + "grad_norm": 0.3724392354488373, + "learning_rate": 0.0002, + "loss": 1.2774, + "step": 100 + }, + { + "epoch": 0.120415982484948, + "grad_norm": 0.40673762559890747, + "learning_rate": 0.0002, + "loss": 1.216, + "step": 110 + }, + { + "epoch": 0.13136288998357964, + "grad_norm": 0.40036800503730774, + "learning_rate": 0.0002, + "loss": 1.2611, + "step": 120 + }, + { + "epoch": 0.1423097974822113, + "grad_norm": 2.844191312789917, + "learning_rate": 0.0002, + "loss": 1.2436, + "step": 130 + }, + { + "epoch": 0.1532567049808429, + "grad_norm": 0.3104734420776367, + "learning_rate": 0.0002, + "loss": 1.2254, + "step": 140 + }, + { + "epoch": 0.16420361247947454, + "grad_norm": 0.3266797959804535, + "learning_rate": 0.0002, + "loss": 1.0498, + "step": 150 + }, + { + "epoch": 0.17515051997810618, + "grad_norm": 0.3079199194908142, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 160 + }, + { + "epoch": 0.18609742747673783, + "grad_norm": 0.3872479498386383, + "learning_rate": 0.0002, + "loss": 1.2923, + "step": 170 + }, + { + "epoch": 0.19704433497536947, + "grad_norm": 0.38654500246047974, + "learning_rate": 0.0002, + "loss": 1.1809, + "step": 180 + }, + { + "epoch": 0.20799124247400108, + "grad_norm": 0.2913552522659302, + "learning_rate": 0.0002, + "loss": 1.066, + "step": 190 + }, + { + "epoch": 0.21893814997263272, + "grad_norm": 0.2960572838783264, + "learning_rate": 0.0002, + "loss": 1.0868, + "step": 200 + }, + { + "epoch": 0.22988505747126436, + "grad_norm": 0.5175501108169556, + "learning_rate": 0.0002, + "loss": 1.136, + "step": 210 + }, + { + "epoch": 0.240831964969896, + "grad_norm": 1.2921574115753174, + "learning_rate": 0.0002, + "loss": 1.1556, + "step": 220 + }, + { + "epoch": 0.25177887246852765, + "grad_norm": 0.2675512135028839, + "learning_rate": 0.0002, + "loss": 1.1488, + "step": 230 + }, + { + "epoch": 0.2627257799671593, + "grad_norm": 0.3956190049648285, + "learning_rate": 0.0002, + "loss": 1.2764, + "step": 240 + }, + { + "epoch": 0.27367268746579093, + "grad_norm": 0.6022581458091736, + "learning_rate": 0.0002, + "loss": 1.1889, + "step": 250 + }, + { + "epoch": 0.2846195949644226, + "grad_norm": 1.1949563026428223, + "learning_rate": 0.0002, + "loss": 1.1981, + "step": 260 + }, + { + "epoch": 0.2955665024630542, + "grad_norm": 0.31173548102378845, + "learning_rate": 0.0002, + "loss": 1.1877, + "step": 270 + }, + { + "epoch": 0.3065134099616858, + "grad_norm": 0.2808472812175751, + "learning_rate": 0.0002, + "loss": 1.06, + "step": 280 + }, + { + "epoch": 0.31746031746031744, + "grad_norm": 0.3042023777961731, + "learning_rate": 0.0002, + "loss": 1.1752, + "step": 290 + }, + { + "epoch": 0.3284072249589491, + "grad_norm": 0.39915043115615845, + "learning_rate": 0.0002, + "loss": 1.3147, + "step": 300 + }, + { + "epoch": 0.3393541324575807, + "grad_norm": 0.39118197560310364, + "learning_rate": 0.0002, + "loss": 1.2425, + "step": 310 + }, + { + "epoch": 0.35030103995621237, + "grad_norm": 0.355010986328125, + "learning_rate": 0.0002, + "loss": 1.1363, + "step": 320 + }, + { + "epoch": 0.361247947454844, + "grad_norm": 0.29734086990356445, + "learning_rate": 0.0002, + "loss": 1.1925, + "step": 330 + }, + { + "epoch": 0.37219485495347565, + "grad_norm": 0.346096009016037, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 340 + }, + { + "epoch": 0.3831417624521073, + "grad_norm": 0.4829643666744232, + "learning_rate": 0.0002, + "loss": 1.1641, + "step": 350 + }, + { + "epoch": 0.39408866995073893, + "grad_norm": 0.4726872742176056, + "learning_rate": 0.0002, + "loss": 1.2808, + "step": 360 + }, + { + "epoch": 0.4050355774493706, + "grad_norm": 0.3130153715610504, + "learning_rate": 0.0002, + "loss": 1.1532, + "step": 370 + }, + { + "epoch": 0.41598248494800216, + "grad_norm": 0.5123590230941772, + "learning_rate": 0.0002, + "loss": 1.1842, + "step": 380 + }, + { + "epoch": 0.4269293924466338, + "grad_norm": 0.3444574773311615, + "learning_rate": 0.0002, + "loss": 1.1539, + "step": 390 + }, + { + "epoch": 0.43787629994526545, + "grad_norm": 0.5302175283432007, + "learning_rate": 0.0002, + "loss": 1.1756, + "step": 400 + }, + { + "epoch": 0.4488232074438971, + "grad_norm": 0.2713572680950165, + "learning_rate": 0.0002, + "loss": 1.1138, + "step": 410 + }, + { + "epoch": 0.45977011494252873, + "grad_norm": 0.29530611634254456, + "learning_rate": 0.0002, + "loss": 1.1281, + "step": 420 + }, + { + "epoch": 0.47071702244116037, + "grad_norm": 0.27282455563545227, + "learning_rate": 0.0002, + "loss": 1.1721, + "step": 430 + }, + { + "epoch": 0.481663929939792, + "grad_norm": 0.2647949755191803, + "learning_rate": 0.0002, + "loss": 1.1213, + "step": 440 + }, + { + "epoch": 0.49261083743842365, + "grad_norm": 0.35509347915649414, + "learning_rate": 0.0002, + "loss": 1.1656, + "step": 450 + }, + { + "epoch": 0.5035577449370553, + "grad_norm": 0.1959609091281891, + "learning_rate": 0.0002, + "loss": 1.1251, + "step": 460 + }, + { + "epoch": 0.5145046524356869, + "grad_norm": 0.40090155601501465, + "learning_rate": 0.0002, + "loss": 1.1889, + "step": 470 + }, + { + "epoch": 0.5254515599343186, + "grad_norm": 0.3354604244232178, + "learning_rate": 0.0002, + "loss": 1.156, + "step": 480 + }, + { + "epoch": 0.5363984674329502, + "grad_norm": 0.2758506238460541, + "learning_rate": 0.0002, + "loss": 1.1024, + "step": 490 + }, + { + "epoch": 0.5473453749315819, + "grad_norm": 0.3629051744937897, + "learning_rate": 0.0002, + "loss": 1.1108, + "step": 500 + }, + { + "epoch": 0.5582922824302134, + "grad_norm": 0.30802229046821594, + "learning_rate": 0.0002, + "loss": 1.2236, + "step": 510 + }, + { + "epoch": 0.5692391899288451, + "grad_norm": 0.3099463880062103, + "learning_rate": 0.0002, + "loss": 1.0424, + "step": 520 + }, + { + "epoch": 0.5801860974274767, + "grad_norm": 0.42299067974090576, + "learning_rate": 0.0002, + "loss": 1.255, + "step": 530 + }, + { + "epoch": 0.5911330049261084, + "grad_norm": 0.5392252802848816, + "learning_rate": 0.0002, + "loss": 1.1698, + "step": 540 + }, + { + "epoch": 0.60207991242474, + "grad_norm": 0.34768250584602356, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 550 + }, + { + "epoch": 0.6130268199233716, + "grad_norm": 0.28490015864372253, + "learning_rate": 0.0002, + "loss": 1.102, + "step": 560 + }, + { + "epoch": 0.6239737274220033, + "grad_norm": 0.34787994623184204, + "learning_rate": 0.0002, + "loss": 1.1111, + "step": 570 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 0.29058772325515747, + "learning_rate": 0.0002, + "loss": 1.0759, + "step": 580 + }, + { + "epoch": 0.6458675424192666, + "grad_norm": 0.4063778817653656, + "learning_rate": 0.0002, + "loss": 1.1157, + "step": 590 + }, + { + "epoch": 0.6568144499178982, + "grad_norm": 0.9244267344474792, + "learning_rate": 0.0002, + "loss": 1.1432, + "step": 600 + }, + { + "epoch": 0.6677613574165299, + "grad_norm": 0.27605190873146057, + "learning_rate": 0.0002, + "loss": 1.0591, + "step": 610 + }, + { + "epoch": 0.6787082649151615, + "grad_norm": 0.34346821904182434, + "learning_rate": 0.0002, + "loss": 1.2123, + "step": 620 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 0.3093279302120209, + "learning_rate": 0.0002, + "loss": 1.2195, + "step": 630 + }, + { + "epoch": 0.7006020799124247, + "grad_norm": 1.0069009065628052, + "learning_rate": 0.0002, + "loss": 1.2461, + "step": 640 + }, + { + "epoch": 0.7115489874110563, + "grad_norm": 0.5049130916595459, + "learning_rate": 0.0002, + "loss": 1.0892, + "step": 650 + }, + { + "epoch": 0.722495894909688, + "grad_norm": 0.3748924732208252, + "learning_rate": 0.0002, + "loss": 1.1404, + "step": 660 + }, + { + "epoch": 0.7334428024083196, + "grad_norm": 0.2964959144592285, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 670 + }, + { + "epoch": 0.7443897099069513, + "grad_norm": 0.4599986970424652, + "learning_rate": 0.0002, + "loss": 1.2617, + "step": 680 + }, + { + "epoch": 0.7553366174055829, + "grad_norm": 0.27292951941490173, + "learning_rate": 0.0002, + "loss": 1.088, + "step": 690 + }, + { + "epoch": 0.7662835249042146, + "grad_norm": 0.3123566806316376, + "learning_rate": 0.0002, + "loss": 1.2047, + "step": 700 + }, + { + "epoch": 0.7772304324028462, + "grad_norm": 0.28310710191726685, + "learning_rate": 0.0002, + "loss": 1.0021, + "step": 710 + }, + { + "epoch": 0.7881773399014779, + "grad_norm": 0.3279992341995239, + "learning_rate": 0.0002, + "loss": 1.2281, + "step": 720 + }, + { + "epoch": 0.7991242474001095, + "grad_norm": 0.28179168701171875, + "learning_rate": 0.0002, + "loss": 1.25, + "step": 730 + }, + { + "epoch": 0.8100711548987412, + "grad_norm": 0.31492987275123596, + "learning_rate": 0.0002, + "loss": 1.0602, + "step": 740 + }, + { + "epoch": 0.8210180623973727, + "grad_norm": 0.41821011900901794, + "learning_rate": 0.0002, + "loss": 1.2518, + "step": 750 + }, + { + "epoch": 0.8319649698960043, + "grad_norm": 0.325235515832901, + "learning_rate": 0.0002, + "loss": 1.1612, + "step": 760 + }, + { + "epoch": 0.842911877394636, + "grad_norm": 0.5366070866584778, + "learning_rate": 0.0002, + "loss": 1.27, + "step": 770 + }, + { + "epoch": 0.8538587848932676, + "grad_norm": 0.32570579648017883, + "learning_rate": 0.0002, + "loss": 1.0921, + "step": 780 + }, + { + "epoch": 0.8648056923918993, + "grad_norm": 0.3642968237400055, + "learning_rate": 0.0002, + "loss": 1.1032, + "step": 790 + }, + { + "epoch": 0.8757525998905309, + "grad_norm": 0.29713448882102966, + "learning_rate": 0.0002, + "loss": 1.1234, + "step": 800 + }, + { + "epoch": 0.8866995073891626, + "grad_norm": 0.23599444329738617, + "learning_rate": 0.0002, + "loss": 1.0978, + "step": 810 + }, + { + "epoch": 0.8976464148877942, + "grad_norm": 0.31522464752197266, + "learning_rate": 0.0002, + "loss": 1.1867, + "step": 820 + }, + { + "epoch": 0.9085933223864259, + "grad_norm": 0.32754790782928467, + "learning_rate": 0.0002, + "loss": 1.0208, + "step": 830 + }, + { + "epoch": 0.9195402298850575, + "grad_norm": 0.22741089761257172, + "learning_rate": 0.0002, + "loss": 0.9786, + "step": 840 + }, + { + "epoch": 0.9304871373836892, + "grad_norm": 0.3089679777622223, + "learning_rate": 0.0002, + "loss": 1.0689, + "step": 850 + }, + { + "epoch": 0.9414340448823207, + "grad_norm": 0.27440521121025085, + "learning_rate": 0.0002, + "loss": 1.0354, + "step": 860 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.3498363792896271, + "learning_rate": 0.0002, + "loss": 1.0417, + "step": 870 + }, + { + "epoch": 0.963327859879584, + "grad_norm": 0.47151854634284973, + "learning_rate": 0.0002, + "loss": 1.269, + "step": 880 + }, + { + "epoch": 0.9742747673782156, + "grad_norm": 0.24756591022014618, + "learning_rate": 0.0002, + "loss": 1.1174, + "step": 890 + }, + { + "epoch": 0.9852216748768473, + "grad_norm": 0.2600938677787781, + "learning_rate": 0.0002, + "loss": 1.0622, + "step": 900 + }, + { + "epoch": 0.9961685823754789, + "grad_norm": 0.2934586703777313, + "learning_rate": 0.0002, + "loss": 1.0968, + "step": 910 + }, + { + "epoch": 0.9994526546250684, + "eval_loss": 1.158464789390564, + "eval_runtime": 46.0774, + "eval_samples_per_second": 9.462, + "eval_steps_per_second": 1.194, + "step": 913 + }, + { + "epoch": 1.0071154898741106, + "grad_norm": 0.37776654958724976, + "learning_rate": 0.0002, + "loss": 1.0322, + "step": 920 + }, + { + "epoch": 1.0180623973727423, + "grad_norm": 0.31784629821777344, + "learning_rate": 0.0002, + "loss": 1.1346, + "step": 930 + }, + { + "epoch": 1.0290093048713738, + "grad_norm": 0.24244336783885956, + "learning_rate": 0.0002, + "loss": 0.8976, + "step": 940 + }, + { + "epoch": 1.0399562123700055, + "grad_norm": 0.3185454308986664, + "learning_rate": 0.0002, + "loss": 1.0172, + "step": 950 + }, + { + "epoch": 1.0509031198686372, + "grad_norm": 0.3589441478252411, + "learning_rate": 0.0002, + "loss": 1.1645, + "step": 960 + }, + { + "epoch": 1.0618500273672686, + "grad_norm": 0.38593578338623047, + "learning_rate": 0.0002, + "loss": 1.0317, + "step": 970 + }, + { + "epoch": 1.0727969348659003, + "grad_norm": 0.39694955945014954, + "learning_rate": 0.0002, + "loss": 0.9913, + "step": 980 + }, + { + "epoch": 1.083743842364532, + "grad_norm": 0.469817191362381, + "learning_rate": 0.0002, + "loss": 1.0893, + "step": 990 + }, + { + "epoch": 1.0946907498631637, + "grad_norm": 0.2634755074977875, + "learning_rate": 0.0002, + "loss": 0.983, + "step": 1000 + }, + { + "epoch": 1.1056376573617952, + "grad_norm": 0.43189436197280884, + "learning_rate": 0.0002, + "loss": 1.0144, + "step": 1010 + }, + { + "epoch": 1.116584564860427, + "grad_norm": 0.5559977889060974, + "learning_rate": 0.0002, + "loss": 0.9663, + "step": 1020 + }, + { + "epoch": 1.1275314723590586, + "grad_norm": 0.32100191712379456, + "learning_rate": 0.0002, + "loss": 1.0481, + "step": 1030 + }, + { + "epoch": 1.1384783798576903, + "grad_norm": 0.40179768204689026, + "learning_rate": 0.0002, + "loss": 1.1012, + "step": 1040 + }, + { + "epoch": 1.1494252873563218, + "grad_norm": 0.3659493029117584, + "learning_rate": 0.0002, + "loss": 1.0029, + "step": 1050 + }, + { + "epoch": 1.1603721948549535, + "grad_norm": 0.701704204082489, + "learning_rate": 0.0002, + "loss": 0.9597, + "step": 1060 + }, + { + "epoch": 1.1713191023535852, + "grad_norm": 0.3650563359260559, + "learning_rate": 0.0002, + "loss": 1.0204, + "step": 1070 + }, + { + "epoch": 1.1822660098522166, + "grad_norm": 0.3191976249217987, + "learning_rate": 0.0002, + "loss": 0.907, + "step": 1080 + }, + { + "epoch": 1.1932129173508483, + "grad_norm": 0.3615441918373108, + "learning_rate": 0.0002, + "loss": 1.0648, + "step": 1090 + }, + { + "epoch": 1.20415982484948, + "grad_norm": 0.39474231004714966, + "learning_rate": 0.0002, + "loss": 1.0067, + "step": 1100 + }, + { + "epoch": 1.2151067323481117, + "grad_norm": 0.3752822279930115, + "learning_rate": 0.0002, + "loss": 0.9852, + "step": 1110 + }, + { + "epoch": 1.2260536398467432, + "grad_norm": 0.4165991246700287, + "learning_rate": 0.0002, + "loss": 1.0067, + "step": 1120 + }, + { + "epoch": 1.237000547345375, + "grad_norm": 0.5326506495475769, + "learning_rate": 0.0002, + "loss": 1.0244, + "step": 1130 + }, + { + "epoch": 1.2479474548440066, + "grad_norm": 0.48845794796943665, + "learning_rate": 0.0002, + "loss": 1.0542, + "step": 1140 + }, + { + "epoch": 1.2588943623426383, + "grad_norm": 0.29910150170326233, + "learning_rate": 0.0002, + "loss": 1.0885, + "step": 1150 + }, + { + "epoch": 1.2698412698412698, + "grad_norm": 0.5069725513458252, + "learning_rate": 0.0002, + "loss": 1.2233, + "step": 1160 + }, + { + "epoch": 1.2807881773399015, + "grad_norm": 0.29500406980514526, + "learning_rate": 0.0002, + "loss": 1.0992, + "step": 1170 + }, + { + "epoch": 1.2917350848385332, + "grad_norm": 0.4711538851261139, + "learning_rate": 0.0002, + "loss": 1.0291, + "step": 1180 + }, + { + "epoch": 1.3026819923371646, + "grad_norm": 0.4203340709209442, + "learning_rate": 0.0002, + "loss": 1.0513, + "step": 1190 + }, + { + "epoch": 1.3136288998357963, + "grad_norm": 0.36101874709129333, + "learning_rate": 0.0002, + "loss": 1.1274, + "step": 1200 + }, + { + "epoch": 1.324575807334428, + "grad_norm": 0.4608800411224365, + "learning_rate": 0.0002, + "loss": 1.085, + "step": 1210 + }, + { + "epoch": 1.3355227148330597, + "grad_norm": 0.6570906639099121, + "learning_rate": 0.0002, + "loss": 1.0695, + "step": 1220 + }, + { + "epoch": 1.3464696223316914, + "grad_norm": 0.5352164506912231, + "learning_rate": 0.0002, + "loss": 0.9786, + "step": 1230 + }, + { + "epoch": 1.357416529830323, + "grad_norm": 0.3885001242160797, + "learning_rate": 0.0002, + "loss": 1.0513, + "step": 1240 + }, + { + "epoch": 1.3683634373289546, + "grad_norm": 0.2987913489341736, + "learning_rate": 0.0002, + "loss": 1.0611, + "step": 1250 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 0.42070427536964417, + "learning_rate": 0.0002, + "loss": 1.109, + "step": 1260 + }, + { + "epoch": 1.3902572523262178, + "grad_norm": 0.5957782864570618, + "learning_rate": 0.0002, + "loss": 1.1092, + "step": 1270 + }, + { + "epoch": 1.4012041598248495, + "grad_norm": 0.32898882031440735, + "learning_rate": 0.0002, + "loss": 1.0554, + "step": 1280 + }, + { + "epoch": 1.4121510673234812, + "grad_norm": 0.27624452114105225, + "learning_rate": 0.0002, + "loss": 0.909, + "step": 1290 + }, + { + "epoch": 1.4230979748221126, + "grad_norm": 0.49570828676223755, + "learning_rate": 0.0002, + "loss": 1.024, + "step": 1300 + }, + { + "epoch": 1.4340448823207443, + "grad_norm": 0.26191383600234985, + "learning_rate": 0.0002, + "loss": 1.0471, + "step": 1310 + }, + { + "epoch": 1.444991789819376, + "grad_norm": 0.35664042830467224, + "learning_rate": 0.0002, + "loss": 1.0788, + "step": 1320 + }, + { + "epoch": 1.4559386973180077, + "grad_norm": 0.45126354694366455, + "learning_rate": 0.0002, + "loss": 1.0108, + "step": 1330 + }, + { + "epoch": 1.4668856048166394, + "grad_norm": 0.37318357825279236, + "learning_rate": 0.0002, + "loss": 0.9571, + "step": 1340 + }, + { + "epoch": 1.477832512315271, + "grad_norm": 0.6428970098495483, + "learning_rate": 0.0002, + "loss": 1.0507, + "step": 1350 + }, + { + "epoch": 1.4887794198139026, + "grad_norm": 0.43256187438964844, + "learning_rate": 0.0002, + "loss": 1.0367, + "step": 1360 + }, + { + "epoch": 1.4997263273125343, + "grad_norm": 0.5343793630599976, + "learning_rate": 0.0002, + "loss": 1.1321, + "step": 1370 + }, + { + "epoch": 1.5106732348111658, + "grad_norm": 0.315437376499176, + "learning_rate": 0.0002, + "loss": 1.1054, + "step": 1380 + }, + { + "epoch": 1.5216201423097975, + "grad_norm": 0.41561153531074524, + "learning_rate": 0.0002, + "loss": 0.8916, + "step": 1390 + }, + { + "epoch": 1.5325670498084292, + "grad_norm": 0.3201070725917816, + "learning_rate": 0.0002, + "loss": 1.0391, + "step": 1400 + }, + { + "epoch": 1.5435139573070606, + "grad_norm": 0.505537211894989, + "learning_rate": 0.0002, + "loss": 1.14, + "step": 1410 + }, + { + "epoch": 1.5544608648056923, + "grad_norm": 0.3747410178184509, + "learning_rate": 0.0002, + "loss": 1.0775, + "step": 1420 + }, + { + "epoch": 1.565407772304324, + "grad_norm": 0.49385908246040344, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 1430 + }, + { + "epoch": 1.5763546798029555, + "grad_norm": 0.49831628799438477, + "learning_rate": 0.0002, + "loss": 1.1182, + "step": 1440 + }, + { + "epoch": 1.5873015873015874, + "grad_norm": 0.372127890586853, + "learning_rate": 0.0002, + "loss": 1.0079, + "step": 1450 + }, + { + "epoch": 1.598248494800219, + "grad_norm": 0.40070840716362, + "learning_rate": 0.0002, + "loss": 0.9931, + "step": 1460 + }, + { + "epoch": 1.6091954022988506, + "grad_norm": 0.34907400608062744, + "learning_rate": 0.0002, + "loss": 0.8954, + "step": 1470 + }, + { + "epoch": 1.6201423097974823, + "grad_norm": 0.4632418751716614, + "learning_rate": 0.0002, + "loss": 0.9743, + "step": 1480 + }, + { + "epoch": 1.6310892172961138, + "grad_norm": 0.40164515376091003, + "learning_rate": 0.0002, + "loss": 1.0103, + "step": 1490 + }, + { + "epoch": 1.6420361247947455, + "grad_norm": 0.3214994966983795, + "learning_rate": 0.0002, + "loss": 0.9523, + "step": 1500 + }, + { + "epoch": 1.6529830322933772, + "grad_norm": 0.3727897107601166, + "learning_rate": 0.0002, + "loss": 1.0161, + "step": 1510 + }, + { + "epoch": 1.6639299397920086, + "grad_norm": 0.3817640542984009, + "learning_rate": 0.0002, + "loss": 1.0443, + "step": 1520 + }, + { + "epoch": 1.6748768472906403, + "grad_norm": 0.5592136979103088, + "learning_rate": 0.0002, + "loss": 1.0511, + "step": 1530 + }, + { + "epoch": 1.685823754789272, + "grad_norm": 0.44636598229408264, + "learning_rate": 0.0002, + "loss": 0.9682, + "step": 1540 + }, + { + "epoch": 1.6967706622879035, + "grad_norm": 0.40441709756851196, + "learning_rate": 0.0002, + "loss": 1.033, + "step": 1550 + }, + { + "epoch": 1.7077175697865354, + "grad_norm": 0.3243522644042969, + "learning_rate": 0.0002, + "loss": 1.0857, + "step": 1560 + }, + { + "epoch": 1.718664477285167, + "grad_norm": 0.34277570247650146, + "learning_rate": 0.0002, + "loss": 0.9482, + "step": 1570 + }, + { + "epoch": 1.7296113847837986, + "grad_norm": 0.3279995024204254, + "learning_rate": 0.0002, + "loss": 0.974, + "step": 1580 + }, + { + "epoch": 1.7405582922824303, + "grad_norm": 0.41968777775764465, + "learning_rate": 0.0002, + "loss": 0.9414, + "step": 1590 + }, + { + "epoch": 1.7515051997810618, + "grad_norm": 0.39464613795280457, + "learning_rate": 0.0002, + "loss": 0.9768, + "step": 1600 + }, + { + "epoch": 1.7624521072796935, + "grad_norm": 0.3839009404182434, + "learning_rate": 0.0002, + "loss": 1.0347, + "step": 1610 + }, + { + "epoch": 1.7733990147783252, + "grad_norm": 0.3250715434551239, + "learning_rate": 0.0002, + "loss": 0.9195, + "step": 1620 + }, + { + "epoch": 1.7843459222769567, + "grad_norm": 0.5166561007499695, + "learning_rate": 0.0002, + "loss": 1.0049, + "step": 1630 + }, + { + "epoch": 1.7952928297755884, + "grad_norm": 0.4115183353424072, + "learning_rate": 0.0002, + "loss": 1.0364, + "step": 1640 + }, + { + "epoch": 1.80623973727422, + "grad_norm": 0.373780220746994, + "learning_rate": 0.0002, + "loss": 1.0248, + "step": 1650 + }, + { + "epoch": 1.8171866447728515, + "grad_norm": 0.49697014689445496, + "learning_rate": 0.0002, + "loss": 1.0984, + "step": 1660 + }, + { + "epoch": 1.8281335522714834, + "grad_norm": 1.0308938026428223, + "learning_rate": 0.0002, + "loss": 1.0089, + "step": 1670 + }, + { + "epoch": 1.839080459770115, + "grad_norm": 0.4851366877555847, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 1680 + }, + { + "epoch": 1.8500273672687466, + "grad_norm": 0.3262481391429901, + "learning_rate": 0.0002, + "loss": 0.9533, + "step": 1690 + }, + { + "epoch": 1.8609742747673783, + "grad_norm": 0.6904496550559998, + "learning_rate": 0.0002, + "loss": 1.048, + "step": 1700 + }, + { + "epoch": 1.8719211822660098, + "grad_norm": 0.49789851903915405, + "learning_rate": 0.0002, + "loss": 1.0577, + "step": 1710 + }, + { + "epoch": 1.8828680897646415, + "grad_norm": 0.3035794198513031, + "learning_rate": 0.0002, + "loss": 1.0258, + "step": 1720 + }, + { + "epoch": 1.8938149972632732, + "grad_norm": 0.4588414430618286, + "learning_rate": 0.0002, + "loss": 0.9916, + "step": 1730 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 0.4313034117221832, + "learning_rate": 0.0002, + "loss": 0.9526, + "step": 1740 + }, + { + "epoch": 1.9157088122605364, + "grad_norm": 0.38562044501304626, + "learning_rate": 0.0002, + "loss": 1.0857, + "step": 1750 + }, + { + "epoch": 1.926655719759168, + "grad_norm": 0.46947410702705383, + "learning_rate": 0.0002, + "loss": 0.9763, + "step": 1760 + }, + { + "epoch": 1.9376026272577995, + "grad_norm": 0.3848404884338379, + "learning_rate": 0.0002, + "loss": 0.9967, + "step": 1770 + }, + { + "epoch": 1.9485495347564314, + "grad_norm": 0.30422744154930115, + "learning_rate": 0.0002, + "loss": 1.0474, + "step": 1780 + }, + { + "epoch": 1.959496442255063, + "grad_norm": 0.41100990772247314, + "learning_rate": 0.0002, + "loss": 1.022, + "step": 1790 + }, + { + "epoch": 1.9704433497536946, + "grad_norm": 0.3492335081100464, + "learning_rate": 0.0002, + "loss": 1.003, + "step": 1800 + }, + { + "epoch": 1.9813902572523263, + "grad_norm": 0.364577978849411, + "learning_rate": 0.0002, + "loss": 0.9891, + "step": 1810 + }, + { + "epoch": 1.9923371647509578, + "grad_norm": 0.4312075674533844, + "learning_rate": 0.0002, + "loss": 1.0699, + "step": 1820 + }, + { + "epoch": 2.0, + "eval_loss": 1.14472496509552, + "eval_runtime": 46.0786, + "eval_samples_per_second": 9.462, + "eval_steps_per_second": 1.194, + "step": 1827 + }, + { + "epoch": 2.0032840722495897, + "grad_norm": 0.5989689230918884, + "learning_rate": 0.0002, + "loss": 1.0218, + "step": 1830 + }, + { + "epoch": 2.014230979748221, + "grad_norm": 0.49720922112464905, + "learning_rate": 0.0002, + "loss": 0.9384, + "step": 1840 + }, + { + "epoch": 2.0251778872468527, + "grad_norm": 0.42675456404685974, + "learning_rate": 0.0002, + "loss": 0.8482, + "step": 1850 + }, + { + "epoch": 2.0361247947454846, + "grad_norm": 0.4637208580970764, + "learning_rate": 0.0002, + "loss": 0.8487, + "step": 1860 + }, + { + "epoch": 2.047071702244116, + "grad_norm": 0.8329976797103882, + "learning_rate": 0.0002, + "loss": 0.8212, + "step": 1870 + }, + { + "epoch": 2.0580186097427475, + "grad_norm": 0.7869427800178528, + "learning_rate": 0.0002, + "loss": 0.9547, + "step": 1880 + }, + { + "epoch": 2.0689655172413794, + "grad_norm": 0.4927455186843872, + "learning_rate": 0.0002, + "loss": 0.9351, + "step": 1890 + }, + { + "epoch": 2.079912424740011, + "grad_norm": 0.6264246702194214, + "learning_rate": 0.0002, + "loss": 0.8652, + "step": 1900 + }, + { + "epoch": 2.0908593322386424, + "grad_norm": 1.1164122819900513, + "learning_rate": 0.0002, + "loss": 0.9095, + "step": 1910 + }, + { + "epoch": 2.1018062397372743, + "grad_norm": 0.5283981561660767, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 1920 + }, + { + "epoch": 2.112753147235906, + "grad_norm": 0.45621731877326965, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 1930 + }, + { + "epoch": 2.1237000547345373, + "grad_norm": 1.381791591644287, + "learning_rate": 0.0002, + "loss": 0.9184, + "step": 1940 + }, + { + "epoch": 2.134646962233169, + "grad_norm": 0.5151259899139404, + "learning_rate": 0.0002, + "loss": 0.9006, + "step": 1950 + }, + { + "epoch": 2.1455938697318007, + "grad_norm": 0.9806339740753174, + "learning_rate": 0.0002, + "loss": 0.8436, + "step": 1960 + }, + { + "epoch": 2.1565407772304326, + "grad_norm": 0.4734154939651489, + "learning_rate": 0.0002, + "loss": 0.8749, + "step": 1970 + }, + { + "epoch": 2.167487684729064, + "grad_norm": 0.9553168416023254, + "learning_rate": 0.0002, + "loss": 0.9172, + "step": 1980 + }, + { + "epoch": 2.1784345922276955, + "grad_norm": 0.5895838141441345, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1990 + }, + { + "epoch": 2.1893814997263275, + "grad_norm": 0.4488855302333832, + "learning_rate": 0.0002, + "loss": 0.7841, + "step": 2000 + }, + { + "epoch": 2.200328407224959, + "grad_norm": 1.0760235786437988, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 2010 + }, + { + "epoch": 2.2112753147235904, + "grad_norm": 0.5038785338401794, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 2020 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.59819495677948, + "learning_rate": 0.0002, + "loss": 0.8973, + "step": 2030 + }, + { + "epoch": 2.233169129720854, + "grad_norm": 0.5012075304985046, + "learning_rate": 0.0002, + "loss": 0.8009, + "step": 2040 + }, + { + "epoch": 2.2441160372194853, + "grad_norm": 0.44978439807891846, + "learning_rate": 0.0002, + "loss": 0.9087, + "step": 2050 + }, + { + "epoch": 2.255062944718117, + "grad_norm": 0.5350462198257446, + "learning_rate": 0.0002, + "loss": 0.823, + "step": 2060 + }, + { + "epoch": 2.2660098522167487, + "grad_norm": 0.6020669937133789, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 2070 + }, + { + "epoch": 2.2769567597153806, + "grad_norm": 0.5246821045875549, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 2080 + }, + { + "epoch": 2.287903667214012, + "grad_norm": 0.5711268782615662, + "learning_rate": 0.0002, + "loss": 0.8984, + "step": 2090 + }, + { + "epoch": 2.2988505747126435, + "grad_norm": 0.617317259311676, + "learning_rate": 0.0002, + "loss": 0.9093, + "step": 2100 + }, + { + "epoch": 2.3097974822112755, + "grad_norm": 0.8608947396278381, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 2110 + }, + { + "epoch": 2.320744389709907, + "grad_norm": 0.4739076793193817, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 2120 + }, + { + "epoch": 2.3316912972085384, + "grad_norm": 0.5538856983184814, + "learning_rate": 0.0002, + "loss": 0.84, + "step": 2130 + }, + { + "epoch": 2.3426382047071703, + "grad_norm": 0.6064935326576233, + "learning_rate": 0.0002, + "loss": 0.8994, + "step": 2140 + }, + { + "epoch": 2.353585112205802, + "grad_norm": 0.5019068121910095, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 2150 + }, + { + "epoch": 2.3645320197044333, + "grad_norm": 0.45340514183044434, + "learning_rate": 0.0002, + "loss": 0.9576, + "step": 2160 + }, + { + "epoch": 2.375478927203065, + "grad_norm": 0.7347203493118286, + "learning_rate": 0.0002, + "loss": 0.9254, + "step": 2170 + }, + { + "epoch": 2.3864258347016967, + "grad_norm": 0.46922534704208374, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 2180 + }, + { + "epoch": 2.3973727422003286, + "grad_norm": 0.5507845878601074, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 2190 + }, + { + "epoch": 2.40831964969896, + "grad_norm": 0.5621911883354187, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 2200 + }, + { + "epoch": 2.4192665571975915, + "grad_norm": 0.5023514032363892, + "learning_rate": 0.0002, + "loss": 0.8569, + "step": 2210 + }, + { + "epoch": 2.4302134646962235, + "grad_norm": 0.6124861240386963, + "learning_rate": 0.0002, + "loss": 0.8355, + "step": 2220 + }, + { + "epoch": 2.441160372194855, + "grad_norm": 0.49614205956459045, + "learning_rate": 0.0002, + "loss": 0.885, + "step": 2230 + }, + { + "epoch": 2.4521072796934864, + "grad_norm": 0.6477900743484497, + "learning_rate": 0.0002, + "loss": 0.8008, + "step": 2240 + }, + { + "epoch": 2.4630541871921183, + "grad_norm": 0.5868843793869019, + "learning_rate": 0.0002, + "loss": 0.8622, + "step": 2250 + }, + { + "epoch": 2.47400109469075, + "grad_norm": 0.4364610016345978, + "learning_rate": 0.0002, + "loss": 0.8498, + "step": 2260 + }, + { + "epoch": 2.4849480021893813, + "grad_norm": 0.5792964696884155, + "learning_rate": 0.0002, + "loss": 0.8378, + "step": 2270 + }, + { + "epoch": 2.495894909688013, + "grad_norm": 0.5421269536018372, + "learning_rate": 0.0002, + "loss": 0.8743, + "step": 2280 + }, + { + "epoch": 2.5068418171866447, + "grad_norm": 0.5525493025779724, + "learning_rate": 0.0002, + "loss": 0.9637, + "step": 2290 + }, + { + "epoch": 2.5177887246852766, + "grad_norm": 0.6463850140571594, + "learning_rate": 0.0002, + "loss": 0.8075, + "step": 2300 + }, + { + "epoch": 2.528735632183908, + "grad_norm": 0.6861311793327332, + "learning_rate": 0.0002, + "loss": 0.8591, + "step": 2310 + }, + { + "epoch": 2.5396825396825395, + "grad_norm": 0.5563555359840393, + "learning_rate": 0.0002, + "loss": 0.9287, + "step": 2320 + }, + { + "epoch": 2.5506294471811715, + "grad_norm": 0.5721169114112854, + "learning_rate": 0.0002, + "loss": 0.945, + "step": 2330 + }, + { + "epoch": 2.561576354679803, + "grad_norm": 0.5258274674415588, + "learning_rate": 0.0002, + "loss": 0.8271, + "step": 2340 + }, + { + "epoch": 2.572523262178435, + "grad_norm": 0.7057380676269531, + "learning_rate": 0.0002, + "loss": 0.8515, + "step": 2350 + }, + { + "epoch": 2.5834701696770663, + "grad_norm": 0.6869027614593506, + "learning_rate": 0.0002, + "loss": 0.8615, + "step": 2360 + }, + { + "epoch": 2.594417077175698, + "grad_norm": 0.4960809648036957, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 2370 + }, + { + "epoch": 2.6053639846743293, + "grad_norm": 0.9288380146026611, + "learning_rate": 0.0002, + "loss": 0.8476, + "step": 2380 + }, + { + "epoch": 2.616310892172961, + "grad_norm": 0.3765334188938141, + "learning_rate": 0.0002, + "loss": 0.873, + "step": 2390 + }, + { + "epoch": 2.6272577996715927, + "grad_norm": 0.7487865686416626, + "learning_rate": 0.0002, + "loss": 0.8764, + "step": 2400 + }, + { + "epoch": 2.6382047071702246, + "grad_norm": 0.6141156554222107, + "learning_rate": 0.0002, + "loss": 0.7577, + "step": 2410 + }, + { + "epoch": 2.649151614668856, + "grad_norm": 0.8420507907867432, + "learning_rate": 0.0002, + "loss": 0.8534, + "step": 2420 + }, + { + "epoch": 2.6600985221674875, + "grad_norm": 0.53386390209198, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 2430 + }, + { + "epoch": 2.6710454296661195, + "grad_norm": 0.5520607233047485, + "learning_rate": 0.0002, + "loss": 0.8486, + "step": 2440 + }, + { + "epoch": 2.681992337164751, + "grad_norm": 0.5337599515914917, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 2450 + }, + { + "epoch": 2.692939244663383, + "grad_norm": 0.48790836334228516, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 2460 + }, + { + "epoch": 2.7038861521620143, + "grad_norm": 0.8287786245346069, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 2470 + }, + { + "epoch": 2.714833059660646, + "grad_norm": 0.5876168608665466, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 2480 + }, + { + "epoch": 2.7257799671592773, + "grad_norm": 0.5206760764122009, + "learning_rate": 0.0002, + "loss": 0.8773, + "step": 2490 + }, + { + "epoch": 2.736726874657909, + "grad_norm": 0.5619136691093445, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 2500 + }, + { + "epoch": 2.7476737821565407, + "grad_norm": 0.5614883899688721, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 2510 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 0.6157700419425964, + "learning_rate": 0.0002, + "loss": 0.8817, + "step": 2520 + }, + { + "epoch": 2.769567597153804, + "grad_norm": 0.5529953837394714, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 2530 + }, + { + "epoch": 2.7805145046524355, + "grad_norm": 0.6731224060058594, + "learning_rate": 0.0002, + "loss": 0.8881, + "step": 2540 + }, + { + "epoch": 2.7914614121510675, + "grad_norm": 0.6960386633872986, + "learning_rate": 0.0002, + "loss": 0.9125, + "step": 2550 + }, + { + "epoch": 2.802408319649699, + "grad_norm": 0.5203493237495422, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 2560 + }, + { + "epoch": 2.813355227148331, + "grad_norm": 1.036837100982666, + "learning_rate": 0.0002, + "loss": 0.7951, + "step": 2570 + }, + { + "epoch": 2.8243021346469623, + "grad_norm": 0.6125805377960205, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 2580 + }, + { + "epoch": 2.835249042145594, + "grad_norm": 0.6298092603683472, + "learning_rate": 0.0002, + "loss": 0.8939, + "step": 2590 + }, + { + "epoch": 2.8461959496442253, + "grad_norm": 0.5882203578948975, + "learning_rate": 0.0002, + "loss": 0.8845, + "step": 2600 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.8619399666786194, + "learning_rate": 0.0002, + "loss": 0.8921, + "step": 2610 + }, + { + "epoch": 2.8680897646414887, + "grad_norm": 0.4722687304019928, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 2620 + }, + { + "epoch": 2.8790366721401206, + "grad_norm": 0.47399574518203735, + "learning_rate": 0.0002, + "loss": 0.9147, + "step": 2630 + }, + { + "epoch": 2.889983579638752, + "grad_norm": 0.5639172792434692, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 2640 + }, + { + "epoch": 2.9009304871373836, + "grad_norm": 0.4676816761493683, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 2650 + }, + { + "epoch": 2.9118773946360155, + "grad_norm": 0.6906291246414185, + "learning_rate": 0.0002, + "loss": 0.8966, + "step": 2660 + }, + { + "epoch": 2.922824302134647, + "grad_norm": 0.4369746148586273, + "learning_rate": 0.0002, + "loss": 0.8966, + "step": 2670 + }, + { + "epoch": 2.933771209633279, + "grad_norm": 0.46423083543777466, + "learning_rate": 0.0002, + "loss": 0.9173, + "step": 2680 + }, + { + "epoch": 2.9447181171319103, + "grad_norm": 0.5700525045394897, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 2690 + }, + { + "epoch": 2.955665024630542, + "grad_norm": 0.6221476793289185, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 2700 + }, + { + "epoch": 2.9666119321291733, + "grad_norm": 0.6102682948112488, + "learning_rate": 0.0002, + "loss": 0.8538, + "step": 2710 + }, + { + "epoch": 2.977558839627805, + "grad_norm": 0.5317878723144531, + "learning_rate": 0.0002, + "loss": 0.9779, + "step": 2720 + }, + { + "epoch": 2.9885057471264367, + "grad_norm": 0.4438510835170746, + "learning_rate": 0.0002, + "loss": 0.9314, + "step": 2730 + }, + { + "epoch": 2.9994526546250686, + "grad_norm": 0.5022130012512207, + "learning_rate": 0.0002, + "loss": 0.8553, + "step": 2740 + }, + { + "epoch": 2.9994526546250686, + "eval_loss": 1.1722838878631592, + "eval_runtime": 46.0829, + "eval_samples_per_second": 9.461, + "eval_steps_per_second": 1.194, + "step": 2740 + } + ], + "logging_steps": 10, + "max_steps": 7304, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.4076234489397248e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..895406585c61ed7f5a6760976fc06a40a6fd8b3b --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-2740/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a158bca77bc6d58bfffd2acd2e9e641d481e1857fcbdd68fee875b74797abbc4 +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b37c20bd19b840d16f58d759d64f28458e6c763c --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a84388d4ac5738c377bd47af9ca50506439a278bc680479cf2db90b15345be18 +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/optimizer.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f9b81e2e43136a24c63b7a88ec19f174e1c3141 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93866aa1024d72e1ac54b17d93e37aa192ee0d448c3afaeafe3afb9d48dbf0de +size 72886650 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/rng_state.pth b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d0520f243b89737759e558104789698fffbe6097 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b06f9b81fb40fc97b17f99738dd19e250800243191cab070d3f261467f044a84 +size 14244 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/scheduler.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..07e4755f3a4387c1a7422a665bea15f78d9493f7 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5002c510756345ccba22822baa17f31534958c278f8a15afb59f45002a8e21d9 +size 1064 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/trainer_state.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7a222fa137afff280b5bdc15dcccc9b1ec90571a --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/trainer_state.json @@ -0,0 +1,2620 @@ +{ + "best_metric": 1.14472496509552, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 3654, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.010946907498631636, + "grad_norm": 0.7611560821533203, + "learning_rate": 0.0002, + "loss": 2.4039, + "step": 10 + }, + { + "epoch": 0.021893814997263273, + "grad_norm": 0.4633193612098694, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 20 + }, + { + "epoch": 0.03284072249589491, + "grad_norm": 0.49326154589653015, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 30 + }, + { + "epoch": 0.043787629994526546, + "grad_norm": 0.3943138122558594, + "learning_rate": 0.0002, + "loss": 1.3859, + "step": 40 + }, + { + "epoch": 0.05473453749315818, + "grad_norm": 0.43292930722236633, + "learning_rate": 0.0002, + "loss": 1.3036, + "step": 50 + }, + { + "epoch": 0.06568144499178982, + "grad_norm": 0.3431817591190338, + "learning_rate": 0.0002, + "loss": 1.2427, + "step": 60 + }, + { + "epoch": 0.07662835249042145, + "grad_norm": 0.38011446595191956, + "learning_rate": 0.0002, + "loss": 1.3076, + "step": 70 + }, + { + "epoch": 0.08757525998905309, + "grad_norm": 0.7441071271896362, + "learning_rate": 0.0002, + "loss": 1.1641, + "step": 80 + }, + { + "epoch": 0.09852216748768473, + "grad_norm": 0.3359833061695099, + "learning_rate": 0.0002, + "loss": 1.1721, + "step": 90 + }, + { + "epoch": 0.10946907498631636, + "grad_norm": 0.3724392354488373, + "learning_rate": 0.0002, + "loss": 1.2774, + "step": 100 + }, + { + "epoch": 0.120415982484948, + "grad_norm": 0.40673762559890747, + "learning_rate": 0.0002, + "loss": 1.216, + "step": 110 + }, + { + "epoch": 0.13136288998357964, + "grad_norm": 0.40036800503730774, + "learning_rate": 0.0002, + "loss": 1.2611, + "step": 120 + }, + { + "epoch": 0.1423097974822113, + "grad_norm": 2.844191312789917, + "learning_rate": 0.0002, + "loss": 1.2436, + "step": 130 + }, + { + "epoch": 0.1532567049808429, + "grad_norm": 0.3104734420776367, + "learning_rate": 0.0002, + "loss": 1.2254, + "step": 140 + }, + { + "epoch": 0.16420361247947454, + "grad_norm": 0.3266797959804535, + "learning_rate": 0.0002, + "loss": 1.0498, + "step": 150 + }, + { + "epoch": 0.17515051997810618, + "grad_norm": 0.3079199194908142, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 160 + }, + { + "epoch": 0.18609742747673783, + "grad_norm": 0.3872479498386383, + "learning_rate": 0.0002, + "loss": 1.2923, + "step": 170 + }, + { + "epoch": 0.19704433497536947, + "grad_norm": 0.38654500246047974, + "learning_rate": 0.0002, + "loss": 1.1809, + "step": 180 + }, + { + "epoch": 0.20799124247400108, + "grad_norm": 0.2913552522659302, + "learning_rate": 0.0002, + "loss": 1.066, + "step": 190 + }, + { + "epoch": 0.21893814997263272, + "grad_norm": 0.2960572838783264, + "learning_rate": 0.0002, + "loss": 1.0868, + "step": 200 + }, + { + "epoch": 0.22988505747126436, + "grad_norm": 0.5175501108169556, + "learning_rate": 0.0002, + "loss": 1.136, + "step": 210 + }, + { + "epoch": 0.240831964969896, + "grad_norm": 1.2921574115753174, + "learning_rate": 0.0002, + "loss": 1.1556, + "step": 220 + }, + { + "epoch": 0.25177887246852765, + "grad_norm": 0.2675512135028839, + "learning_rate": 0.0002, + "loss": 1.1488, + "step": 230 + }, + { + "epoch": 0.2627257799671593, + "grad_norm": 0.3956190049648285, + "learning_rate": 0.0002, + "loss": 1.2764, + "step": 240 + }, + { + "epoch": 0.27367268746579093, + "grad_norm": 0.6022581458091736, + "learning_rate": 0.0002, + "loss": 1.1889, + "step": 250 + }, + { + "epoch": 0.2846195949644226, + "grad_norm": 1.1949563026428223, + "learning_rate": 0.0002, + "loss": 1.1981, + "step": 260 + }, + { + "epoch": 0.2955665024630542, + "grad_norm": 0.31173548102378845, + "learning_rate": 0.0002, + "loss": 1.1877, + "step": 270 + }, + { + "epoch": 0.3065134099616858, + "grad_norm": 0.2808472812175751, + "learning_rate": 0.0002, + "loss": 1.06, + "step": 280 + }, + { + "epoch": 0.31746031746031744, + "grad_norm": 0.3042023777961731, + "learning_rate": 0.0002, + "loss": 1.1752, + "step": 290 + }, + { + "epoch": 0.3284072249589491, + "grad_norm": 0.39915043115615845, + "learning_rate": 0.0002, + "loss": 1.3147, + "step": 300 + }, + { + "epoch": 0.3393541324575807, + "grad_norm": 0.39118197560310364, + "learning_rate": 0.0002, + "loss": 1.2425, + "step": 310 + }, + { + "epoch": 0.35030103995621237, + "grad_norm": 0.355010986328125, + "learning_rate": 0.0002, + "loss": 1.1363, + "step": 320 + }, + { + "epoch": 0.361247947454844, + "grad_norm": 0.29734086990356445, + "learning_rate": 0.0002, + "loss": 1.1925, + "step": 330 + }, + { + "epoch": 0.37219485495347565, + "grad_norm": 0.346096009016037, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 340 + }, + { + "epoch": 0.3831417624521073, + "grad_norm": 0.4829643666744232, + "learning_rate": 0.0002, + "loss": 1.1641, + "step": 350 + }, + { + "epoch": 0.39408866995073893, + "grad_norm": 0.4726872742176056, + "learning_rate": 0.0002, + "loss": 1.2808, + "step": 360 + }, + { + "epoch": 0.4050355774493706, + "grad_norm": 0.3130153715610504, + "learning_rate": 0.0002, + "loss": 1.1532, + "step": 370 + }, + { + "epoch": 0.41598248494800216, + "grad_norm": 0.5123590230941772, + "learning_rate": 0.0002, + "loss": 1.1842, + "step": 380 + }, + { + "epoch": 0.4269293924466338, + "grad_norm": 0.3444574773311615, + "learning_rate": 0.0002, + "loss": 1.1539, + "step": 390 + }, + { + "epoch": 0.43787629994526545, + "grad_norm": 0.5302175283432007, + "learning_rate": 0.0002, + "loss": 1.1756, + "step": 400 + }, + { + "epoch": 0.4488232074438971, + "grad_norm": 0.2713572680950165, + "learning_rate": 0.0002, + "loss": 1.1138, + "step": 410 + }, + { + "epoch": 0.45977011494252873, + "grad_norm": 0.29530611634254456, + "learning_rate": 0.0002, + "loss": 1.1281, + "step": 420 + }, + { + "epoch": 0.47071702244116037, + "grad_norm": 0.27282455563545227, + "learning_rate": 0.0002, + "loss": 1.1721, + "step": 430 + }, + { + "epoch": 0.481663929939792, + "grad_norm": 0.2647949755191803, + "learning_rate": 0.0002, + "loss": 1.1213, + "step": 440 + }, + { + "epoch": 0.49261083743842365, + "grad_norm": 0.35509347915649414, + "learning_rate": 0.0002, + "loss": 1.1656, + "step": 450 + }, + { + "epoch": 0.5035577449370553, + "grad_norm": 0.1959609091281891, + "learning_rate": 0.0002, + "loss": 1.1251, + "step": 460 + }, + { + "epoch": 0.5145046524356869, + "grad_norm": 0.40090155601501465, + "learning_rate": 0.0002, + "loss": 1.1889, + "step": 470 + }, + { + "epoch": 0.5254515599343186, + "grad_norm": 0.3354604244232178, + "learning_rate": 0.0002, + "loss": 1.156, + "step": 480 + }, + { + "epoch": 0.5363984674329502, + "grad_norm": 0.2758506238460541, + "learning_rate": 0.0002, + "loss": 1.1024, + "step": 490 + }, + { + "epoch": 0.5473453749315819, + "grad_norm": 0.3629051744937897, + "learning_rate": 0.0002, + "loss": 1.1108, + "step": 500 + }, + { + "epoch": 0.5582922824302134, + "grad_norm": 0.30802229046821594, + "learning_rate": 0.0002, + "loss": 1.2236, + "step": 510 + }, + { + "epoch": 0.5692391899288451, + "grad_norm": 0.3099463880062103, + "learning_rate": 0.0002, + "loss": 1.0424, + "step": 520 + }, + { + "epoch": 0.5801860974274767, + "grad_norm": 0.42299067974090576, + "learning_rate": 0.0002, + "loss": 1.255, + "step": 530 + }, + { + "epoch": 0.5911330049261084, + "grad_norm": 0.5392252802848816, + "learning_rate": 0.0002, + "loss": 1.1698, + "step": 540 + }, + { + "epoch": 0.60207991242474, + "grad_norm": 0.34768250584602356, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 550 + }, + { + "epoch": 0.6130268199233716, + "grad_norm": 0.28490015864372253, + "learning_rate": 0.0002, + "loss": 1.102, + "step": 560 + }, + { + "epoch": 0.6239737274220033, + "grad_norm": 0.34787994623184204, + "learning_rate": 0.0002, + "loss": 1.1111, + "step": 570 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 0.29058772325515747, + "learning_rate": 0.0002, + "loss": 1.0759, + "step": 580 + }, + { + "epoch": 0.6458675424192666, + "grad_norm": 0.4063778817653656, + "learning_rate": 0.0002, + "loss": 1.1157, + "step": 590 + }, + { + "epoch": 0.6568144499178982, + "grad_norm": 0.9244267344474792, + "learning_rate": 0.0002, + "loss": 1.1432, + "step": 600 + }, + { + "epoch": 0.6677613574165299, + "grad_norm": 0.27605190873146057, + "learning_rate": 0.0002, + "loss": 1.0591, + "step": 610 + }, + { + "epoch": 0.6787082649151615, + "grad_norm": 0.34346821904182434, + "learning_rate": 0.0002, + "loss": 1.2123, + "step": 620 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 0.3093279302120209, + "learning_rate": 0.0002, + "loss": 1.2195, + "step": 630 + }, + { + "epoch": 0.7006020799124247, + "grad_norm": 1.0069009065628052, + "learning_rate": 0.0002, + "loss": 1.2461, + "step": 640 + }, + { + "epoch": 0.7115489874110563, + "grad_norm": 0.5049130916595459, + "learning_rate": 0.0002, + "loss": 1.0892, + "step": 650 + }, + { + "epoch": 0.722495894909688, + "grad_norm": 0.3748924732208252, + "learning_rate": 0.0002, + "loss": 1.1404, + "step": 660 + }, + { + "epoch": 0.7334428024083196, + "grad_norm": 0.2964959144592285, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 670 + }, + { + "epoch": 0.7443897099069513, + "grad_norm": 0.4599986970424652, + "learning_rate": 0.0002, + "loss": 1.2617, + "step": 680 + }, + { + "epoch": 0.7553366174055829, + "grad_norm": 0.27292951941490173, + "learning_rate": 0.0002, + "loss": 1.088, + "step": 690 + }, + { + "epoch": 0.7662835249042146, + "grad_norm": 0.3123566806316376, + "learning_rate": 0.0002, + "loss": 1.2047, + "step": 700 + }, + { + "epoch": 0.7772304324028462, + "grad_norm": 0.28310710191726685, + "learning_rate": 0.0002, + "loss": 1.0021, + "step": 710 + }, + { + "epoch": 0.7881773399014779, + "grad_norm": 0.3279992341995239, + "learning_rate": 0.0002, + "loss": 1.2281, + "step": 720 + }, + { + "epoch": 0.7991242474001095, + "grad_norm": 0.28179168701171875, + "learning_rate": 0.0002, + "loss": 1.25, + "step": 730 + }, + { + "epoch": 0.8100711548987412, + "grad_norm": 0.31492987275123596, + "learning_rate": 0.0002, + "loss": 1.0602, + "step": 740 + }, + { + "epoch": 0.8210180623973727, + "grad_norm": 0.41821011900901794, + "learning_rate": 0.0002, + "loss": 1.2518, + "step": 750 + }, + { + "epoch": 0.8319649698960043, + "grad_norm": 0.325235515832901, + "learning_rate": 0.0002, + "loss": 1.1612, + "step": 760 + }, + { + "epoch": 0.842911877394636, + "grad_norm": 0.5366070866584778, + "learning_rate": 0.0002, + "loss": 1.27, + "step": 770 + }, + { + "epoch": 0.8538587848932676, + "grad_norm": 0.32570579648017883, + "learning_rate": 0.0002, + "loss": 1.0921, + "step": 780 + }, + { + "epoch": 0.8648056923918993, + "grad_norm": 0.3642968237400055, + "learning_rate": 0.0002, + "loss": 1.1032, + "step": 790 + }, + { + "epoch": 0.8757525998905309, + "grad_norm": 0.29713448882102966, + "learning_rate": 0.0002, + "loss": 1.1234, + "step": 800 + }, + { + "epoch": 0.8866995073891626, + "grad_norm": 0.23599444329738617, + "learning_rate": 0.0002, + "loss": 1.0978, + "step": 810 + }, + { + "epoch": 0.8976464148877942, + "grad_norm": 0.31522464752197266, + "learning_rate": 0.0002, + "loss": 1.1867, + "step": 820 + }, + { + "epoch": 0.9085933223864259, + "grad_norm": 0.32754790782928467, + "learning_rate": 0.0002, + "loss": 1.0208, + "step": 830 + }, + { + "epoch": 0.9195402298850575, + "grad_norm": 0.22741089761257172, + "learning_rate": 0.0002, + "loss": 0.9786, + "step": 840 + }, + { + "epoch": 0.9304871373836892, + "grad_norm": 0.3089679777622223, + "learning_rate": 0.0002, + "loss": 1.0689, + "step": 850 + }, + { + "epoch": 0.9414340448823207, + "grad_norm": 0.27440521121025085, + "learning_rate": 0.0002, + "loss": 1.0354, + "step": 860 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.3498363792896271, + "learning_rate": 0.0002, + "loss": 1.0417, + "step": 870 + }, + { + "epoch": 0.963327859879584, + "grad_norm": 0.47151854634284973, + "learning_rate": 0.0002, + "loss": 1.269, + "step": 880 + }, + { + "epoch": 0.9742747673782156, + "grad_norm": 0.24756591022014618, + "learning_rate": 0.0002, + "loss": 1.1174, + "step": 890 + }, + { + "epoch": 0.9852216748768473, + "grad_norm": 0.2600938677787781, + "learning_rate": 0.0002, + "loss": 1.0622, + "step": 900 + }, + { + "epoch": 0.9961685823754789, + "grad_norm": 0.2934586703777313, + "learning_rate": 0.0002, + "loss": 1.0968, + "step": 910 + }, + { + "epoch": 0.9994526546250684, + "eval_loss": 1.158464789390564, + "eval_runtime": 46.0774, + "eval_samples_per_second": 9.462, + "eval_steps_per_second": 1.194, + "step": 913 + }, + { + "epoch": 1.0071154898741106, + "grad_norm": 0.37776654958724976, + "learning_rate": 0.0002, + "loss": 1.0322, + "step": 920 + }, + { + "epoch": 1.0180623973727423, + "grad_norm": 0.31784629821777344, + "learning_rate": 0.0002, + "loss": 1.1346, + "step": 930 + }, + { + "epoch": 1.0290093048713738, + "grad_norm": 0.24244336783885956, + "learning_rate": 0.0002, + "loss": 0.8976, + "step": 940 + }, + { + "epoch": 1.0399562123700055, + "grad_norm": 0.3185454308986664, + "learning_rate": 0.0002, + "loss": 1.0172, + "step": 950 + }, + { + "epoch": 1.0509031198686372, + "grad_norm": 0.3589441478252411, + "learning_rate": 0.0002, + "loss": 1.1645, + "step": 960 + }, + { + "epoch": 1.0618500273672686, + "grad_norm": 0.38593578338623047, + "learning_rate": 0.0002, + "loss": 1.0317, + "step": 970 + }, + { + "epoch": 1.0727969348659003, + "grad_norm": 0.39694955945014954, + "learning_rate": 0.0002, + "loss": 0.9913, + "step": 980 + }, + { + "epoch": 1.083743842364532, + "grad_norm": 0.469817191362381, + "learning_rate": 0.0002, + "loss": 1.0893, + "step": 990 + }, + { + "epoch": 1.0946907498631637, + "grad_norm": 0.2634755074977875, + "learning_rate": 0.0002, + "loss": 0.983, + "step": 1000 + }, + { + "epoch": 1.1056376573617952, + "grad_norm": 0.43189436197280884, + "learning_rate": 0.0002, + "loss": 1.0144, + "step": 1010 + }, + { + "epoch": 1.116584564860427, + "grad_norm": 0.5559977889060974, + "learning_rate": 0.0002, + "loss": 0.9663, + "step": 1020 + }, + { + "epoch": 1.1275314723590586, + "grad_norm": 0.32100191712379456, + "learning_rate": 0.0002, + "loss": 1.0481, + "step": 1030 + }, + { + "epoch": 1.1384783798576903, + "grad_norm": 0.40179768204689026, + "learning_rate": 0.0002, + "loss": 1.1012, + "step": 1040 + }, + { + "epoch": 1.1494252873563218, + "grad_norm": 0.3659493029117584, + "learning_rate": 0.0002, + "loss": 1.0029, + "step": 1050 + }, + { + "epoch": 1.1603721948549535, + "grad_norm": 0.701704204082489, + "learning_rate": 0.0002, + "loss": 0.9597, + "step": 1060 + }, + { + "epoch": 1.1713191023535852, + "grad_norm": 0.3650563359260559, + "learning_rate": 0.0002, + "loss": 1.0204, + "step": 1070 + }, + { + "epoch": 1.1822660098522166, + "grad_norm": 0.3191976249217987, + "learning_rate": 0.0002, + "loss": 0.907, + "step": 1080 + }, + { + "epoch": 1.1932129173508483, + "grad_norm": 0.3615441918373108, + "learning_rate": 0.0002, + "loss": 1.0648, + "step": 1090 + }, + { + "epoch": 1.20415982484948, + "grad_norm": 0.39474231004714966, + "learning_rate": 0.0002, + "loss": 1.0067, + "step": 1100 + }, + { + "epoch": 1.2151067323481117, + "grad_norm": 0.3752822279930115, + "learning_rate": 0.0002, + "loss": 0.9852, + "step": 1110 + }, + { + "epoch": 1.2260536398467432, + "grad_norm": 0.4165991246700287, + "learning_rate": 0.0002, + "loss": 1.0067, + "step": 1120 + }, + { + "epoch": 1.237000547345375, + "grad_norm": 0.5326506495475769, + "learning_rate": 0.0002, + "loss": 1.0244, + "step": 1130 + }, + { + "epoch": 1.2479474548440066, + "grad_norm": 0.48845794796943665, + "learning_rate": 0.0002, + "loss": 1.0542, + "step": 1140 + }, + { + "epoch": 1.2588943623426383, + "grad_norm": 0.29910150170326233, + "learning_rate": 0.0002, + "loss": 1.0885, + "step": 1150 + }, + { + "epoch": 1.2698412698412698, + "grad_norm": 0.5069725513458252, + "learning_rate": 0.0002, + "loss": 1.2233, + "step": 1160 + }, + { + "epoch": 1.2807881773399015, + "grad_norm": 0.29500406980514526, + "learning_rate": 0.0002, + "loss": 1.0992, + "step": 1170 + }, + { + "epoch": 1.2917350848385332, + "grad_norm": 0.4711538851261139, + "learning_rate": 0.0002, + "loss": 1.0291, + "step": 1180 + }, + { + "epoch": 1.3026819923371646, + "grad_norm": 0.4203340709209442, + "learning_rate": 0.0002, + "loss": 1.0513, + "step": 1190 + }, + { + "epoch": 1.3136288998357963, + "grad_norm": 0.36101874709129333, + "learning_rate": 0.0002, + "loss": 1.1274, + "step": 1200 + }, + { + "epoch": 1.324575807334428, + "grad_norm": 0.4608800411224365, + "learning_rate": 0.0002, + "loss": 1.085, + "step": 1210 + }, + { + "epoch": 1.3355227148330597, + "grad_norm": 0.6570906639099121, + "learning_rate": 0.0002, + "loss": 1.0695, + "step": 1220 + }, + { + "epoch": 1.3464696223316914, + "grad_norm": 0.5352164506912231, + "learning_rate": 0.0002, + "loss": 0.9786, + "step": 1230 + }, + { + "epoch": 1.357416529830323, + "grad_norm": 0.3885001242160797, + "learning_rate": 0.0002, + "loss": 1.0513, + "step": 1240 + }, + { + "epoch": 1.3683634373289546, + "grad_norm": 0.2987913489341736, + "learning_rate": 0.0002, + "loss": 1.0611, + "step": 1250 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 0.42070427536964417, + "learning_rate": 0.0002, + "loss": 1.109, + "step": 1260 + }, + { + "epoch": 1.3902572523262178, + "grad_norm": 0.5957782864570618, + "learning_rate": 0.0002, + "loss": 1.1092, + "step": 1270 + }, + { + "epoch": 1.4012041598248495, + "grad_norm": 0.32898882031440735, + "learning_rate": 0.0002, + "loss": 1.0554, + "step": 1280 + }, + { + "epoch": 1.4121510673234812, + "grad_norm": 0.27624452114105225, + "learning_rate": 0.0002, + "loss": 0.909, + "step": 1290 + }, + { + "epoch": 1.4230979748221126, + "grad_norm": 0.49570828676223755, + "learning_rate": 0.0002, + "loss": 1.024, + "step": 1300 + }, + { + "epoch": 1.4340448823207443, + "grad_norm": 0.26191383600234985, + "learning_rate": 0.0002, + "loss": 1.0471, + "step": 1310 + }, + { + "epoch": 1.444991789819376, + "grad_norm": 0.35664042830467224, + "learning_rate": 0.0002, + "loss": 1.0788, + "step": 1320 + }, + { + "epoch": 1.4559386973180077, + "grad_norm": 0.45126354694366455, + "learning_rate": 0.0002, + "loss": 1.0108, + "step": 1330 + }, + { + "epoch": 1.4668856048166394, + "grad_norm": 0.37318357825279236, + "learning_rate": 0.0002, + "loss": 0.9571, + "step": 1340 + }, + { + "epoch": 1.477832512315271, + "grad_norm": 0.6428970098495483, + "learning_rate": 0.0002, + "loss": 1.0507, + "step": 1350 + }, + { + "epoch": 1.4887794198139026, + "grad_norm": 0.43256187438964844, + "learning_rate": 0.0002, + "loss": 1.0367, + "step": 1360 + }, + { + "epoch": 1.4997263273125343, + "grad_norm": 0.5343793630599976, + "learning_rate": 0.0002, + "loss": 1.1321, + "step": 1370 + }, + { + "epoch": 1.5106732348111658, + "grad_norm": 0.315437376499176, + "learning_rate": 0.0002, + "loss": 1.1054, + "step": 1380 + }, + { + "epoch": 1.5216201423097975, + "grad_norm": 0.41561153531074524, + "learning_rate": 0.0002, + "loss": 0.8916, + "step": 1390 + }, + { + "epoch": 1.5325670498084292, + "grad_norm": 0.3201070725917816, + "learning_rate": 0.0002, + "loss": 1.0391, + "step": 1400 + }, + { + "epoch": 1.5435139573070606, + "grad_norm": 0.505537211894989, + "learning_rate": 0.0002, + "loss": 1.14, + "step": 1410 + }, + { + "epoch": 1.5544608648056923, + "grad_norm": 0.3747410178184509, + "learning_rate": 0.0002, + "loss": 1.0775, + "step": 1420 + }, + { + "epoch": 1.565407772304324, + "grad_norm": 0.49385908246040344, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 1430 + }, + { + "epoch": 1.5763546798029555, + "grad_norm": 0.49831628799438477, + "learning_rate": 0.0002, + "loss": 1.1182, + "step": 1440 + }, + { + "epoch": 1.5873015873015874, + "grad_norm": 0.372127890586853, + "learning_rate": 0.0002, + "loss": 1.0079, + "step": 1450 + }, + { + "epoch": 1.598248494800219, + "grad_norm": 0.40070840716362, + "learning_rate": 0.0002, + "loss": 0.9931, + "step": 1460 + }, + { + "epoch": 1.6091954022988506, + "grad_norm": 0.34907400608062744, + "learning_rate": 0.0002, + "loss": 0.8954, + "step": 1470 + }, + { + "epoch": 1.6201423097974823, + "grad_norm": 0.4632418751716614, + "learning_rate": 0.0002, + "loss": 0.9743, + "step": 1480 + }, + { + "epoch": 1.6310892172961138, + "grad_norm": 0.40164515376091003, + "learning_rate": 0.0002, + "loss": 1.0103, + "step": 1490 + }, + { + "epoch": 1.6420361247947455, + "grad_norm": 0.3214994966983795, + "learning_rate": 0.0002, + "loss": 0.9523, + "step": 1500 + }, + { + "epoch": 1.6529830322933772, + "grad_norm": 0.3727897107601166, + "learning_rate": 0.0002, + "loss": 1.0161, + "step": 1510 + }, + { + "epoch": 1.6639299397920086, + "grad_norm": 0.3817640542984009, + "learning_rate": 0.0002, + "loss": 1.0443, + "step": 1520 + }, + { + "epoch": 1.6748768472906403, + "grad_norm": 0.5592136979103088, + "learning_rate": 0.0002, + "loss": 1.0511, + "step": 1530 + }, + { + "epoch": 1.685823754789272, + "grad_norm": 0.44636598229408264, + "learning_rate": 0.0002, + "loss": 0.9682, + "step": 1540 + }, + { + "epoch": 1.6967706622879035, + "grad_norm": 0.40441709756851196, + "learning_rate": 0.0002, + "loss": 1.033, + "step": 1550 + }, + { + "epoch": 1.7077175697865354, + "grad_norm": 0.3243522644042969, + "learning_rate": 0.0002, + "loss": 1.0857, + "step": 1560 + }, + { + "epoch": 1.718664477285167, + "grad_norm": 0.34277570247650146, + "learning_rate": 0.0002, + "loss": 0.9482, + "step": 1570 + }, + { + "epoch": 1.7296113847837986, + "grad_norm": 0.3279995024204254, + "learning_rate": 0.0002, + "loss": 0.974, + "step": 1580 + }, + { + "epoch": 1.7405582922824303, + "grad_norm": 0.41968777775764465, + "learning_rate": 0.0002, + "loss": 0.9414, + "step": 1590 + }, + { + "epoch": 1.7515051997810618, + "grad_norm": 0.39464613795280457, + "learning_rate": 0.0002, + "loss": 0.9768, + "step": 1600 + }, + { + "epoch": 1.7624521072796935, + "grad_norm": 0.3839009404182434, + "learning_rate": 0.0002, + "loss": 1.0347, + "step": 1610 + }, + { + "epoch": 1.7733990147783252, + "grad_norm": 0.3250715434551239, + "learning_rate": 0.0002, + "loss": 0.9195, + "step": 1620 + }, + { + "epoch": 1.7843459222769567, + "grad_norm": 0.5166561007499695, + "learning_rate": 0.0002, + "loss": 1.0049, + "step": 1630 + }, + { + "epoch": 1.7952928297755884, + "grad_norm": 0.4115183353424072, + "learning_rate": 0.0002, + "loss": 1.0364, + "step": 1640 + }, + { + "epoch": 1.80623973727422, + "grad_norm": 0.373780220746994, + "learning_rate": 0.0002, + "loss": 1.0248, + "step": 1650 + }, + { + "epoch": 1.8171866447728515, + "grad_norm": 0.49697014689445496, + "learning_rate": 0.0002, + "loss": 1.0984, + "step": 1660 + }, + { + "epoch": 1.8281335522714834, + "grad_norm": 1.0308938026428223, + "learning_rate": 0.0002, + "loss": 1.0089, + "step": 1670 + }, + { + "epoch": 1.839080459770115, + "grad_norm": 0.4851366877555847, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 1680 + }, + { + "epoch": 1.8500273672687466, + "grad_norm": 0.3262481391429901, + "learning_rate": 0.0002, + "loss": 0.9533, + "step": 1690 + }, + { + "epoch": 1.8609742747673783, + "grad_norm": 0.6904496550559998, + "learning_rate": 0.0002, + "loss": 1.048, + "step": 1700 + }, + { + "epoch": 1.8719211822660098, + "grad_norm": 0.49789851903915405, + "learning_rate": 0.0002, + "loss": 1.0577, + "step": 1710 + }, + { + "epoch": 1.8828680897646415, + "grad_norm": 0.3035794198513031, + "learning_rate": 0.0002, + "loss": 1.0258, + "step": 1720 + }, + { + "epoch": 1.8938149972632732, + "grad_norm": 0.4588414430618286, + "learning_rate": 0.0002, + "loss": 0.9916, + "step": 1730 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 0.4313034117221832, + "learning_rate": 0.0002, + "loss": 0.9526, + "step": 1740 + }, + { + "epoch": 1.9157088122605364, + "grad_norm": 0.38562044501304626, + "learning_rate": 0.0002, + "loss": 1.0857, + "step": 1750 + }, + { + "epoch": 1.926655719759168, + "grad_norm": 0.46947410702705383, + "learning_rate": 0.0002, + "loss": 0.9763, + "step": 1760 + }, + { + "epoch": 1.9376026272577995, + "grad_norm": 0.3848404884338379, + "learning_rate": 0.0002, + "loss": 0.9967, + "step": 1770 + }, + { + "epoch": 1.9485495347564314, + "grad_norm": 0.30422744154930115, + "learning_rate": 0.0002, + "loss": 1.0474, + "step": 1780 + }, + { + "epoch": 1.959496442255063, + "grad_norm": 0.41100990772247314, + "learning_rate": 0.0002, + "loss": 1.022, + "step": 1790 + }, + { + "epoch": 1.9704433497536946, + "grad_norm": 0.3492335081100464, + "learning_rate": 0.0002, + "loss": 1.003, + "step": 1800 + }, + { + "epoch": 1.9813902572523263, + "grad_norm": 0.364577978849411, + "learning_rate": 0.0002, + "loss": 0.9891, + "step": 1810 + }, + { + "epoch": 1.9923371647509578, + "grad_norm": 0.4312075674533844, + "learning_rate": 0.0002, + "loss": 1.0699, + "step": 1820 + }, + { + "epoch": 2.0, + "eval_loss": 1.14472496509552, + "eval_runtime": 46.0786, + "eval_samples_per_second": 9.462, + "eval_steps_per_second": 1.194, + "step": 1827 + }, + { + "epoch": 2.0032840722495897, + "grad_norm": 0.5989689230918884, + "learning_rate": 0.0002, + "loss": 1.0218, + "step": 1830 + }, + { + "epoch": 2.014230979748221, + "grad_norm": 0.49720922112464905, + "learning_rate": 0.0002, + "loss": 0.9384, + "step": 1840 + }, + { + "epoch": 2.0251778872468527, + "grad_norm": 0.42675456404685974, + "learning_rate": 0.0002, + "loss": 0.8482, + "step": 1850 + }, + { + "epoch": 2.0361247947454846, + "grad_norm": 0.4637208580970764, + "learning_rate": 0.0002, + "loss": 0.8487, + "step": 1860 + }, + { + "epoch": 2.047071702244116, + "grad_norm": 0.8329976797103882, + "learning_rate": 0.0002, + "loss": 0.8212, + "step": 1870 + }, + { + "epoch": 2.0580186097427475, + "grad_norm": 0.7869427800178528, + "learning_rate": 0.0002, + "loss": 0.9547, + "step": 1880 + }, + { + "epoch": 2.0689655172413794, + "grad_norm": 0.4927455186843872, + "learning_rate": 0.0002, + "loss": 0.9351, + "step": 1890 + }, + { + "epoch": 2.079912424740011, + "grad_norm": 0.6264246702194214, + "learning_rate": 0.0002, + "loss": 0.8652, + "step": 1900 + }, + { + "epoch": 2.0908593322386424, + "grad_norm": 1.1164122819900513, + "learning_rate": 0.0002, + "loss": 0.9095, + "step": 1910 + }, + { + "epoch": 2.1018062397372743, + "grad_norm": 0.5283981561660767, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 1920 + }, + { + "epoch": 2.112753147235906, + "grad_norm": 0.45621731877326965, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 1930 + }, + { + "epoch": 2.1237000547345373, + "grad_norm": 1.381791591644287, + "learning_rate": 0.0002, + "loss": 0.9184, + "step": 1940 + }, + { + "epoch": 2.134646962233169, + "grad_norm": 0.5151259899139404, + "learning_rate": 0.0002, + "loss": 0.9006, + "step": 1950 + }, + { + "epoch": 2.1455938697318007, + "grad_norm": 0.9806339740753174, + "learning_rate": 0.0002, + "loss": 0.8436, + "step": 1960 + }, + { + "epoch": 2.1565407772304326, + "grad_norm": 0.4734154939651489, + "learning_rate": 0.0002, + "loss": 0.8749, + "step": 1970 + }, + { + "epoch": 2.167487684729064, + "grad_norm": 0.9553168416023254, + "learning_rate": 0.0002, + "loss": 0.9172, + "step": 1980 + }, + { + "epoch": 2.1784345922276955, + "grad_norm": 0.5895838141441345, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1990 + }, + { + "epoch": 2.1893814997263275, + "grad_norm": 0.4488855302333832, + "learning_rate": 0.0002, + "loss": 0.7841, + "step": 2000 + }, + { + "epoch": 2.200328407224959, + "grad_norm": 1.0760235786437988, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 2010 + }, + { + "epoch": 2.2112753147235904, + "grad_norm": 0.5038785338401794, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 2020 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.59819495677948, + "learning_rate": 0.0002, + "loss": 0.8973, + "step": 2030 + }, + { + "epoch": 2.233169129720854, + "grad_norm": 0.5012075304985046, + "learning_rate": 0.0002, + "loss": 0.8009, + "step": 2040 + }, + { + "epoch": 2.2441160372194853, + "grad_norm": 0.44978439807891846, + "learning_rate": 0.0002, + "loss": 0.9087, + "step": 2050 + }, + { + "epoch": 2.255062944718117, + "grad_norm": 0.5350462198257446, + "learning_rate": 0.0002, + "loss": 0.823, + "step": 2060 + }, + { + "epoch": 2.2660098522167487, + "grad_norm": 0.6020669937133789, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 2070 + }, + { + "epoch": 2.2769567597153806, + "grad_norm": 0.5246821045875549, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 2080 + }, + { + "epoch": 2.287903667214012, + "grad_norm": 0.5711268782615662, + "learning_rate": 0.0002, + "loss": 0.8984, + "step": 2090 + }, + { + "epoch": 2.2988505747126435, + "grad_norm": 0.617317259311676, + "learning_rate": 0.0002, + "loss": 0.9093, + "step": 2100 + }, + { + "epoch": 2.3097974822112755, + "grad_norm": 0.8608947396278381, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 2110 + }, + { + "epoch": 2.320744389709907, + "grad_norm": 0.4739076793193817, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 2120 + }, + { + "epoch": 2.3316912972085384, + "grad_norm": 0.5538856983184814, + "learning_rate": 0.0002, + "loss": 0.84, + "step": 2130 + }, + { + "epoch": 2.3426382047071703, + "grad_norm": 0.6064935326576233, + "learning_rate": 0.0002, + "loss": 0.8994, + "step": 2140 + }, + { + "epoch": 2.353585112205802, + "grad_norm": 0.5019068121910095, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 2150 + }, + { + "epoch": 2.3645320197044333, + "grad_norm": 0.45340514183044434, + "learning_rate": 0.0002, + "loss": 0.9576, + "step": 2160 + }, + { + "epoch": 2.375478927203065, + "grad_norm": 0.7347203493118286, + "learning_rate": 0.0002, + "loss": 0.9254, + "step": 2170 + }, + { + "epoch": 2.3864258347016967, + "grad_norm": 0.46922534704208374, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 2180 + }, + { + "epoch": 2.3973727422003286, + "grad_norm": 0.5507845878601074, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 2190 + }, + { + "epoch": 2.40831964969896, + "grad_norm": 0.5621911883354187, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 2200 + }, + { + "epoch": 2.4192665571975915, + "grad_norm": 0.5023514032363892, + "learning_rate": 0.0002, + "loss": 0.8569, + "step": 2210 + }, + { + "epoch": 2.4302134646962235, + "grad_norm": 0.6124861240386963, + "learning_rate": 0.0002, + "loss": 0.8355, + "step": 2220 + }, + { + "epoch": 2.441160372194855, + "grad_norm": 0.49614205956459045, + "learning_rate": 0.0002, + "loss": 0.885, + "step": 2230 + }, + { + "epoch": 2.4521072796934864, + "grad_norm": 0.6477900743484497, + "learning_rate": 0.0002, + "loss": 0.8008, + "step": 2240 + }, + { + "epoch": 2.4630541871921183, + "grad_norm": 0.5868843793869019, + "learning_rate": 0.0002, + "loss": 0.8622, + "step": 2250 + }, + { + "epoch": 2.47400109469075, + "grad_norm": 0.4364610016345978, + "learning_rate": 0.0002, + "loss": 0.8498, + "step": 2260 + }, + { + "epoch": 2.4849480021893813, + "grad_norm": 0.5792964696884155, + "learning_rate": 0.0002, + "loss": 0.8378, + "step": 2270 + }, + { + "epoch": 2.495894909688013, + "grad_norm": 0.5421269536018372, + "learning_rate": 0.0002, + "loss": 0.8743, + "step": 2280 + }, + { + "epoch": 2.5068418171866447, + "grad_norm": 0.5525493025779724, + "learning_rate": 0.0002, + "loss": 0.9637, + "step": 2290 + }, + { + "epoch": 2.5177887246852766, + "grad_norm": 0.6463850140571594, + "learning_rate": 0.0002, + "loss": 0.8075, + "step": 2300 + }, + { + "epoch": 2.528735632183908, + "grad_norm": 0.6861311793327332, + "learning_rate": 0.0002, + "loss": 0.8591, + "step": 2310 + }, + { + "epoch": 2.5396825396825395, + "grad_norm": 0.5563555359840393, + "learning_rate": 0.0002, + "loss": 0.9287, + "step": 2320 + }, + { + "epoch": 2.5506294471811715, + "grad_norm": 0.5721169114112854, + "learning_rate": 0.0002, + "loss": 0.945, + "step": 2330 + }, + { + "epoch": 2.561576354679803, + "grad_norm": 0.5258274674415588, + "learning_rate": 0.0002, + "loss": 0.8271, + "step": 2340 + }, + { + "epoch": 2.572523262178435, + "grad_norm": 0.7057380676269531, + "learning_rate": 0.0002, + "loss": 0.8515, + "step": 2350 + }, + { + "epoch": 2.5834701696770663, + "grad_norm": 0.6869027614593506, + "learning_rate": 0.0002, + "loss": 0.8615, + "step": 2360 + }, + { + "epoch": 2.594417077175698, + "grad_norm": 0.4960809648036957, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 2370 + }, + { + "epoch": 2.6053639846743293, + "grad_norm": 0.9288380146026611, + "learning_rate": 0.0002, + "loss": 0.8476, + "step": 2380 + }, + { + "epoch": 2.616310892172961, + "grad_norm": 0.3765334188938141, + "learning_rate": 0.0002, + "loss": 0.873, + "step": 2390 + }, + { + "epoch": 2.6272577996715927, + "grad_norm": 0.7487865686416626, + "learning_rate": 0.0002, + "loss": 0.8764, + "step": 2400 + }, + { + "epoch": 2.6382047071702246, + "grad_norm": 0.6141156554222107, + "learning_rate": 0.0002, + "loss": 0.7577, + "step": 2410 + }, + { + "epoch": 2.649151614668856, + "grad_norm": 0.8420507907867432, + "learning_rate": 0.0002, + "loss": 0.8534, + "step": 2420 + }, + { + "epoch": 2.6600985221674875, + "grad_norm": 0.53386390209198, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 2430 + }, + { + "epoch": 2.6710454296661195, + "grad_norm": 0.5520607233047485, + "learning_rate": 0.0002, + "loss": 0.8486, + "step": 2440 + }, + { + "epoch": 2.681992337164751, + "grad_norm": 0.5337599515914917, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 2450 + }, + { + "epoch": 2.692939244663383, + "grad_norm": 0.48790836334228516, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 2460 + }, + { + "epoch": 2.7038861521620143, + "grad_norm": 0.8287786245346069, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 2470 + }, + { + "epoch": 2.714833059660646, + "grad_norm": 0.5876168608665466, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 2480 + }, + { + "epoch": 2.7257799671592773, + "grad_norm": 0.5206760764122009, + "learning_rate": 0.0002, + "loss": 0.8773, + "step": 2490 + }, + { + "epoch": 2.736726874657909, + "grad_norm": 0.5619136691093445, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 2500 + }, + { + "epoch": 2.7476737821565407, + "grad_norm": 0.5614883899688721, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 2510 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 0.6157700419425964, + "learning_rate": 0.0002, + "loss": 0.8817, + "step": 2520 + }, + { + "epoch": 2.769567597153804, + "grad_norm": 0.5529953837394714, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 2530 + }, + { + "epoch": 2.7805145046524355, + "grad_norm": 0.6731224060058594, + "learning_rate": 0.0002, + "loss": 0.8881, + "step": 2540 + }, + { + "epoch": 2.7914614121510675, + "grad_norm": 0.6960386633872986, + "learning_rate": 0.0002, + "loss": 0.9125, + "step": 2550 + }, + { + "epoch": 2.802408319649699, + "grad_norm": 0.5203493237495422, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 2560 + }, + { + "epoch": 2.813355227148331, + "grad_norm": 1.036837100982666, + "learning_rate": 0.0002, + "loss": 0.7951, + "step": 2570 + }, + { + "epoch": 2.8243021346469623, + "grad_norm": 0.6125805377960205, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 2580 + }, + { + "epoch": 2.835249042145594, + "grad_norm": 0.6298092603683472, + "learning_rate": 0.0002, + "loss": 0.8939, + "step": 2590 + }, + { + "epoch": 2.8461959496442253, + "grad_norm": 0.5882203578948975, + "learning_rate": 0.0002, + "loss": 0.8845, + "step": 2600 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.8619399666786194, + "learning_rate": 0.0002, + "loss": 0.8921, + "step": 2610 + }, + { + "epoch": 2.8680897646414887, + "grad_norm": 0.4722687304019928, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 2620 + }, + { + "epoch": 2.8790366721401206, + "grad_norm": 0.47399574518203735, + "learning_rate": 0.0002, + "loss": 0.9147, + "step": 2630 + }, + { + "epoch": 2.889983579638752, + "grad_norm": 0.5639172792434692, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 2640 + }, + { + "epoch": 2.9009304871373836, + "grad_norm": 0.4676816761493683, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 2650 + }, + { + "epoch": 2.9118773946360155, + "grad_norm": 0.6906291246414185, + "learning_rate": 0.0002, + "loss": 0.8966, + "step": 2660 + }, + { + "epoch": 2.922824302134647, + "grad_norm": 0.4369746148586273, + "learning_rate": 0.0002, + "loss": 0.8966, + "step": 2670 + }, + { + "epoch": 2.933771209633279, + "grad_norm": 0.46423083543777466, + "learning_rate": 0.0002, + "loss": 0.9173, + "step": 2680 + }, + { + "epoch": 2.9447181171319103, + "grad_norm": 0.5700525045394897, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 2690 + }, + { + "epoch": 2.955665024630542, + "grad_norm": 0.6221476793289185, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 2700 + }, + { + "epoch": 2.9666119321291733, + "grad_norm": 0.6102682948112488, + "learning_rate": 0.0002, + "loss": 0.8538, + "step": 2710 + }, + { + "epoch": 2.977558839627805, + "grad_norm": 0.5317878723144531, + "learning_rate": 0.0002, + "loss": 0.9779, + "step": 2720 + }, + { + "epoch": 2.9885057471264367, + "grad_norm": 0.4438510835170746, + "learning_rate": 0.0002, + "loss": 0.9314, + "step": 2730 + }, + { + "epoch": 2.9994526546250686, + "grad_norm": 0.5022130012512207, + "learning_rate": 0.0002, + "loss": 0.8553, + "step": 2740 + }, + { + "epoch": 2.9994526546250686, + "eval_loss": 1.1722838878631592, + "eval_runtime": 46.0829, + "eval_samples_per_second": 9.461, + "eval_steps_per_second": 1.194, + "step": 2740 + }, + { + "epoch": 3.0103995621237, + "grad_norm": 0.6384502053260803, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 2750 + }, + { + "epoch": 3.0213464696223316, + "grad_norm": 0.9928722381591797, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 2760 + }, + { + "epoch": 3.0322933771209635, + "grad_norm": 0.7813051342964172, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 2770 + }, + { + "epoch": 3.043240284619595, + "grad_norm": 1.0202556848526, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 2780 + }, + { + "epoch": 3.0541871921182264, + "grad_norm": 0.7581062316894531, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 2790 + }, + { + "epoch": 3.0651340996168583, + "grad_norm": 0.6252710223197937, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 2800 + }, + { + "epoch": 3.07608100711549, + "grad_norm": 0.7738662958145142, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 2810 + }, + { + "epoch": 3.0870279146141213, + "grad_norm": 0.7381885051727295, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 2820 + }, + { + "epoch": 3.097974822112753, + "grad_norm": 0.9197564721107483, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 2830 + }, + { + "epoch": 3.1089217296113847, + "grad_norm": 1.000976800918579, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 2840 + }, + { + "epoch": 3.1198686371100166, + "grad_norm": 0.7559131383895874, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 2850 + }, + { + "epoch": 3.130815544608648, + "grad_norm": 0.7213780879974365, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 2860 + }, + { + "epoch": 3.1417624521072796, + "grad_norm": 0.945939838886261, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 2870 + }, + { + "epoch": 3.1527093596059115, + "grad_norm": 0.7277454137802124, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 2880 + }, + { + "epoch": 3.163656267104543, + "grad_norm": 0.762026846408844, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 2890 + }, + { + "epoch": 3.1746031746031744, + "grad_norm": 0.6471221446990967, + "learning_rate": 0.0002, + "loss": 0.5332, + "step": 2900 + }, + { + "epoch": 3.1855500821018063, + "grad_norm": 0.6018978357315063, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 2910 + }, + { + "epoch": 3.196496989600438, + "grad_norm": 0.8607320785522461, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 2920 + }, + { + "epoch": 3.2074438970990693, + "grad_norm": 0.8854126334190369, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 2930 + }, + { + "epoch": 3.218390804597701, + "grad_norm": 0.6620870232582092, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 2940 + }, + { + "epoch": 3.2293377120963327, + "grad_norm": 0.7377511858940125, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 2950 + }, + { + "epoch": 3.2402846195949646, + "grad_norm": 0.7803301811218262, + "learning_rate": 0.0002, + "loss": 0.6798, + "step": 2960 + }, + { + "epoch": 3.251231527093596, + "grad_norm": 0.834061861038208, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 2970 + }, + { + "epoch": 3.2621784345922276, + "grad_norm": 0.8496041893959045, + "learning_rate": 0.0002, + "loss": 0.6894, + "step": 2980 + }, + { + "epoch": 3.2731253420908595, + "grad_norm": 0.7967984676361084, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 2990 + }, + { + "epoch": 3.284072249589491, + "grad_norm": 1.0207016468048096, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 3000 + }, + { + "epoch": 3.2950191570881224, + "grad_norm": 0.7855771780014038, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 3010 + }, + { + "epoch": 3.3059660645867543, + "grad_norm": 0.9427546858787537, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 3020 + }, + { + "epoch": 3.316912972085386, + "grad_norm": 0.823542594909668, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 3030 + }, + { + "epoch": 3.3278598795840173, + "grad_norm": 0.9826635122299194, + "learning_rate": 0.0002, + "loss": 0.7109, + "step": 3040 + }, + { + "epoch": 3.338806787082649, + "grad_norm": 0.7259827852249146, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 3050 + }, + { + "epoch": 3.3497536945812807, + "grad_norm": 0.7774739861488342, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 3060 + }, + { + "epoch": 3.3607006020799126, + "grad_norm": 0.7394293546676636, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 3070 + }, + { + "epoch": 3.371647509578544, + "grad_norm": 0.9017578959465027, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 3080 + }, + { + "epoch": 3.3825944170771756, + "grad_norm": 0.7451054453849792, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 3090 + }, + { + "epoch": 3.3935413245758075, + "grad_norm": 0.7321506142616272, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 3100 + }, + { + "epoch": 3.404488232074439, + "grad_norm": 0.6721828579902649, + "learning_rate": 0.0002, + "loss": 0.6555, + "step": 3110 + }, + { + "epoch": 3.4154351395730704, + "grad_norm": 0.774022102355957, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 3120 + }, + { + "epoch": 3.4263820470717024, + "grad_norm": 0.9143537282943726, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 3130 + }, + { + "epoch": 3.437328954570334, + "grad_norm": 1.226087212562561, + "learning_rate": 0.0002, + "loss": 0.6899, + "step": 3140 + }, + { + "epoch": 3.4482758620689653, + "grad_norm": 0.7545496225357056, + "learning_rate": 0.0002, + "loss": 0.6719, + "step": 3150 + }, + { + "epoch": 3.4592227695675972, + "grad_norm": 0.6515635848045349, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 3160 + }, + { + "epoch": 3.4701696770662287, + "grad_norm": 0.9297090172767639, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 3170 + }, + { + "epoch": 3.4811165845648606, + "grad_norm": 1.0130730867385864, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 3180 + }, + { + "epoch": 3.492063492063492, + "grad_norm": 0.7654589414596558, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 3190 + }, + { + "epoch": 3.5030103995621236, + "grad_norm": 0.9954977631568909, + "learning_rate": 0.0002, + "loss": 0.7401, + "step": 3200 + }, + { + "epoch": 3.5139573070607555, + "grad_norm": 0.6027487516403198, + "learning_rate": 0.0002, + "loss": 0.6661, + "step": 3210 + }, + { + "epoch": 3.524904214559387, + "grad_norm": 0.741770327091217, + "learning_rate": 0.0002, + "loss": 0.6963, + "step": 3220 + }, + { + "epoch": 3.535851122058019, + "grad_norm": 1.0534909963607788, + "learning_rate": 0.0002, + "loss": 0.8112, + "step": 3230 + }, + { + "epoch": 3.5467980295566504, + "grad_norm": 0.937772274017334, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 3240 + }, + { + "epoch": 3.557744937055282, + "grad_norm": 0.8504213690757751, + "learning_rate": 0.0002, + "loss": 0.6681, + "step": 3250 + }, + { + "epoch": 3.5686918445539133, + "grad_norm": 0.7755007147789001, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 3260 + }, + { + "epoch": 3.5796387520525452, + "grad_norm": 1.0193358659744263, + "learning_rate": 0.0002, + "loss": 0.6213, + "step": 3270 + }, + { + "epoch": 3.5905856595511767, + "grad_norm": 0.8440536856651306, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 3280 + }, + { + "epoch": 3.6015325670498086, + "grad_norm": 0.6195939183235168, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 3290 + }, + { + "epoch": 3.61247947454844, + "grad_norm": 0.8608590960502625, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 3300 + }, + { + "epoch": 3.6234263820470716, + "grad_norm": 0.6772327423095703, + "learning_rate": 0.0002, + "loss": 0.7301, + "step": 3310 + }, + { + "epoch": 3.6343732895457035, + "grad_norm": 0.8031839728355408, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 3320 + }, + { + "epoch": 3.645320197044335, + "grad_norm": 0.6080502271652222, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 3330 + }, + { + "epoch": 3.656267104542967, + "grad_norm": 0.8007240891456604, + "learning_rate": 0.0002, + "loss": 0.7431, + "step": 3340 + }, + { + "epoch": 3.6672140120415984, + "grad_norm": 0.8060704469680786, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 3350 + }, + { + "epoch": 3.67816091954023, + "grad_norm": 0.7547586560249329, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 3360 + }, + { + "epoch": 3.6891078270388613, + "grad_norm": 0.686851978302002, + "learning_rate": 0.0002, + "loss": 0.7066, + "step": 3370 + }, + { + "epoch": 3.7000547345374932, + "grad_norm": 0.9429075717926025, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 3380 + }, + { + "epoch": 3.7110016420361247, + "grad_norm": 0.7283591032028198, + "learning_rate": 0.0002, + "loss": 0.6673, + "step": 3390 + }, + { + "epoch": 3.7219485495347566, + "grad_norm": 0.8323085904121399, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 3400 + }, + { + "epoch": 3.732895457033388, + "grad_norm": 0.8529590964317322, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 3410 + }, + { + "epoch": 3.7438423645320196, + "grad_norm": 0.731752872467041, + "learning_rate": 0.0002, + "loss": 0.6555, + "step": 3420 + }, + { + "epoch": 3.7547892720306515, + "grad_norm": 0.8572278618812561, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 3430 + }, + { + "epoch": 3.765736179529283, + "grad_norm": 0.7408691048622131, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 3440 + }, + { + "epoch": 3.776683087027915, + "grad_norm": 0.7470445036888123, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 3450 + }, + { + "epoch": 3.7876299945265464, + "grad_norm": 0.6806244254112244, + "learning_rate": 0.0002, + "loss": 0.7241, + "step": 3460 + }, + { + "epoch": 3.798576902025178, + "grad_norm": 0.9129069447517395, + "learning_rate": 0.0002, + "loss": 0.7739, + "step": 3470 + }, + { + "epoch": 3.8095238095238093, + "grad_norm": 0.8717501759529114, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 3480 + }, + { + "epoch": 3.8204707170224412, + "grad_norm": 0.6761979460716248, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 3490 + }, + { + "epoch": 3.8314176245210727, + "grad_norm": 1.0054380893707275, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 3500 + }, + { + "epoch": 3.8423645320197046, + "grad_norm": 1.1224009990692139, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 3510 + }, + { + "epoch": 3.853311439518336, + "grad_norm": 0.8997692465782166, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 3520 + }, + { + "epoch": 3.8642583470169676, + "grad_norm": 1.0086902379989624, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 3530 + }, + { + "epoch": 3.8752052545155995, + "grad_norm": 0.772739589214325, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 3540 + }, + { + "epoch": 3.886152162014231, + "grad_norm": 1.211774230003357, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 3550 + }, + { + "epoch": 3.897099069512863, + "grad_norm": 0.9572356939315796, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 3560 + }, + { + "epoch": 3.9080459770114944, + "grad_norm": 0.7887842655181885, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 3570 + }, + { + "epoch": 3.918992884510126, + "grad_norm": 0.7308389544487, + "learning_rate": 0.0002, + "loss": 0.7576, + "step": 3580 + }, + { + "epoch": 3.9299397920087573, + "grad_norm": 1.0182650089263916, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 3590 + }, + { + "epoch": 3.9408866995073892, + "grad_norm": 0.8000147342681885, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 3600 + }, + { + "epoch": 3.9518336070060207, + "grad_norm": 0.7385728359222412, + "learning_rate": 0.0002, + "loss": 0.6244, + "step": 3610 + }, + { + "epoch": 3.9627805145046526, + "grad_norm": 0.9233261942863464, + "learning_rate": 0.0002, + "loss": 0.6718, + "step": 3620 + }, + { + "epoch": 3.973727422003284, + "grad_norm": 0.8486751914024353, + "learning_rate": 0.0002, + "loss": 0.6508, + "step": 3630 + }, + { + "epoch": 3.9846743295019156, + "grad_norm": 0.7593663334846497, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 3640 + }, + { + "epoch": 3.9956212370005475, + "grad_norm": 0.7885415554046631, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 3650 + }, + { + "epoch": 4.0, + "eval_loss": 1.250312328338623, + "eval_runtime": 46.0842, + "eval_samples_per_second": 9.461, + "eval_steps_per_second": 1.193, + "step": 3654 + } + ], + "logging_steps": 10, + "max_steps": 7304, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.8768312652529664e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..895406585c61ed7f5a6760976fc06a40a6fd8b3b --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-3654/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a158bca77bc6d58bfffd2acd2e9e641d481e1857fcbdd68fee875b74797abbc4 +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..54340503512ec66744b9f6346df09f21a03940b3 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:690f0a108c96b12c41c037e7a8a073b10e97220dd5bf277ebaf85d1405a0e83a +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/optimizer.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ee5cacec4cdd22a3720a8050aa34b695971059b --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a60d365e1ec200496f2cba711983f8608c9cb7d697c7c2b44a16f4348095ae96 +size 72886650 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/rng_state.pth b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..60d7a14ea93086849c11aba92a635f5276eb71eb --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ec7f713eac7d32af496212fbeebae513684ce562bcd5df63eb5603a6c82d35d +size 14244 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/scheduler.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b92c057ff7f770e9f74e39b0230dd2919efdd5b --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81183db37481adb9d2771807b01e740317fdbbc85a4be1d990e73274e3f03be7 +size 1064 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/trainer_state.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4c1d3ca3731f68e72003ae018a16d840358b0a8d --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/trainer_state.json @@ -0,0 +1,3265 @@ +{ + "best_metric": 1.14472496509552, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827", + "epoch": 4.999452654625069, + "eval_steps": 10, + "global_step": 4567, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.010946907498631636, + "grad_norm": 0.7611560821533203, + "learning_rate": 0.0002, + "loss": 2.4039, + "step": 10 + }, + { + "epoch": 0.021893814997263273, + "grad_norm": 0.4633193612098694, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 20 + }, + { + "epoch": 0.03284072249589491, + "grad_norm": 0.49326154589653015, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 30 + }, + { + "epoch": 0.043787629994526546, + "grad_norm": 0.3943138122558594, + "learning_rate": 0.0002, + "loss": 1.3859, + "step": 40 + }, + { + "epoch": 0.05473453749315818, + "grad_norm": 0.43292930722236633, + "learning_rate": 0.0002, + "loss": 1.3036, + "step": 50 + }, + { + "epoch": 0.06568144499178982, + "grad_norm": 0.3431817591190338, + "learning_rate": 0.0002, + "loss": 1.2427, + "step": 60 + }, + { + "epoch": 0.07662835249042145, + "grad_norm": 0.38011446595191956, + "learning_rate": 0.0002, + "loss": 1.3076, + "step": 70 + }, + { + "epoch": 0.08757525998905309, + "grad_norm": 0.7441071271896362, + "learning_rate": 0.0002, + "loss": 1.1641, + "step": 80 + }, + { + "epoch": 0.09852216748768473, + "grad_norm": 0.3359833061695099, + "learning_rate": 0.0002, + "loss": 1.1721, + "step": 90 + }, + { + "epoch": 0.10946907498631636, + "grad_norm": 0.3724392354488373, + "learning_rate": 0.0002, + "loss": 1.2774, + "step": 100 + }, + { + "epoch": 0.120415982484948, + "grad_norm": 0.40673762559890747, + "learning_rate": 0.0002, + "loss": 1.216, + "step": 110 + }, + { + "epoch": 0.13136288998357964, + "grad_norm": 0.40036800503730774, + "learning_rate": 0.0002, + "loss": 1.2611, + "step": 120 + }, + { + "epoch": 0.1423097974822113, + "grad_norm": 2.844191312789917, + "learning_rate": 0.0002, + "loss": 1.2436, + "step": 130 + }, + { + "epoch": 0.1532567049808429, + "grad_norm": 0.3104734420776367, + "learning_rate": 0.0002, + "loss": 1.2254, + "step": 140 + }, + { + "epoch": 0.16420361247947454, + "grad_norm": 0.3266797959804535, + "learning_rate": 0.0002, + "loss": 1.0498, + "step": 150 + }, + { + "epoch": 0.17515051997810618, + "grad_norm": 0.3079199194908142, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 160 + }, + { + "epoch": 0.18609742747673783, + "grad_norm": 0.3872479498386383, + "learning_rate": 0.0002, + "loss": 1.2923, + "step": 170 + }, + { + "epoch": 0.19704433497536947, + "grad_norm": 0.38654500246047974, + "learning_rate": 0.0002, + "loss": 1.1809, + "step": 180 + }, + { + "epoch": 0.20799124247400108, + "grad_norm": 0.2913552522659302, + "learning_rate": 0.0002, + "loss": 1.066, + "step": 190 + }, + { + "epoch": 0.21893814997263272, + "grad_norm": 0.2960572838783264, + "learning_rate": 0.0002, + "loss": 1.0868, + "step": 200 + }, + { + "epoch": 0.22988505747126436, + "grad_norm": 0.5175501108169556, + "learning_rate": 0.0002, + "loss": 1.136, + "step": 210 + }, + { + "epoch": 0.240831964969896, + "grad_norm": 1.2921574115753174, + "learning_rate": 0.0002, + "loss": 1.1556, + "step": 220 + }, + { + "epoch": 0.25177887246852765, + "grad_norm": 0.2675512135028839, + "learning_rate": 0.0002, + "loss": 1.1488, + "step": 230 + }, + { + "epoch": 0.2627257799671593, + "grad_norm": 0.3956190049648285, + "learning_rate": 0.0002, + "loss": 1.2764, + "step": 240 + }, + { + "epoch": 0.27367268746579093, + "grad_norm": 0.6022581458091736, + "learning_rate": 0.0002, + "loss": 1.1889, + "step": 250 + }, + { + "epoch": 0.2846195949644226, + "grad_norm": 1.1949563026428223, + "learning_rate": 0.0002, + "loss": 1.1981, + "step": 260 + }, + { + "epoch": 0.2955665024630542, + "grad_norm": 0.31173548102378845, + "learning_rate": 0.0002, + "loss": 1.1877, + "step": 270 + }, + { + "epoch": 0.3065134099616858, + "grad_norm": 0.2808472812175751, + "learning_rate": 0.0002, + "loss": 1.06, + "step": 280 + }, + { + "epoch": 0.31746031746031744, + "grad_norm": 0.3042023777961731, + "learning_rate": 0.0002, + "loss": 1.1752, + "step": 290 + }, + { + "epoch": 0.3284072249589491, + "grad_norm": 0.39915043115615845, + "learning_rate": 0.0002, + "loss": 1.3147, + "step": 300 + }, + { + "epoch": 0.3393541324575807, + "grad_norm": 0.39118197560310364, + "learning_rate": 0.0002, + "loss": 1.2425, + "step": 310 + }, + { + "epoch": 0.35030103995621237, + "grad_norm": 0.355010986328125, + "learning_rate": 0.0002, + "loss": 1.1363, + "step": 320 + }, + { + "epoch": 0.361247947454844, + "grad_norm": 0.29734086990356445, + "learning_rate": 0.0002, + "loss": 1.1925, + "step": 330 + }, + { + "epoch": 0.37219485495347565, + "grad_norm": 0.346096009016037, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 340 + }, + { + "epoch": 0.3831417624521073, + "grad_norm": 0.4829643666744232, + "learning_rate": 0.0002, + "loss": 1.1641, + "step": 350 + }, + { + "epoch": 0.39408866995073893, + "grad_norm": 0.4726872742176056, + "learning_rate": 0.0002, + "loss": 1.2808, + "step": 360 + }, + { + "epoch": 0.4050355774493706, + "grad_norm": 0.3130153715610504, + "learning_rate": 0.0002, + "loss": 1.1532, + "step": 370 + }, + { + "epoch": 0.41598248494800216, + "grad_norm": 0.5123590230941772, + "learning_rate": 0.0002, + "loss": 1.1842, + "step": 380 + }, + { + "epoch": 0.4269293924466338, + "grad_norm": 0.3444574773311615, + "learning_rate": 0.0002, + "loss": 1.1539, + "step": 390 + }, + { + "epoch": 0.43787629994526545, + "grad_norm": 0.5302175283432007, + "learning_rate": 0.0002, + "loss": 1.1756, + "step": 400 + }, + { + "epoch": 0.4488232074438971, + "grad_norm": 0.2713572680950165, + "learning_rate": 0.0002, + "loss": 1.1138, + "step": 410 + }, + { + "epoch": 0.45977011494252873, + "grad_norm": 0.29530611634254456, + "learning_rate": 0.0002, + "loss": 1.1281, + "step": 420 + }, + { + "epoch": 0.47071702244116037, + "grad_norm": 0.27282455563545227, + "learning_rate": 0.0002, + "loss": 1.1721, + "step": 430 + }, + { + "epoch": 0.481663929939792, + "grad_norm": 0.2647949755191803, + "learning_rate": 0.0002, + "loss": 1.1213, + "step": 440 + }, + { + "epoch": 0.49261083743842365, + "grad_norm": 0.35509347915649414, + "learning_rate": 0.0002, + "loss": 1.1656, + "step": 450 + }, + { + "epoch": 0.5035577449370553, + "grad_norm": 0.1959609091281891, + "learning_rate": 0.0002, + "loss": 1.1251, + "step": 460 + }, + { + "epoch": 0.5145046524356869, + "grad_norm": 0.40090155601501465, + "learning_rate": 0.0002, + "loss": 1.1889, + "step": 470 + }, + { + "epoch": 0.5254515599343186, + "grad_norm": 0.3354604244232178, + "learning_rate": 0.0002, + "loss": 1.156, + "step": 480 + }, + { + "epoch": 0.5363984674329502, + "grad_norm": 0.2758506238460541, + "learning_rate": 0.0002, + "loss": 1.1024, + "step": 490 + }, + { + "epoch": 0.5473453749315819, + "grad_norm": 0.3629051744937897, + "learning_rate": 0.0002, + "loss": 1.1108, + "step": 500 + }, + { + "epoch": 0.5582922824302134, + "grad_norm": 0.30802229046821594, + "learning_rate": 0.0002, + "loss": 1.2236, + "step": 510 + }, + { + "epoch": 0.5692391899288451, + "grad_norm": 0.3099463880062103, + "learning_rate": 0.0002, + "loss": 1.0424, + "step": 520 + }, + { + "epoch": 0.5801860974274767, + "grad_norm": 0.42299067974090576, + "learning_rate": 0.0002, + "loss": 1.255, + "step": 530 + }, + { + "epoch": 0.5911330049261084, + "grad_norm": 0.5392252802848816, + "learning_rate": 0.0002, + "loss": 1.1698, + "step": 540 + }, + { + "epoch": 0.60207991242474, + "grad_norm": 0.34768250584602356, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 550 + }, + { + "epoch": 0.6130268199233716, + "grad_norm": 0.28490015864372253, + "learning_rate": 0.0002, + "loss": 1.102, + "step": 560 + }, + { + "epoch": 0.6239737274220033, + "grad_norm": 0.34787994623184204, + "learning_rate": 0.0002, + "loss": 1.1111, + "step": 570 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 0.29058772325515747, + "learning_rate": 0.0002, + "loss": 1.0759, + "step": 580 + }, + { + "epoch": 0.6458675424192666, + "grad_norm": 0.4063778817653656, + "learning_rate": 0.0002, + "loss": 1.1157, + "step": 590 + }, + { + "epoch": 0.6568144499178982, + "grad_norm": 0.9244267344474792, + "learning_rate": 0.0002, + "loss": 1.1432, + "step": 600 + }, + { + "epoch": 0.6677613574165299, + "grad_norm": 0.27605190873146057, + "learning_rate": 0.0002, + "loss": 1.0591, + "step": 610 + }, + { + "epoch": 0.6787082649151615, + "grad_norm": 0.34346821904182434, + "learning_rate": 0.0002, + "loss": 1.2123, + "step": 620 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 0.3093279302120209, + "learning_rate": 0.0002, + "loss": 1.2195, + "step": 630 + }, + { + "epoch": 0.7006020799124247, + "grad_norm": 1.0069009065628052, + "learning_rate": 0.0002, + "loss": 1.2461, + "step": 640 + }, + { + "epoch": 0.7115489874110563, + "grad_norm": 0.5049130916595459, + "learning_rate": 0.0002, + "loss": 1.0892, + "step": 650 + }, + { + "epoch": 0.722495894909688, + "grad_norm": 0.3748924732208252, + "learning_rate": 0.0002, + "loss": 1.1404, + "step": 660 + }, + { + "epoch": 0.7334428024083196, + "grad_norm": 0.2964959144592285, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 670 + }, + { + "epoch": 0.7443897099069513, + "grad_norm": 0.4599986970424652, + "learning_rate": 0.0002, + "loss": 1.2617, + "step": 680 + }, + { + "epoch": 0.7553366174055829, + "grad_norm": 0.27292951941490173, + "learning_rate": 0.0002, + "loss": 1.088, + "step": 690 + }, + { + "epoch": 0.7662835249042146, + "grad_norm": 0.3123566806316376, + "learning_rate": 0.0002, + "loss": 1.2047, + "step": 700 + }, + { + "epoch": 0.7772304324028462, + "grad_norm": 0.28310710191726685, + "learning_rate": 0.0002, + "loss": 1.0021, + "step": 710 + }, + { + "epoch": 0.7881773399014779, + "grad_norm": 0.3279992341995239, + "learning_rate": 0.0002, + "loss": 1.2281, + "step": 720 + }, + { + "epoch": 0.7991242474001095, + "grad_norm": 0.28179168701171875, + "learning_rate": 0.0002, + "loss": 1.25, + "step": 730 + }, + { + "epoch": 0.8100711548987412, + "grad_norm": 0.31492987275123596, + "learning_rate": 0.0002, + "loss": 1.0602, + "step": 740 + }, + { + "epoch": 0.8210180623973727, + "grad_norm": 0.41821011900901794, + "learning_rate": 0.0002, + "loss": 1.2518, + "step": 750 + }, + { + "epoch": 0.8319649698960043, + "grad_norm": 0.325235515832901, + "learning_rate": 0.0002, + "loss": 1.1612, + "step": 760 + }, + { + "epoch": 0.842911877394636, + "grad_norm": 0.5366070866584778, + "learning_rate": 0.0002, + "loss": 1.27, + "step": 770 + }, + { + "epoch": 0.8538587848932676, + "grad_norm": 0.32570579648017883, + "learning_rate": 0.0002, + "loss": 1.0921, + "step": 780 + }, + { + "epoch": 0.8648056923918993, + "grad_norm": 0.3642968237400055, + "learning_rate": 0.0002, + "loss": 1.1032, + "step": 790 + }, + { + "epoch": 0.8757525998905309, + "grad_norm": 0.29713448882102966, + "learning_rate": 0.0002, + "loss": 1.1234, + "step": 800 + }, + { + "epoch": 0.8866995073891626, + "grad_norm": 0.23599444329738617, + "learning_rate": 0.0002, + "loss": 1.0978, + "step": 810 + }, + { + "epoch": 0.8976464148877942, + "grad_norm": 0.31522464752197266, + "learning_rate": 0.0002, + "loss": 1.1867, + "step": 820 + }, + { + "epoch": 0.9085933223864259, + "grad_norm": 0.32754790782928467, + "learning_rate": 0.0002, + "loss": 1.0208, + "step": 830 + }, + { + "epoch": 0.9195402298850575, + "grad_norm": 0.22741089761257172, + "learning_rate": 0.0002, + "loss": 0.9786, + "step": 840 + }, + { + "epoch": 0.9304871373836892, + "grad_norm": 0.3089679777622223, + "learning_rate": 0.0002, + "loss": 1.0689, + "step": 850 + }, + { + "epoch": 0.9414340448823207, + "grad_norm": 0.27440521121025085, + "learning_rate": 0.0002, + "loss": 1.0354, + "step": 860 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.3498363792896271, + "learning_rate": 0.0002, + "loss": 1.0417, + "step": 870 + }, + { + "epoch": 0.963327859879584, + "grad_norm": 0.47151854634284973, + "learning_rate": 0.0002, + "loss": 1.269, + "step": 880 + }, + { + "epoch": 0.9742747673782156, + "grad_norm": 0.24756591022014618, + "learning_rate": 0.0002, + "loss": 1.1174, + "step": 890 + }, + { + "epoch": 0.9852216748768473, + "grad_norm": 0.2600938677787781, + "learning_rate": 0.0002, + "loss": 1.0622, + "step": 900 + }, + { + "epoch": 0.9961685823754789, + "grad_norm": 0.2934586703777313, + "learning_rate": 0.0002, + "loss": 1.0968, + "step": 910 + }, + { + "epoch": 0.9994526546250684, + "eval_loss": 1.158464789390564, + "eval_runtime": 46.0774, + "eval_samples_per_second": 9.462, + "eval_steps_per_second": 1.194, + "step": 913 + }, + { + "epoch": 1.0071154898741106, + "grad_norm": 0.37776654958724976, + "learning_rate": 0.0002, + "loss": 1.0322, + "step": 920 + }, + { + "epoch": 1.0180623973727423, + "grad_norm": 0.31784629821777344, + "learning_rate": 0.0002, + "loss": 1.1346, + "step": 930 + }, + { + "epoch": 1.0290093048713738, + "grad_norm": 0.24244336783885956, + "learning_rate": 0.0002, + "loss": 0.8976, + "step": 940 + }, + { + "epoch": 1.0399562123700055, + "grad_norm": 0.3185454308986664, + "learning_rate": 0.0002, + "loss": 1.0172, + "step": 950 + }, + { + "epoch": 1.0509031198686372, + "grad_norm": 0.3589441478252411, + "learning_rate": 0.0002, + "loss": 1.1645, + "step": 960 + }, + { + "epoch": 1.0618500273672686, + "grad_norm": 0.38593578338623047, + "learning_rate": 0.0002, + "loss": 1.0317, + "step": 970 + }, + { + "epoch": 1.0727969348659003, + "grad_norm": 0.39694955945014954, + "learning_rate": 0.0002, + "loss": 0.9913, + "step": 980 + }, + { + "epoch": 1.083743842364532, + "grad_norm": 0.469817191362381, + "learning_rate": 0.0002, + "loss": 1.0893, + "step": 990 + }, + { + "epoch": 1.0946907498631637, + "grad_norm": 0.2634755074977875, + "learning_rate": 0.0002, + "loss": 0.983, + "step": 1000 + }, + { + "epoch": 1.1056376573617952, + "grad_norm": 0.43189436197280884, + "learning_rate": 0.0002, + "loss": 1.0144, + "step": 1010 + }, + { + "epoch": 1.116584564860427, + "grad_norm": 0.5559977889060974, + "learning_rate": 0.0002, + "loss": 0.9663, + "step": 1020 + }, + { + "epoch": 1.1275314723590586, + "grad_norm": 0.32100191712379456, + "learning_rate": 0.0002, + "loss": 1.0481, + "step": 1030 + }, + { + "epoch": 1.1384783798576903, + "grad_norm": 0.40179768204689026, + "learning_rate": 0.0002, + "loss": 1.1012, + "step": 1040 + }, + { + "epoch": 1.1494252873563218, + "grad_norm": 0.3659493029117584, + "learning_rate": 0.0002, + "loss": 1.0029, + "step": 1050 + }, + { + "epoch": 1.1603721948549535, + "grad_norm": 0.701704204082489, + "learning_rate": 0.0002, + "loss": 0.9597, + "step": 1060 + }, + { + "epoch": 1.1713191023535852, + "grad_norm": 0.3650563359260559, + "learning_rate": 0.0002, + "loss": 1.0204, + "step": 1070 + }, + { + "epoch": 1.1822660098522166, + "grad_norm": 0.3191976249217987, + "learning_rate": 0.0002, + "loss": 0.907, + "step": 1080 + }, + { + "epoch": 1.1932129173508483, + "grad_norm": 0.3615441918373108, + "learning_rate": 0.0002, + "loss": 1.0648, + "step": 1090 + }, + { + "epoch": 1.20415982484948, + "grad_norm": 0.39474231004714966, + "learning_rate": 0.0002, + "loss": 1.0067, + "step": 1100 + }, + { + "epoch": 1.2151067323481117, + "grad_norm": 0.3752822279930115, + "learning_rate": 0.0002, + "loss": 0.9852, + "step": 1110 + }, + { + "epoch": 1.2260536398467432, + "grad_norm": 0.4165991246700287, + "learning_rate": 0.0002, + "loss": 1.0067, + "step": 1120 + }, + { + "epoch": 1.237000547345375, + "grad_norm": 0.5326506495475769, + "learning_rate": 0.0002, + "loss": 1.0244, + "step": 1130 + }, + { + "epoch": 1.2479474548440066, + "grad_norm": 0.48845794796943665, + "learning_rate": 0.0002, + "loss": 1.0542, + "step": 1140 + }, + { + "epoch": 1.2588943623426383, + "grad_norm": 0.29910150170326233, + "learning_rate": 0.0002, + "loss": 1.0885, + "step": 1150 + }, + { + "epoch": 1.2698412698412698, + "grad_norm": 0.5069725513458252, + "learning_rate": 0.0002, + "loss": 1.2233, + "step": 1160 + }, + { + "epoch": 1.2807881773399015, + "grad_norm": 0.29500406980514526, + "learning_rate": 0.0002, + "loss": 1.0992, + "step": 1170 + }, + { + "epoch": 1.2917350848385332, + "grad_norm": 0.4711538851261139, + "learning_rate": 0.0002, + "loss": 1.0291, + "step": 1180 + }, + { + "epoch": 1.3026819923371646, + "grad_norm": 0.4203340709209442, + "learning_rate": 0.0002, + "loss": 1.0513, + "step": 1190 + }, + { + "epoch": 1.3136288998357963, + "grad_norm": 0.36101874709129333, + "learning_rate": 0.0002, + "loss": 1.1274, + "step": 1200 + }, + { + "epoch": 1.324575807334428, + "grad_norm": 0.4608800411224365, + "learning_rate": 0.0002, + "loss": 1.085, + "step": 1210 + }, + { + "epoch": 1.3355227148330597, + "grad_norm": 0.6570906639099121, + "learning_rate": 0.0002, + "loss": 1.0695, + "step": 1220 + }, + { + "epoch": 1.3464696223316914, + "grad_norm": 0.5352164506912231, + "learning_rate": 0.0002, + "loss": 0.9786, + "step": 1230 + }, + { + "epoch": 1.357416529830323, + "grad_norm": 0.3885001242160797, + "learning_rate": 0.0002, + "loss": 1.0513, + "step": 1240 + }, + { + "epoch": 1.3683634373289546, + "grad_norm": 0.2987913489341736, + "learning_rate": 0.0002, + "loss": 1.0611, + "step": 1250 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 0.42070427536964417, + "learning_rate": 0.0002, + "loss": 1.109, + "step": 1260 + }, + { + "epoch": 1.3902572523262178, + "grad_norm": 0.5957782864570618, + "learning_rate": 0.0002, + "loss": 1.1092, + "step": 1270 + }, + { + "epoch": 1.4012041598248495, + "grad_norm": 0.32898882031440735, + "learning_rate": 0.0002, + "loss": 1.0554, + "step": 1280 + }, + { + "epoch": 1.4121510673234812, + "grad_norm": 0.27624452114105225, + "learning_rate": 0.0002, + "loss": 0.909, + "step": 1290 + }, + { + "epoch": 1.4230979748221126, + "grad_norm": 0.49570828676223755, + "learning_rate": 0.0002, + "loss": 1.024, + "step": 1300 + }, + { + "epoch": 1.4340448823207443, + "grad_norm": 0.26191383600234985, + "learning_rate": 0.0002, + "loss": 1.0471, + "step": 1310 + }, + { + "epoch": 1.444991789819376, + "grad_norm": 0.35664042830467224, + "learning_rate": 0.0002, + "loss": 1.0788, + "step": 1320 + }, + { + "epoch": 1.4559386973180077, + "grad_norm": 0.45126354694366455, + "learning_rate": 0.0002, + "loss": 1.0108, + "step": 1330 + }, + { + "epoch": 1.4668856048166394, + "grad_norm": 0.37318357825279236, + "learning_rate": 0.0002, + "loss": 0.9571, + "step": 1340 + }, + { + "epoch": 1.477832512315271, + "grad_norm": 0.6428970098495483, + "learning_rate": 0.0002, + "loss": 1.0507, + "step": 1350 + }, + { + "epoch": 1.4887794198139026, + "grad_norm": 0.43256187438964844, + "learning_rate": 0.0002, + "loss": 1.0367, + "step": 1360 + }, + { + "epoch": 1.4997263273125343, + "grad_norm": 0.5343793630599976, + "learning_rate": 0.0002, + "loss": 1.1321, + "step": 1370 + }, + { + "epoch": 1.5106732348111658, + "grad_norm": 0.315437376499176, + "learning_rate": 0.0002, + "loss": 1.1054, + "step": 1380 + }, + { + "epoch": 1.5216201423097975, + "grad_norm": 0.41561153531074524, + "learning_rate": 0.0002, + "loss": 0.8916, + "step": 1390 + }, + { + "epoch": 1.5325670498084292, + "grad_norm": 0.3201070725917816, + "learning_rate": 0.0002, + "loss": 1.0391, + "step": 1400 + }, + { + "epoch": 1.5435139573070606, + "grad_norm": 0.505537211894989, + "learning_rate": 0.0002, + "loss": 1.14, + "step": 1410 + }, + { + "epoch": 1.5544608648056923, + "grad_norm": 0.3747410178184509, + "learning_rate": 0.0002, + "loss": 1.0775, + "step": 1420 + }, + { + "epoch": 1.565407772304324, + "grad_norm": 0.49385908246040344, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 1430 + }, + { + "epoch": 1.5763546798029555, + "grad_norm": 0.49831628799438477, + "learning_rate": 0.0002, + "loss": 1.1182, + "step": 1440 + }, + { + "epoch": 1.5873015873015874, + "grad_norm": 0.372127890586853, + "learning_rate": 0.0002, + "loss": 1.0079, + "step": 1450 + }, + { + "epoch": 1.598248494800219, + "grad_norm": 0.40070840716362, + "learning_rate": 0.0002, + "loss": 0.9931, + "step": 1460 + }, + { + "epoch": 1.6091954022988506, + "grad_norm": 0.34907400608062744, + "learning_rate": 0.0002, + "loss": 0.8954, + "step": 1470 + }, + { + "epoch": 1.6201423097974823, + "grad_norm": 0.4632418751716614, + "learning_rate": 0.0002, + "loss": 0.9743, + "step": 1480 + }, + { + "epoch": 1.6310892172961138, + "grad_norm": 0.40164515376091003, + "learning_rate": 0.0002, + "loss": 1.0103, + "step": 1490 + }, + { + "epoch": 1.6420361247947455, + "grad_norm": 0.3214994966983795, + "learning_rate": 0.0002, + "loss": 0.9523, + "step": 1500 + }, + { + "epoch": 1.6529830322933772, + "grad_norm": 0.3727897107601166, + "learning_rate": 0.0002, + "loss": 1.0161, + "step": 1510 + }, + { + "epoch": 1.6639299397920086, + "grad_norm": 0.3817640542984009, + "learning_rate": 0.0002, + "loss": 1.0443, + "step": 1520 + }, + { + "epoch": 1.6748768472906403, + "grad_norm": 0.5592136979103088, + "learning_rate": 0.0002, + "loss": 1.0511, + "step": 1530 + }, + { + "epoch": 1.685823754789272, + "grad_norm": 0.44636598229408264, + "learning_rate": 0.0002, + "loss": 0.9682, + "step": 1540 + }, + { + "epoch": 1.6967706622879035, + "grad_norm": 0.40441709756851196, + "learning_rate": 0.0002, + "loss": 1.033, + "step": 1550 + }, + { + "epoch": 1.7077175697865354, + "grad_norm": 0.3243522644042969, + "learning_rate": 0.0002, + "loss": 1.0857, + "step": 1560 + }, + { + "epoch": 1.718664477285167, + "grad_norm": 0.34277570247650146, + "learning_rate": 0.0002, + "loss": 0.9482, + "step": 1570 + }, + { + "epoch": 1.7296113847837986, + "grad_norm": 0.3279995024204254, + "learning_rate": 0.0002, + "loss": 0.974, + "step": 1580 + }, + { + "epoch": 1.7405582922824303, + "grad_norm": 0.41968777775764465, + "learning_rate": 0.0002, + "loss": 0.9414, + "step": 1590 + }, + { + "epoch": 1.7515051997810618, + "grad_norm": 0.39464613795280457, + "learning_rate": 0.0002, + "loss": 0.9768, + "step": 1600 + }, + { + "epoch": 1.7624521072796935, + "grad_norm": 0.3839009404182434, + "learning_rate": 0.0002, + "loss": 1.0347, + "step": 1610 + }, + { + "epoch": 1.7733990147783252, + "grad_norm": 0.3250715434551239, + "learning_rate": 0.0002, + "loss": 0.9195, + "step": 1620 + }, + { + "epoch": 1.7843459222769567, + "grad_norm": 0.5166561007499695, + "learning_rate": 0.0002, + "loss": 1.0049, + "step": 1630 + }, + { + "epoch": 1.7952928297755884, + "grad_norm": 0.4115183353424072, + "learning_rate": 0.0002, + "loss": 1.0364, + "step": 1640 + }, + { + "epoch": 1.80623973727422, + "grad_norm": 0.373780220746994, + "learning_rate": 0.0002, + "loss": 1.0248, + "step": 1650 + }, + { + "epoch": 1.8171866447728515, + "grad_norm": 0.49697014689445496, + "learning_rate": 0.0002, + "loss": 1.0984, + "step": 1660 + }, + { + "epoch": 1.8281335522714834, + "grad_norm": 1.0308938026428223, + "learning_rate": 0.0002, + "loss": 1.0089, + "step": 1670 + }, + { + "epoch": 1.839080459770115, + "grad_norm": 0.4851366877555847, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 1680 + }, + { + "epoch": 1.8500273672687466, + "grad_norm": 0.3262481391429901, + "learning_rate": 0.0002, + "loss": 0.9533, + "step": 1690 + }, + { + "epoch": 1.8609742747673783, + "grad_norm": 0.6904496550559998, + "learning_rate": 0.0002, + "loss": 1.048, + "step": 1700 + }, + { + "epoch": 1.8719211822660098, + "grad_norm": 0.49789851903915405, + "learning_rate": 0.0002, + "loss": 1.0577, + "step": 1710 + }, + { + "epoch": 1.8828680897646415, + "grad_norm": 0.3035794198513031, + "learning_rate": 0.0002, + "loss": 1.0258, + "step": 1720 + }, + { + "epoch": 1.8938149972632732, + "grad_norm": 0.4588414430618286, + "learning_rate": 0.0002, + "loss": 0.9916, + "step": 1730 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 0.4313034117221832, + "learning_rate": 0.0002, + "loss": 0.9526, + "step": 1740 + }, + { + "epoch": 1.9157088122605364, + "grad_norm": 0.38562044501304626, + "learning_rate": 0.0002, + "loss": 1.0857, + "step": 1750 + }, + { + "epoch": 1.926655719759168, + "grad_norm": 0.46947410702705383, + "learning_rate": 0.0002, + "loss": 0.9763, + "step": 1760 + }, + { + "epoch": 1.9376026272577995, + "grad_norm": 0.3848404884338379, + "learning_rate": 0.0002, + "loss": 0.9967, + "step": 1770 + }, + { + "epoch": 1.9485495347564314, + "grad_norm": 0.30422744154930115, + "learning_rate": 0.0002, + "loss": 1.0474, + "step": 1780 + }, + { + "epoch": 1.959496442255063, + "grad_norm": 0.41100990772247314, + "learning_rate": 0.0002, + "loss": 1.022, + "step": 1790 + }, + { + "epoch": 1.9704433497536946, + "grad_norm": 0.3492335081100464, + "learning_rate": 0.0002, + "loss": 1.003, + "step": 1800 + }, + { + "epoch": 1.9813902572523263, + "grad_norm": 0.364577978849411, + "learning_rate": 0.0002, + "loss": 0.9891, + "step": 1810 + }, + { + "epoch": 1.9923371647509578, + "grad_norm": 0.4312075674533844, + "learning_rate": 0.0002, + "loss": 1.0699, + "step": 1820 + }, + { + "epoch": 2.0, + "eval_loss": 1.14472496509552, + "eval_runtime": 46.0786, + "eval_samples_per_second": 9.462, + "eval_steps_per_second": 1.194, + "step": 1827 + }, + { + "epoch": 2.0032840722495897, + "grad_norm": 0.5989689230918884, + "learning_rate": 0.0002, + "loss": 1.0218, + "step": 1830 + }, + { + "epoch": 2.014230979748221, + "grad_norm": 0.49720922112464905, + "learning_rate": 0.0002, + "loss": 0.9384, + "step": 1840 + }, + { + "epoch": 2.0251778872468527, + "grad_norm": 0.42675456404685974, + "learning_rate": 0.0002, + "loss": 0.8482, + "step": 1850 + }, + { + "epoch": 2.0361247947454846, + "grad_norm": 0.4637208580970764, + "learning_rate": 0.0002, + "loss": 0.8487, + "step": 1860 + }, + { + "epoch": 2.047071702244116, + "grad_norm": 0.8329976797103882, + "learning_rate": 0.0002, + "loss": 0.8212, + "step": 1870 + }, + { + "epoch": 2.0580186097427475, + "grad_norm": 0.7869427800178528, + "learning_rate": 0.0002, + "loss": 0.9547, + "step": 1880 + }, + { + "epoch": 2.0689655172413794, + "grad_norm": 0.4927455186843872, + "learning_rate": 0.0002, + "loss": 0.9351, + "step": 1890 + }, + { + "epoch": 2.079912424740011, + "grad_norm": 0.6264246702194214, + "learning_rate": 0.0002, + "loss": 0.8652, + "step": 1900 + }, + { + "epoch": 2.0908593322386424, + "grad_norm": 1.1164122819900513, + "learning_rate": 0.0002, + "loss": 0.9095, + "step": 1910 + }, + { + "epoch": 2.1018062397372743, + "grad_norm": 0.5283981561660767, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 1920 + }, + { + "epoch": 2.112753147235906, + "grad_norm": 0.45621731877326965, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 1930 + }, + { + "epoch": 2.1237000547345373, + "grad_norm": 1.381791591644287, + "learning_rate": 0.0002, + "loss": 0.9184, + "step": 1940 + }, + { + "epoch": 2.134646962233169, + "grad_norm": 0.5151259899139404, + "learning_rate": 0.0002, + "loss": 0.9006, + "step": 1950 + }, + { + "epoch": 2.1455938697318007, + "grad_norm": 0.9806339740753174, + "learning_rate": 0.0002, + "loss": 0.8436, + "step": 1960 + }, + { + "epoch": 2.1565407772304326, + "grad_norm": 0.4734154939651489, + "learning_rate": 0.0002, + "loss": 0.8749, + "step": 1970 + }, + { + "epoch": 2.167487684729064, + "grad_norm": 0.9553168416023254, + "learning_rate": 0.0002, + "loss": 0.9172, + "step": 1980 + }, + { + "epoch": 2.1784345922276955, + "grad_norm": 0.5895838141441345, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1990 + }, + { + "epoch": 2.1893814997263275, + "grad_norm": 0.4488855302333832, + "learning_rate": 0.0002, + "loss": 0.7841, + "step": 2000 + }, + { + "epoch": 2.200328407224959, + "grad_norm": 1.0760235786437988, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 2010 + }, + { + "epoch": 2.2112753147235904, + "grad_norm": 0.5038785338401794, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 2020 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.59819495677948, + "learning_rate": 0.0002, + "loss": 0.8973, + "step": 2030 + }, + { + "epoch": 2.233169129720854, + "grad_norm": 0.5012075304985046, + "learning_rate": 0.0002, + "loss": 0.8009, + "step": 2040 + }, + { + "epoch": 2.2441160372194853, + "grad_norm": 0.44978439807891846, + "learning_rate": 0.0002, + "loss": 0.9087, + "step": 2050 + }, + { + "epoch": 2.255062944718117, + "grad_norm": 0.5350462198257446, + "learning_rate": 0.0002, + "loss": 0.823, + "step": 2060 + }, + { + "epoch": 2.2660098522167487, + "grad_norm": 0.6020669937133789, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 2070 + }, + { + "epoch": 2.2769567597153806, + "grad_norm": 0.5246821045875549, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 2080 + }, + { + "epoch": 2.287903667214012, + "grad_norm": 0.5711268782615662, + "learning_rate": 0.0002, + "loss": 0.8984, + "step": 2090 + }, + { + "epoch": 2.2988505747126435, + "grad_norm": 0.617317259311676, + "learning_rate": 0.0002, + "loss": 0.9093, + "step": 2100 + }, + { + "epoch": 2.3097974822112755, + "grad_norm": 0.8608947396278381, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 2110 + }, + { + "epoch": 2.320744389709907, + "grad_norm": 0.4739076793193817, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 2120 + }, + { + "epoch": 2.3316912972085384, + "grad_norm": 0.5538856983184814, + "learning_rate": 0.0002, + "loss": 0.84, + "step": 2130 + }, + { + "epoch": 2.3426382047071703, + "grad_norm": 0.6064935326576233, + "learning_rate": 0.0002, + "loss": 0.8994, + "step": 2140 + }, + { + "epoch": 2.353585112205802, + "grad_norm": 0.5019068121910095, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 2150 + }, + { + "epoch": 2.3645320197044333, + "grad_norm": 0.45340514183044434, + "learning_rate": 0.0002, + "loss": 0.9576, + "step": 2160 + }, + { + "epoch": 2.375478927203065, + "grad_norm": 0.7347203493118286, + "learning_rate": 0.0002, + "loss": 0.9254, + "step": 2170 + }, + { + "epoch": 2.3864258347016967, + "grad_norm": 0.46922534704208374, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 2180 + }, + { + "epoch": 2.3973727422003286, + "grad_norm": 0.5507845878601074, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 2190 + }, + { + "epoch": 2.40831964969896, + "grad_norm": 0.5621911883354187, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 2200 + }, + { + "epoch": 2.4192665571975915, + "grad_norm": 0.5023514032363892, + "learning_rate": 0.0002, + "loss": 0.8569, + "step": 2210 + }, + { + "epoch": 2.4302134646962235, + "grad_norm": 0.6124861240386963, + "learning_rate": 0.0002, + "loss": 0.8355, + "step": 2220 + }, + { + "epoch": 2.441160372194855, + "grad_norm": 0.49614205956459045, + "learning_rate": 0.0002, + "loss": 0.885, + "step": 2230 + }, + { + "epoch": 2.4521072796934864, + "grad_norm": 0.6477900743484497, + "learning_rate": 0.0002, + "loss": 0.8008, + "step": 2240 + }, + { + "epoch": 2.4630541871921183, + "grad_norm": 0.5868843793869019, + "learning_rate": 0.0002, + "loss": 0.8622, + "step": 2250 + }, + { + "epoch": 2.47400109469075, + "grad_norm": 0.4364610016345978, + "learning_rate": 0.0002, + "loss": 0.8498, + "step": 2260 + }, + { + "epoch": 2.4849480021893813, + "grad_norm": 0.5792964696884155, + "learning_rate": 0.0002, + "loss": 0.8378, + "step": 2270 + }, + { + "epoch": 2.495894909688013, + "grad_norm": 0.5421269536018372, + "learning_rate": 0.0002, + "loss": 0.8743, + "step": 2280 + }, + { + "epoch": 2.5068418171866447, + "grad_norm": 0.5525493025779724, + "learning_rate": 0.0002, + "loss": 0.9637, + "step": 2290 + }, + { + "epoch": 2.5177887246852766, + "grad_norm": 0.6463850140571594, + "learning_rate": 0.0002, + "loss": 0.8075, + "step": 2300 + }, + { + "epoch": 2.528735632183908, + "grad_norm": 0.6861311793327332, + "learning_rate": 0.0002, + "loss": 0.8591, + "step": 2310 + }, + { + "epoch": 2.5396825396825395, + "grad_norm": 0.5563555359840393, + "learning_rate": 0.0002, + "loss": 0.9287, + "step": 2320 + }, + { + "epoch": 2.5506294471811715, + "grad_norm": 0.5721169114112854, + "learning_rate": 0.0002, + "loss": 0.945, + "step": 2330 + }, + { + "epoch": 2.561576354679803, + "grad_norm": 0.5258274674415588, + "learning_rate": 0.0002, + "loss": 0.8271, + "step": 2340 + }, + { + "epoch": 2.572523262178435, + "grad_norm": 0.7057380676269531, + "learning_rate": 0.0002, + "loss": 0.8515, + "step": 2350 + }, + { + "epoch": 2.5834701696770663, + "grad_norm": 0.6869027614593506, + "learning_rate": 0.0002, + "loss": 0.8615, + "step": 2360 + }, + { + "epoch": 2.594417077175698, + "grad_norm": 0.4960809648036957, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 2370 + }, + { + "epoch": 2.6053639846743293, + "grad_norm": 0.9288380146026611, + "learning_rate": 0.0002, + "loss": 0.8476, + "step": 2380 + }, + { + "epoch": 2.616310892172961, + "grad_norm": 0.3765334188938141, + "learning_rate": 0.0002, + "loss": 0.873, + "step": 2390 + }, + { + "epoch": 2.6272577996715927, + "grad_norm": 0.7487865686416626, + "learning_rate": 0.0002, + "loss": 0.8764, + "step": 2400 + }, + { + "epoch": 2.6382047071702246, + "grad_norm": 0.6141156554222107, + "learning_rate": 0.0002, + "loss": 0.7577, + "step": 2410 + }, + { + "epoch": 2.649151614668856, + "grad_norm": 0.8420507907867432, + "learning_rate": 0.0002, + "loss": 0.8534, + "step": 2420 + }, + { + "epoch": 2.6600985221674875, + "grad_norm": 0.53386390209198, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 2430 + }, + { + "epoch": 2.6710454296661195, + "grad_norm": 0.5520607233047485, + "learning_rate": 0.0002, + "loss": 0.8486, + "step": 2440 + }, + { + "epoch": 2.681992337164751, + "grad_norm": 0.5337599515914917, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 2450 + }, + { + "epoch": 2.692939244663383, + "grad_norm": 0.48790836334228516, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 2460 + }, + { + "epoch": 2.7038861521620143, + "grad_norm": 0.8287786245346069, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 2470 + }, + { + "epoch": 2.714833059660646, + "grad_norm": 0.5876168608665466, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 2480 + }, + { + "epoch": 2.7257799671592773, + "grad_norm": 0.5206760764122009, + "learning_rate": 0.0002, + "loss": 0.8773, + "step": 2490 + }, + { + "epoch": 2.736726874657909, + "grad_norm": 0.5619136691093445, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 2500 + }, + { + "epoch": 2.7476737821565407, + "grad_norm": 0.5614883899688721, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 2510 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 0.6157700419425964, + "learning_rate": 0.0002, + "loss": 0.8817, + "step": 2520 + }, + { + "epoch": 2.769567597153804, + "grad_norm": 0.5529953837394714, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 2530 + }, + { + "epoch": 2.7805145046524355, + "grad_norm": 0.6731224060058594, + "learning_rate": 0.0002, + "loss": 0.8881, + "step": 2540 + }, + { + "epoch": 2.7914614121510675, + "grad_norm": 0.6960386633872986, + "learning_rate": 0.0002, + "loss": 0.9125, + "step": 2550 + }, + { + "epoch": 2.802408319649699, + "grad_norm": 0.5203493237495422, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 2560 + }, + { + "epoch": 2.813355227148331, + "grad_norm": 1.036837100982666, + "learning_rate": 0.0002, + "loss": 0.7951, + "step": 2570 + }, + { + "epoch": 2.8243021346469623, + "grad_norm": 0.6125805377960205, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 2580 + }, + { + "epoch": 2.835249042145594, + "grad_norm": 0.6298092603683472, + "learning_rate": 0.0002, + "loss": 0.8939, + "step": 2590 + }, + { + "epoch": 2.8461959496442253, + "grad_norm": 0.5882203578948975, + "learning_rate": 0.0002, + "loss": 0.8845, + "step": 2600 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.8619399666786194, + "learning_rate": 0.0002, + "loss": 0.8921, + "step": 2610 + }, + { + "epoch": 2.8680897646414887, + "grad_norm": 0.4722687304019928, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 2620 + }, + { + "epoch": 2.8790366721401206, + "grad_norm": 0.47399574518203735, + "learning_rate": 0.0002, + "loss": 0.9147, + "step": 2630 + }, + { + "epoch": 2.889983579638752, + "grad_norm": 0.5639172792434692, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 2640 + }, + { + "epoch": 2.9009304871373836, + "grad_norm": 0.4676816761493683, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 2650 + }, + { + "epoch": 2.9118773946360155, + "grad_norm": 0.6906291246414185, + "learning_rate": 0.0002, + "loss": 0.8966, + "step": 2660 + }, + { + "epoch": 2.922824302134647, + "grad_norm": 0.4369746148586273, + "learning_rate": 0.0002, + "loss": 0.8966, + "step": 2670 + }, + { + "epoch": 2.933771209633279, + "grad_norm": 0.46423083543777466, + "learning_rate": 0.0002, + "loss": 0.9173, + "step": 2680 + }, + { + "epoch": 2.9447181171319103, + "grad_norm": 0.5700525045394897, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 2690 + }, + { + "epoch": 2.955665024630542, + "grad_norm": 0.6221476793289185, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 2700 + }, + { + "epoch": 2.9666119321291733, + "grad_norm": 0.6102682948112488, + "learning_rate": 0.0002, + "loss": 0.8538, + "step": 2710 + }, + { + "epoch": 2.977558839627805, + "grad_norm": 0.5317878723144531, + "learning_rate": 0.0002, + "loss": 0.9779, + "step": 2720 + }, + { + "epoch": 2.9885057471264367, + "grad_norm": 0.4438510835170746, + "learning_rate": 0.0002, + "loss": 0.9314, + "step": 2730 + }, + { + "epoch": 2.9994526546250686, + "grad_norm": 0.5022130012512207, + "learning_rate": 0.0002, + "loss": 0.8553, + "step": 2740 + }, + { + "epoch": 2.9994526546250686, + "eval_loss": 1.1722838878631592, + "eval_runtime": 46.0829, + "eval_samples_per_second": 9.461, + "eval_steps_per_second": 1.194, + "step": 2740 + }, + { + "epoch": 3.0103995621237, + "grad_norm": 0.6384502053260803, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 2750 + }, + { + "epoch": 3.0213464696223316, + "grad_norm": 0.9928722381591797, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 2760 + }, + { + "epoch": 3.0322933771209635, + "grad_norm": 0.7813051342964172, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 2770 + }, + { + "epoch": 3.043240284619595, + "grad_norm": 1.0202556848526, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 2780 + }, + { + "epoch": 3.0541871921182264, + "grad_norm": 0.7581062316894531, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 2790 + }, + { + "epoch": 3.0651340996168583, + "grad_norm": 0.6252710223197937, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 2800 + }, + { + "epoch": 3.07608100711549, + "grad_norm": 0.7738662958145142, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 2810 + }, + { + "epoch": 3.0870279146141213, + "grad_norm": 0.7381885051727295, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 2820 + }, + { + "epoch": 3.097974822112753, + "grad_norm": 0.9197564721107483, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 2830 + }, + { + "epoch": 3.1089217296113847, + "grad_norm": 1.000976800918579, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 2840 + }, + { + "epoch": 3.1198686371100166, + "grad_norm": 0.7559131383895874, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 2850 + }, + { + "epoch": 3.130815544608648, + "grad_norm": 0.7213780879974365, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 2860 + }, + { + "epoch": 3.1417624521072796, + "grad_norm": 0.945939838886261, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 2870 + }, + { + "epoch": 3.1527093596059115, + "grad_norm": 0.7277454137802124, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 2880 + }, + { + "epoch": 3.163656267104543, + "grad_norm": 0.762026846408844, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 2890 + }, + { + "epoch": 3.1746031746031744, + "grad_norm": 0.6471221446990967, + "learning_rate": 0.0002, + "loss": 0.5332, + "step": 2900 + }, + { + "epoch": 3.1855500821018063, + "grad_norm": 0.6018978357315063, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 2910 + }, + { + "epoch": 3.196496989600438, + "grad_norm": 0.8607320785522461, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 2920 + }, + { + "epoch": 3.2074438970990693, + "grad_norm": 0.8854126334190369, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 2930 + }, + { + "epoch": 3.218390804597701, + "grad_norm": 0.6620870232582092, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 2940 + }, + { + "epoch": 3.2293377120963327, + "grad_norm": 0.7377511858940125, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 2950 + }, + { + "epoch": 3.2402846195949646, + "grad_norm": 0.7803301811218262, + "learning_rate": 0.0002, + "loss": 0.6798, + "step": 2960 + }, + { + "epoch": 3.251231527093596, + "grad_norm": 0.834061861038208, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 2970 + }, + { + "epoch": 3.2621784345922276, + "grad_norm": 0.8496041893959045, + "learning_rate": 0.0002, + "loss": 0.6894, + "step": 2980 + }, + { + "epoch": 3.2731253420908595, + "grad_norm": 0.7967984676361084, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 2990 + }, + { + "epoch": 3.284072249589491, + "grad_norm": 1.0207016468048096, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 3000 + }, + { + "epoch": 3.2950191570881224, + "grad_norm": 0.7855771780014038, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 3010 + }, + { + "epoch": 3.3059660645867543, + "grad_norm": 0.9427546858787537, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 3020 + }, + { + "epoch": 3.316912972085386, + "grad_norm": 0.823542594909668, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 3030 + }, + { + "epoch": 3.3278598795840173, + "grad_norm": 0.9826635122299194, + "learning_rate": 0.0002, + "loss": 0.7109, + "step": 3040 + }, + { + "epoch": 3.338806787082649, + "grad_norm": 0.7259827852249146, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 3050 + }, + { + "epoch": 3.3497536945812807, + "grad_norm": 0.7774739861488342, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 3060 + }, + { + "epoch": 3.3607006020799126, + "grad_norm": 0.7394293546676636, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 3070 + }, + { + "epoch": 3.371647509578544, + "grad_norm": 0.9017578959465027, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 3080 + }, + { + "epoch": 3.3825944170771756, + "grad_norm": 0.7451054453849792, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 3090 + }, + { + "epoch": 3.3935413245758075, + "grad_norm": 0.7321506142616272, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 3100 + }, + { + "epoch": 3.404488232074439, + "grad_norm": 0.6721828579902649, + "learning_rate": 0.0002, + "loss": 0.6555, + "step": 3110 + }, + { + "epoch": 3.4154351395730704, + "grad_norm": 0.774022102355957, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 3120 + }, + { + "epoch": 3.4263820470717024, + "grad_norm": 0.9143537282943726, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 3130 + }, + { + "epoch": 3.437328954570334, + "grad_norm": 1.226087212562561, + "learning_rate": 0.0002, + "loss": 0.6899, + "step": 3140 + }, + { + "epoch": 3.4482758620689653, + "grad_norm": 0.7545496225357056, + "learning_rate": 0.0002, + "loss": 0.6719, + "step": 3150 + }, + { + "epoch": 3.4592227695675972, + "grad_norm": 0.6515635848045349, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 3160 + }, + { + "epoch": 3.4701696770662287, + "grad_norm": 0.9297090172767639, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 3170 + }, + { + "epoch": 3.4811165845648606, + "grad_norm": 1.0130730867385864, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 3180 + }, + { + "epoch": 3.492063492063492, + "grad_norm": 0.7654589414596558, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 3190 + }, + { + "epoch": 3.5030103995621236, + "grad_norm": 0.9954977631568909, + "learning_rate": 0.0002, + "loss": 0.7401, + "step": 3200 + }, + { + "epoch": 3.5139573070607555, + "grad_norm": 0.6027487516403198, + "learning_rate": 0.0002, + "loss": 0.6661, + "step": 3210 + }, + { + "epoch": 3.524904214559387, + "grad_norm": 0.741770327091217, + "learning_rate": 0.0002, + "loss": 0.6963, + "step": 3220 + }, + { + "epoch": 3.535851122058019, + "grad_norm": 1.0534909963607788, + "learning_rate": 0.0002, + "loss": 0.8112, + "step": 3230 + }, + { + "epoch": 3.5467980295566504, + "grad_norm": 0.937772274017334, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 3240 + }, + { + "epoch": 3.557744937055282, + "grad_norm": 0.8504213690757751, + "learning_rate": 0.0002, + "loss": 0.6681, + "step": 3250 + }, + { + "epoch": 3.5686918445539133, + "grad_norm": 0.7755007147789001, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 3260 + }, + { + "epoch": 3.5796387520525452, + "grad_norm": 1.0193358659744263, + "learning_rate": 0.0002, + "loss": 0.6213, + "step": 3270 + }, + { + "epoch": 3.5905856595511767, + "grad_norm": 0.8440536856651306, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 3280 + }, + { + "epoch": 3.6015325670498086, + "grad_norm": 0.6195939183235168, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 3290 + }, + { + "epoch": 3.61247947454844, + "grad_norm": 0.8608590960502625, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 3300 + }, + { + "epoch": 3.6234263820470716, + "grad_norm": 0.6772327423095703, + "learning_rate": 0.0002, + "loss": 0.7301, + "step": 3310 + }, + { + "epoch": 3.6343732895457035, + "grad_norm": 0.8031839728355408, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 3320 + }, + { + "epoch": 3.645320197044335, + "grad_norm": 0.6080502271652222, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 3330 + }, + { + "epoch": 3.656267104542967, + "grad_norm": 0.8007240891456604, + "learning_rate": 0.0002, + "loss": 0.7431, + "step": 3340 + }, + { + "epoch": 3.6672140120415984, + "grad_norm": 0.8060704469680786, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 3350 + }, + { + "epoch": 3.67816091954023, + "grad_norm": 0.7547586560249329, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 3360 + }, + { + "epoch": 3.6891078270388613, + "grad_norm": 0.686851978302002, + "learning_rate": 0.0002, + "loss": 0.7066, + "step": 3370 + }, + { + "epoch": 3.7000547345374932, + "grad_norm": 0.9429075717926025, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 3380 + }, + { + "epoch": 3.7110016420361247, + "grad_norm": 0.7283591032028198, + "learning_rate": 0.0002, + "loss": 0.6673, + "step": 3390 + }, + { + "epoch": 3.7219485495347566, + "grad_norm": 0.8323085904121399, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 3400 + }, + { + "epoch": 3.732895457033388, + "grad_norm": 0.8529590964317322, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 3410 + }, + { + "epoch": 3.7438423645320196, + "grad_norm": 0.731752872467041, + "learning_rate": 0.0002, + "loss": 0.6555, + "step": 3420 + }, + { + "epoch": 3.7547892720306515, + "grad_norm": 0.8572278618812561, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 3430 + }, + { + "epoch": 3.765736179529283, + "grad_norm": 0.7408691048622131, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 3440 + }, + { + "epoch": 3.776683087027915, + "grad_norm": 0.7470445036888123, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 3450 + }, + { + "epoch": 3.7876299945265464, + "grad_norm": 0.6806244254112244, + "learning_rate": 0.0002, + "loss": 0.7241, + "step": 3460 + }, + { + "epoch": 3.798576902025178, + "grad_norm": 0.9129069447517395, + "learning_rate": 0.0002, + "loss": 0.7739, + "step": 3470 + }, + { + "epoch": 3.8095238095238093, + "grad_norm": 0.8717501759529114, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 3480 + }, + { + "epoch": 3.8204707170224412, + "grad_norm": 0.6761979460716248, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 3490 + }, + { + "epoch": 3.8314176245210727, + "grad_norm": 1.0054380893707275, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 3500 + }, + { + "epoch": 3.8423645320197046, + "grad_norm": 1.1224009990692139, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 3510 + }, + { + "epoch": 3.853311439518336, + "grad_norm": 0.8997692465782166, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 3520 + }, + { + "epoch": 3.8642583470169676, + "grad_norm": 1.0086902379989624, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 3530 + }, + { + "epoch": 3.8752052545155995, + "grad_norm": 0.772739589214325, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 3540 + }, + { + "epoch": 3.886152162014231, + "grad_norm": 1.211774230003357, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 3550 + }, + { + "epoch": 3.897099069512863, + "grad_norm": 0.9572356939315796, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 3560 + }, + { + "epoch": 3.9080459770114944, + "grad_norm": 0.7887842655181885, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 3570 + }, + { + "epoch": 3.918992884510126, + "grad_norm": 0.7308389544487, + "learning_rate": 0.0002, + "loss": 0.7576, + "step": 3580 + }, + { + "epoch": 3.9299397920087573, + "grad_norm": 1.0182650089263916, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 3590 + }, + { + "epoch": 3.9408866995073892, + "grad_norm": 0.8000147342681885, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 3600 + }, + { + "epoch": 3.9518336070060207, + "grad_norm": 0.7385728359222412, + "learning_rate": 0.0002, + "loss": 0.6244, + "step": 3610 + }, + { + "epoch": 3.9627805145046526, + "grad_norm": 0.9233261942863464, + "learning_rate": 0.0002, + "loss": 0.6718, + "step": 3620 + }, + { + "epoch": 3.973727422003284, + "grad_norm": 0.8486751914024353, + "learning_rate": 0.0002, + "loss": 0.6508, + "step": 3630 + }, + { + "epoch": 3.9846743295019156, + "grad_norm": 0.7593663334846497, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 3640 + }, + { + "epoch": 3.9956212370005475, + "grad_norm": 0.7885415554046631, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 3650 + }, + { + "epoch": 4.0, + "eval_loss": 1.250312328338623, + "eval_runtime": 46.0842, + "eval_samples_per_second": 9.461, + "eval_steps_per_second": 1.193, + "step": 3654 + }, + { + "epoch": 4.006568144499179, + "grad_norm": 0.6591703295707703, + "learning_rate": 0.0002, + "loss": 0.5547, + "step": 3660 + }, + { + "epoch": 4.017515051997811, + "grad_norm": 1.36927330493927, + "learning_rate": 0.0002, + "loss": 0.5301, + "step": 3670 + }, + { + "epoch": 4.028461959496442, + "grad_norm": 0.8106328845024109, + "learning_rate": 0.0002, + "loss": 0.4466, + "step": 3680 + }, + { + "epoch": 4.039408866995074, + "grad_norm": 0.7592712044715881, + "learning_rate": 0.0002, + "loss": 0.4861, + "step": 3690 + }, + { + "epoch": 4.050355774493705, + "grad_norm": 0.9518909454345703, + "learning_rate": 0.0002, + "loss": 0.5103, + "step": 3700 + }, + { + "epoch": 4.061302681992337, + "grad_norm": 0.7805967330932617, + "learning_rate": 0.0002, + "loss": 0.4638, + "step": 3710 + }, + { + "epoch": 4.072249589490969, + "grad_norm": 1.3146334886550903, + "learning_rate": 0.0002, + "loss": 0.4556, + "step": 3720 + }, + { + "epoch": 4.083196496989601, + "grad_norm": 1.1611138582229614, + "learning_rate": 0.0002, + "loss": 0.5635, + "step": 3730 + }, + { + "epoch": 4.094143404488232, + "grad_norm": 0.8173232078552246, + "learning_rate": 0.0002, + "loss": 0.3845, + "step": 3740 + }, + { + "epoch": 4.105090311986864, + "grad_norm": 0.7848323583602905, + "learning_rate": 0.0002, + "loss": 0.4911, + "step": 3750 + }, + { + "epoch": 4.116037219485495, + "grad_norm": 1.3183201551437378, + "learning_rate": 0.0002, + "loss": 0.4519, + "step": 3760 + }, + { + "epoch": 4.1269841269841265, + "grad_norm": 1.1936529874801636, + "learning_rate": 0.0002, + "loss": 0.5083, + "step": 3770 + }, + { + "epoch": 4.137931034482759, + "grad_norm": 1.1078993082046509, + "learning_rate": 0.0002, + "loss": 0.5208, + "step": 3780 + }, + { + "epoch": 4.14887794198139, + "grad_norm": 1.107743263244629, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 3790 + }, + { + "epoch": 4.159824849480022, + "grad_norm": 0.7801875472068787, + "learning_rate": 0.0002, + "loss": 0.5112, + "step": 3800 + }, + { + "epoch": 4.170771756978653, + "grad_norm": 1.1328117847442627, + "learning_rate": 0.0002, + "loss": 0.4896, + "step": 3810 + }, + { + "epoch": 4.181718664477285, + "grad_norm": 1.4232193231582642, + "learning_rate": 0.0002, + "loss": 0.5645, + "step": 3820 + }, + { + "epoch": 4.192665571975917, + "grad_norm": 1.557416558265686, + "learning_rate": 0.0002, + "loss": 0.5049, + "step": 3830 + }, + { + "epoch": 4.203612479474549, + "grad_norm": 1.042923092842102, + "learning_rate": 0.0002, + "loss": 0.4863, + "step": 3840 + }, + { + "epoch": 4.21455938697318, + "grad_norm": 1.1801949739456177, + "learning_rate": 0.0002, + "loss": 0.3751, + "step": 3850 + }, + { + "epoch": 4.225506294471812, + "grad_norm": 0.9273753762245178, + "learning_rate": 0.0002, + "loss": 0.5063, + "step": 3860 + }, + { + "epoch": 4.236453201970443, + "grad_norm": 0.7681763768196106, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 3870 + }, + { + "epoch": 4.2474001094690745, + "grad_norm": 0.9840841293334961, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 3880 + }, + { + "epoch": 4.258347016967707, + "grad_norm": 1.0290725231170654, + "learning_rate": 0.0002, + "loss": 0.4648, + "step": 3890 + }, + { + "epoch": 4.269293924466338, + "grad_norm": 0.8059597611427307, + "learning_rate": 0.0002, + "loss": 0.4288, + "step": 3900 + }, + { + "epoch": 4.28024083196497, + "grad_norm": 0.9847467541694641, + "learning_rate": 0.0002, + "loss": 0.5103, + "step": 3910 + }, + { + "epoch": 4.291187739463601, + "grad_norm": 1.344044804573059, + "learning_rate": 0.0002, + "loss": 0.4952, + "step": 3920 + }, + { + "epoch": 4.302134646962233, + "grad_norm": 0.9174224138259888, + "learning_rate": 0.0002, + "loss": 0.4966, + "step": 3930 + }, + { + "epoch": 4.313081554460865, + "grad_norm": 1.1199711561203003, + "learning_rate": 0.0002, + "loss": 0.4944, + "step": 3940 + }, + { + "epoch": 4.324028461959497, + "grad_norm": 1.0120296478271484, + "learning_rate": 0.0002, + "loss": 0.4641, + "step": 3950 + }, + { + "epoch": 4.334975369458128, + "grad_norm": 1.091811180114746, + "learning_rate": 0.0002, + "loss": 0.4723, + "step": 3960 + }, + { + "epoch": 4.34592227695676, + "grad_norm": 1.0332133769989014, + "learning_rate": 0.0002, + "loss": 0.4627, + "step": 3970 + }, + { + "epoch": 4.356869184455391, + "grad_norm": 1.0785295963287354, + "learning_rate": 0.0002, + "loss": 0.4646, + "step": 3980 + }, + { + "epoch": 4.3678160919540225, + "grad_norm": 1.0506969690322876, + "learning_rate": 0.0002, + "loss": 0.4909, + "step": 3990 + }, + { + "epoch": 4.378762999452655, + "grad_norm": 1.047560691833496, + "learning_rate": 0.0002, + "loss": 0.4776, + "step": 4000 + }, + { + "epoch": 4.389709906951286, + "grad_norm": 0.9348800778388977, + "learning_rate": 0.0002, + "loss": 0.4549, + "step": 4010 + }, + { + "epoch": 4.400656814449918, + "grad_norm": 1.1563059091567993, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 4020 + }, + { + "epoch": 4.411603721948549, + "grad_norm": 1.001470923423767, + "learning_rate": 0.0002, + "loss": 0.4952, + "step": 4030 + }, + { + "epoch": 4.422550629447181, + "grad_norm": 1.309012532234192, + "learning_rate": 0.0002, + "loss": 0.4972, + "step": 4040 + }, + { + "epoch": 4.433497536945813, + "grad_norm": 0.7338925004005432, + "learning_rate": 0.0002, + "loss": 0.5078, + "step": 4050 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 1.0398834943771362, + "learning_rate": 0.0002, + "loss": 0.4632, + "step": 4060 + }, + { + "epoch": 4.455391351943076, + "grad_norm": 0.9728689193725586, + "learning_rate": 0.0002, + "loss": 0.6285, + "step": 4070 + }, + { + "epoch": 4.466338259441708, + "grad_norm": 1.247475028038025, + "learning_rate": 0.0002, + "loss": 0.4778, + "step": 4080 + }, + { + "epoch": 4.477285166940339, + "grad_norm": 1.1084578037261963, + "learning_rate": 0.0002, + "loss": 0.4813, + "step": 4090 + }, + { + "epoch": 4.4882320744389705, + "grad_norm": 1.1619318723678589, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 4100 + }, + { + "epoch": 4.499178981937603, + "grad_norm": 1.3456498384475708, + "learning_rate": 0.0002, + "loss": 0.5207, + "step": 4110 + }, + { + "epoch": 4.510125889436234, + "grad_norm": 0.9372991323471069, + "learning_rate": 0.0002, + "loss": 0.4876, + "step": 4120 + }, + { + "epoch": 4.521072796934866, + "grad_norm": 1.0071815252304077, + "learning_rate": 0.0002, + "loss": 0.5456, + "step": 4130 + }, + { + "epoch": 4.532019704433497, + "grad_norm": 1.190344214439392, + "learning_rate": 0.0002, + "loss": 0.5589, + "step": 4140 + }, + { + "epoch": 4.542966611932129, + "grad_norm": 0.9480887055397034, + "learning_rate": 0.0002, + "loss": 0.4852, + "step": 4150 + }, + { + "epoch": 4.553913519430761, + "grad_norm": 1.0252189636230469, + "learning_rate": 0.0002, + "loss": 0.5229, + "step": 4160 + }, + { + "epoch": 4.564860426929393, + "grad_norm": 0.7142013311386108, + "learning_rate": 0.0002, + "loss": 0.5253, + "step": 4170 + }, + { + "epoch": 4.575807334428024, + "grad_norm": 0.8937426805496216, + "learning_rate": 0.0002, + "loss": 0.4861, + "step": 4180 + }, + { + "epoch": 4.586754241926656, + "grad_norm": 0.8885005116462708, + "learning_rate": 0.0002, + "loss": 0.4773, + "step": 4190 + }, + { + "epoch": 4.597701149425287, + "grad_norm": 1.337663173675537, + "learning_rate": 0.0002, + "loss": 0.4858, + "step": 4200 + }, + { + "epoch": 4.6086480569239185, + "grad_norm": 1.0475375652313232, + "learning_rate": 0.0002, + "loss": 0.5247, + "step": 4210 + }, + { + "epoch": 4.619594964422551, + "grad_norm": 1.0081088542938232, + "learning_rate": 0.0002, + "loss": 0.5298, + "step": 4220 + }, + { + "epoch": 4.630541871921182, + "grad_norm": 0.7527595162391663, + "learning_rate": 0.0002, + "loss": 0.5042, + "step": 4230 + }, + { + "epoch": 4.641488779419814, + "grad_norm": 1.55559241771698, + "learning_rate": 0.0002, + "loss": 0.5207, + "step": 4240 + }, + { + "epoch": 4.652435686918445, + "grad_norm": 0.7967379689216614, + "learning_rate": 0.0002, + "loss": 0.5468, + "step": 4250 + }, + { + "epoch": 4.663382594417077, + "grad_norm": 0.898368775844574, + "learning_rate": 0.0002, + "loss": 0.5328, + "step": 4260 + }, + { + "epoch": 4.674329501915709, + "grad_norm": 1.1940776109695435, + "learning_rate": 0.0002, + "loss": 0.4706, + "step": 4270 + }, + { + "epoch": 4.685276409414341, + "grad_norm": 1.1817092895507812, + "learning_rate": 0.0002, + "loss": 0.5121, + "step": 4280 + }, + { + "epoch": 4.696223316912972, + "grad_norm": 0.9041520357131958, + "learning_rate": 0.0002, + "loss": 0.5758, + "step": 4290 + }, + { + "epoch": 4.707170224411604, + "grad_norm": 1.1280102729797363, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 4300 + }, + { + "epoch": 4.718117131910235, + "grad_norm": 1.357689619064331, + "learning_rate": 0.0002, + "loss": 0.4891, + "step": 4310 + }, + { + "epoch": 4.7290640394088665, + "grad_norm": 1.056633472442627, + "learning_rate": 0.0002, + "loss": 0.4704, + "step": 4320 + }, + { + "epoch": 4.740010946907499, + "grad_norm": 1.6520427465438843, + "learning_rate": 0.0002, + "loss": 0.5488, + "step": 4330 + }, + { + "epoch": 4.75095785440613, + "grad_norm": 1.153200626373291, + "learning_rate": 0.0002, + "loss": 0.5131, + "step": 4340 + }, + { + "epoch": 4.761904761904762, + "grad_norm": 0.9346241354942322, + "learning_rate": 0.0002, + "loss": 0.539, + "step": 4350 + }, + { + "epoch": 4.772851669403393, + "grad_norm": 0.8628455996513367, + "learning_rate": 0.0002, + "loss": 0.4941, + "step": 4360 + }, + { + "epoch": 4.783798576902025, + "grad_norm": 1.3843916654586792, + "learning_rate": 0.0002, + "loss": 0.5167, + "step": 4370 + }, + { + "epoch": 4.794745484400657, + "grad_norm": 1.035574197769165, + "learning_rate": 0.0002, + "loss": 0.4683, + "step": 4380 + }, + { + "epoch": 4.805692391899289, + "grad_norm": 1.1868361234664917, + "learning_rate": 0.0002, + "loss": 0.5162, + "step": 4390 + }, + { + "epoch": 4.81663929939792, + "grad_norm": 1.1307647228240967, + "learning_rate": 0.0002, + "loss": 0.534, + "step": 4400 + }, + { + "epoch": 4.827586206896552, + "grad_norm": 0.9787724614143372, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 4410 + }, + { + "epoch": 4.838533114395183, + "grad_norm": 1.0473824739456177, + "learning_rate": 0.0002, + "loss": 0.5185, + "step": 4420 + }, + { + "epoch": 4.8494800218938146, + "grad_norm": 1.069069504737854, + "learning_rate": 0.0002, + "loss": 0.6285, + "step": 4430 + }, + { + "epoch": 4.860426929392447, + "grad_norm": 1.4305680990219116, + "learning_rate": 0.0002, + "loss": 0.5267, + "step": 4440 + }, + { + "epoch": 4.871373836891078, + "grad_norm": 1.3679203987121582, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 4450 + }, + { + "epoch": 4.88232074438971, + "grad_norm": 0.8997844457626343, + "learning_rate": 0.0002, + "loss": 0.5135, + "step": 4460 + }, + { + "epoch": 4.893267651888341, + "grad_norm": 1.2758110761642456, + "learning_rate": 0.0002, + "loss": 0.5312, + "step": 4470 + }, + { + "epoch": 4.904214559386973, + "grad_norm": 0.8819465637207031, + "learning_rate": 0.0002, + "loss": 0.4914, + "step": 4480 + }, + { + "epoch": 4.915161466885605, + "grad_norm": 1.08329439163208, + "learning_rate": 0.0002, + "loss": 0.5147, + "step": 4490 + }, + { + "epoch": 4.926108374384237, + "grad_norm": 1.083461046218872, + "learning_rate": 0.0002, + "loss": 0.5404, + "step": 4500 + }, + { + "epoch": 4.937055281882868, + "grad_norm": 1.2387723922729492, + "learning_rate": 0.0002, + "loss": 0.5433, + "step": 4510 + }, + { + "epoch": 4.9480021893815, + "grad_norm": 0.8262293934822083, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 4520 + }, + { + "epoch": 4.958949096880131, + "grad_norm": 1.2325191497802734, + "learning_rate": 0.0002, + "loss": 0.504, + "step": 4530 + }, + { + "epoch": 4.9698960043787626, + "grad_norm": 1.024614930152893, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 4540 + }, + { + "epoch": 4.980842911877395, + "grad_norm": 1.3007521629333496, + "learning_rate": 0.0002, + "loss": 0.4752, + "step": 4550 + }, + { + "epoch": 4.991789819376026, + "grad_norm": 0.9823828339576721, + "learning_rate": 0.0002, + "loss": 0.4943, + "step": 4560 + }, + { + "epoch": 4.999452654625069, + "eval_loss": 1.3920727968215942, + "eval_runtime": 46.0764, + "eval_samples_per_second": 9.463, + "eval_steps_per_second": 1.194, + "step": 4567 + } + ], + "logging_steps": 10, + "max_steps": 7304, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.346039081566208e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..895406585c61ed7f5a6760976fc06a40a6fd8b3b --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-4567/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a158bca77bc6d58bfffd2acd2e9e641d481e1857fcbdd68fee875b74797abbc4 +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..82c133a73dac49fd8b22dae8cd0105dd3e834f26 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:813c10c32a1db7b0343c09b57fe6ff198f4d9c5e705f4c0fa06a932d007912ae +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/optimizer.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2fe267b14fe3a5b73d54b2a70f216c38551da26b --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b546deea7f03b6244165ccd337e00769d9bf8ea1b1e7828006bbffc59607b9a +size 72886650 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/rng_state.pth b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ab79c8b7dfce179b8a9c1ac2327a60afed32b6eb --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b15c5d80fbe79a987a0ec29b50d9d5373326b9de9985c638f7efcbd9ed4177ec +size 14244 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/scheduler.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c4e019fb2c3b15fc765ed001d4d756a29a9f82d --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9d21638e7e3836177ef804d71bacd5b91faa871fce0cf0ea4b7895c8599e125 +size 1064 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/trainer_state.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..996adc3e8b127f52fea698ba5b1a4699f222fd50 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/trainer_state.json @@ -0,0 +1,3917 @@ +{ + "best_metric": 1.14472496509552, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 5481, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.010946907498631636, + "grad_norm": 0.7611560821533203, + "learning_rate": 0.0002, + "loss": 2.4039, + "step": 10 + }, + { + "epoch": 0.021893814997263273, + "grad_norm": 0.4633193612098694, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 20 + }, + { + "epoch": 0.03284072249589491, + "grad_norm": 0.49326154589653015, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 30 + }, + { + "epoch": 0.043787629994526546, + "grad_norm": 0.3943138122558594, + "learning_rate": 0.0002, + "loss": 1.3859, + "step": 40 + }, + { + "epoch": 0.05473453749315818, + "grad_norm": 0.43292930722236633, + "learning_rate": 0.0002, + "loss": 1.3036, + "step": 50 + }, + { + "epoch": 0.06568144499178982, + "grad_norm": 0.3431817591190338, + "learning_rate": 0.0002, + "loss": 1.2427, + "step": 60 + }, + { + "epoch": 0.07662835249042145, + "grad_norm": 0.38011446595191956, + "learning_rate": 0.0002, + "loss": 1.3076, + "step": 70 + }, + { + "epoch": 0.08757525998905309, + "grad_norm": 0.7441071271896362, + "learning_rate": 0.0002, + "loss": 1.1641, + "step": 80 + }, + { + "epoch": 0.09852216748768473, + "grad_norm": 0.3359833061695099, + "learning_rate": 0.0002, + "loss": 1.1721, + "step": 90 + }, + { + "epoch": 0.10946907498631636, + "grad_norm": 0.3724392354488373, + "learning_rate": 0.0002, + "loss": 1.2774, + "step": 100 + }, + { + "epoch": 0.120415982484948, + "grad_norm": 0.40673762559890747, + "learning_rate": 0.0002, + "loss": 1.216, + "step": 110 + }, + { + "epoch": 0.13136288998357964, + "grad_norm": 0.40036800503730774, + "learning_rate": 0.0002, + "loss": 1.2611, + "step": 120 + }, + { + "epoch": 0.1423097974822113, + "grad_norm": 2.844191312789917, + "learning_rate": 0.0002, + "loss": 1.2436, + "step": 130 + }, + { + "epoch": 0.1532567049808429, + "grad_norm": 0.3104734420776367, + "learning_rate": 0.0002, + "loss": 1.2254, + "step": 140 + }, + { + "epoch": 0.16420361247947454, + "grad_norm": 0.3266797959804535, + "learning_rate": 0.0002, + "loss": 1.0498, + "step": 150 + }, + { + "epoch": 0.17515051997810618, + "grad_norm": 0.3079199194908142, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 160 + }, + { + "epoch": 0.18609742747673783, + "grad_norm": 0.3872479498386383, + "learning_rate": 0.0002, + "loss": 1.2923, + "step": 170 + }, + { + "epoch": 0.19704433497536947, + "grad_norm": 0.38654500246047974, + "learning_rate": 0.0002, + "loss": 1.1809, + "step": 180 + }, + { + "epoch": 0.20799124247400108, + "grad_norm": 0.2913552522659302, + "learning_rate": 0.0002, + "loss": 1.066, + "step": 190 + }, + { + "epoch": 0.21893814997263272, + "grad_norm": 0.2960572838783264, + "learning_rate": 0.0002, + "loss": 1.0868, + "step": 200 + }, + { + "epoch": 0.22988505747126436, + "grad_norm": 0.5175501108169556, + "learning_rate": 0.0002, + "loss": 1.136, + "step": 210 + }, + { + "epoch": 0.240831964969896, + "grad_norm": 1.2921574115753174, + "learning_rate": 0.0002, + "loss": 1.1556, + "step": 220 + }, + { + "epoch": 0.25177887246852765, + "grad_norm": 0.2675512135028839, + "learning_rate": 0.0002, + "loss": 1.1488, + "step": 230 + }, + { + "epoch": 0.2627257799671593, + "grad_norm": 0.3956190049648285, + "learning_rate": 0.0002, + "loss": 1.2764, + "step": 240 + }, + { + "epoch": 0.27367268746579093, + "grad_norm": 0.6022581458091736, + "learning_rate": 0.0002, + "loss": 1.1889, + "step": 250 + }, + { + "epoch": 0.2846195949644226, + "grad_norm": 1.1949563026428223, + "learning_rate": 0.0002, + "loss": 1.1981, + "step": 260 + }, + { + "epoch": 0.2955665024630542, + "grad_norm": 0.31173548102378845, + "learning_rate": 0.0002, + "loss": 1.1877, + "step": 270 + }, + { + "epoch": 0.3065134099616858, + "grad_norm": 0.2808472812175751, + "learning_rate": 0.0002, + "loss": 1.06, + "step": 280 + }, + { + "epoch": 0.31746031746031744, + "grad_norm": 0.3042023777961731, + "learning_rate": 0.0002, + "loss": 1.1752, + "step": 290 + }, + { + "epoch": 0.3284072249589491, + "grad_norm": 0.39915043115615845, + "learning_rate": 0.0002, + "loss": 1.3147, + "step": 300 + }, + { + "epoch": 0.3393541324575807, + "grad_norm": 0.39118197560310364, + "learning_rate": 0.0002, + "loss": 1.2425, + "step": 310 + }, + { + "epoch": 0.35030103995621237, + "grad_norm": 0.355010986328125, + "learning_rate": 0.0002, + "loss": 1.1363, + "step": 320 + }, + { + "epoch": 0.361247947454844, + "grad_norm": 0.29734086990356445, + "learning_rate": 0.0002, + "loss": 1.1925, + "step": 330 + }, + { + "epoch": 0.37219485495347565, + "grad_norm": 0.346096009016037, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 340 + }, + { + "epoch": 0.3831417624521073, + "grad_norm": 0.4829643666744232, + "learning_rate": 0.0002, + "loss": 1.1641, + "step": 350 + }, + { + "epoch": 0.39408866995073893, + "grad_norm": 0.4726872742176056, + "learning_rate": 0.0002, + "loss": 1.2808, + "step": 360 + }, + { + "epoch": 0.4050355774493706, + "grad_norm": 0.3130153715610504, + "learning_rate": 0.0002, + "loss": 1.1532, + "step": 370 + }, + { + "epoch": 0.41598248494800216, + "grad_norm": 0.5123590230941772, + "learning_rate": 0.0002, + "loss": 1.1842, + "step": 380 + }, + { + "epoch": 0.4269293924466338, + "grad_norm": 0.3444574773311615, + "learning_rate": 0.0002, + "loss": 1.1539, + "step": 390 + }, + { + "epoch": 0.43787629994526545, + "grad_norm": 0.5302175283432007, + "learning_rate": 0.0002, + "loss": 1.1756, + "step": 400 + }, + { + "epoch": 0.4488232074438971, + "grad_norm": 0.2713572680950165, + "learning_rate": 0.0002, + "loss": 1.1138, + "step": 410 + }, + { + "epoch": 0.45977011494252873, + "grad_norm": 0.29530611634254456, + "learning_rate": 0.0002, + "loss": 1.1281, + "step": 420 + }, + { + "epoch": 0.47071702244116037, + "grad_norm": 0.27282455563545227, + "learning_rate": 0.0002, + "loss": 1.1721, + "step": 430 + }, + { + "epoch": 0.481663929939792, + "grad_norm": 0.2647949755191803, + "learning_rate": 0.0002, + "loss": 1.1213, + "step": 440 + }, + { + "epoch": 0.49261083743842365, + "grad_norm": 0.35509347915649414, + "learning_rate": 0.0002, + "loss": 1.1656, + "step": 450 + }, + { + "epoch": 0.5035577449370553, + "grad_norm": 0.1959609091281891, + "learning_rate": 0.0002, + "loss": 1.1251, + "step": 460 + }, + { + "epoch": 0.5145046524356869, + "grad_norm": 0.40090155601501465, + "learning_rate": 0.0002, + "loss": 1.1889, + "step": 470 + }, + { + "epoch": 0.5254515599343186, + "grad_norm": 0.3354604244232178, + "learning_rate": 0.0002, + "loss": 1.156, + "step": 480 + }, + { + "epoch": 0.5363984674329502, + "grad_norm": 0.2758506238460541, + "learning_rate": 0.0002, + "loss": 1.1024, + "step": 490 + }, + { + "epoch": 0.5473453749315819, + "grad_norm": 0.3629051744937897, + "learning_rate": 0.0002, + "loss": 1.1108, + "step": 500 + }, + { + "epoch": 0.5582922824302134, + "grad_norm": 0.30802229046821594, + "learning_rate": 0.0002, + "loss": 1.2236, + "step": 510 + }, + { + "epoch": 0.5692391899288451, + "grad_norm": 0.3099463880062103, + "learning_rate": 0.0002, + "loss": 1.0424, + "step": 520 + }, + { + "epoch": 0.5801860974274767, + "grad_norm": 0.42299067974090576, + "learning_rate": 0.0002, + "loss": 1.255, + "step": 530 + }, + { + "epoch": 0.5911330049261084, + "grad_norm": 0.5392252802848816, + "learning_rate": 0.0002, + "loss": 1.1698, + "step": 540 + }, + { + "epoch": 0.60207991242474, + "grad_norm": 0.34768250584602356, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 550 + }, + { + "epoch": 0.6130268199233716, + "grad_norm": 0.28490015864372253, + "learning_rate": 0.0002, + "loss": 1.102, + "step": 560 + }, + { + "epoch": 0.6239737274220033, + "grad_norm": 0.34787994623184204, + "learning_rate": 0.0002, + "loss": 1.1111, + "step": 570 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 0.29058772325515747, + "learning_rate": 0.0002, + "loss": 1.0759, + "step": 580 + }, + { + "epoch": 0.6458675424192666, + "grad_norm": 0.4063778817653656, + "learning_rate": 0.0002, + "loss": 1.1157, + "step": 590 + }, + { + "epoch": 0.6568144499178982, + "grad_norm": 0.9244267344474792, + "learning_rate": 0.0002, + "loss": 1.1432, + "step": 600 + }, + { + "epoch": 0.6677613574165299, + "grad_norm": 0.27605190873146057, + "learning_rate": 0.0002, + "loss": 1.0591, + "step": 610 + }, + { + "epoch": 0.6787082649151615, + "grad_norm": 0.34346821904182434, + "learning_rate": 0.0002, + "loss": 1.2123, + "step": 620 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 0.3093279302120209, + "learning_rate": 0.0002, + "loss": 1.2195, + "step": 630 + }, + { + "epoch": 0.7006020799124247, + "grad_norm": 1.0069009065628052, + "learning_rate": 0.0002, + "loss": 1.2461, + "step": 640 + }, + { + "epoch": 0.7115489874110563, + "grad_norm": 0.5049130916595459, + "learning_rate": 0.0002, + "loss": 1.0892, + "step": 650 + }, + { + "epoch": 0.722495894909688, + "grad_norm": 0.3748924732208252, + "learning_rate": 0.0002, + "loss": 1.1404, + "step": 660 + }, + { + "epoch": 0.7334428024083196, + "grad_norm": 0.2964959144592285, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 670 + }, + { + "epoch": 0.7443897099069513, + "grad_norm": 0.4599986970424652, + "learning_rate": 0.0002, + "loss": 1.2617, + "step": 680 + }, + { + "epoch": 0.7553366174055829, + "grad_norm": 0.27292951941490173, + "learning_rate": 0.0002, + "loss": 1.088, + "step": 690 + }, + { + "epoch": 0.7662835249042146, + "grad_norm": 0.3123566806316376, + "learning_rate": 0.0002, + "loss": 1.2047, + "step": 700 + }, + { + "epoch": 0.7772304324028462, + "grad_norm": 0.28310710191726685, + "learning_rate": 0.0002, + "loss": 1.0021, + "step": 710 + }, + { + "epoch": 0.7881773399014779, + "grad_norm": 0.3279992341995239, + "learning_rate": 0.0002, + "loss": 1.2281, + "step": 720 + }, + { + "epoch": 0.7991242474001095, + "grad_norm": 0.28179168701171875, + "learning_rate": 0.0002, + "loss": 1.25, + "step": 730 + }, + { + "epoch": 0.8100711548987412, + "grad_norm": 0.31492987275123596, + "learning_rate": 0.0002, + "loss": 1.0602, + "step": 740 + }, + { + "epoch": 0.8210180623973727, + "grad_norm": 0.41821011900901794, + "learning_rate": 0.0002, + "loss": 1.2518, + "step": 750 + }, + { + "epoch": 0.8319649698960043, + "grad_norm": 0.325235515832901, + "learning_rate": 0.0002, + "loss": 1.1612, + "step": 760 + }, + { + "epoch": 0.842911877394636, + "grad_norm": 0.5366070866584778, + "learning_rate": 0.0002, + "loss": 1.27, + "step": 770 + }, + { + "epoch": 0.8538587848932676, + "grad_norm": 0.32570579648017883, + "learning_rate": 0.0002, + "loss": 1.0921, + "step": 780 + }, + { + "epoch": 0.8648056923918993, + "grad_norm": 0.3642968237400055, + "learning_rate": 0.0002, + "loss": 1.1032, + "step": 790 + }, + { + "epoch": 0.8757525998905309, + "grad_norm": 0.29713448882102966, + "learning_rate": 0.0002, + "loss": 1.1234, + "step": 800 + }, + { + "epoch": 0.8866995073891626, + "grad_norm": 0.23599444329738617, + "learning_rate": 0.0002, + "loss": 1.0978, + "step": 810 + }, + { + "epoch": 0.8976464148877942, + "grad_norm": 0.31522464752197266, + "learning_rate": 0.0002, + "loss": 1.1867, + "step": 820 + }, + { + "epoch": 0.9085933223864259, + "grad_norm": 0.32754790782928467, + "learning_rate": 0.0002, + "loss": 1.0208, + "step": 830 + }, + { + "epoch": 0.9195402298850575, + "grad_norm": 0.22741089761257172, + "learning_rate": 0.0002, + "loss": 0.9786, + "step": 840 + }, + { + "epoch": 0.9304871373836892, + "grad_norm": 0.3089679777622223, + "learning_rate": 0.0002, + "loss": 1.0689, + "step": 850 + }, + { + "epoch": 0.9414340448823207, + "grad_norm": 0.27440521121025085, + "learning_rate": 0.0002, + "loss": 1.0354, + "step": 860 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.3498363792896271, + "learning_rate": 0.0002, + "loss": 1.0417, + "step": 870 + }, + { + "epoch": 0.963327859879584, + "grad_norm": 0.47151854634284973, + "learning_rate": 0.0002, + "loss": 1.269, + "step": 880 + }, + { + "epoch": 0.9742747673782156, + "grad_norm": 0.24756591022014618, + "learning_rate": 0.0002, + "loss": 1.1174, + "step": 890 + }, + { + "epoch": 0.9852216748768473, + "grad_norm": 0.2600938677787781, + "learning_rate": 0.0002, + "loss": 1.0622, + "step": 900 + }, + { + "epoch": 0.9961685823754789, + "grad_norm": 0.2934586703777313, + "learning_rate": 0.0002, + "loss": 1.0968, + "step": 910 + }, + { + "epoch": 0.9994526546250684, + "eval_loss": 1.158464789390564, + "eval_runtime": 46.0774, + "eval_samples_per_second": 9.462, + "eval_steps_per_second": 1.194, + "step": 913 + }, + { + "epoch": 1.0071154898741106, + "grad_norm": 0.37776654958724976, + "learning_rate": 0.0002, + "loss": 1.0322, + "step": 920 + }, + { + "epoch": 1.0180623973727423, + "grad_norm": 0.31784629821777344, + "learning_rate": 0.0002, + "loss": 1.1346, + "step": 930 + }, + { + "epoch": 1.0290093048713738, + "grad_norm": 0.24244336783885956, + "learning_rate": 0.0002, + "loss": 0.8976, + "step": 940 + }, + { + "epoch": 1.0399562123700055, + "grad_norm": 0.3185454308986664, + "learning_rate": 0.0002, + "loss": 1.0172, + "step": 950 + }, + { + "epoch": 1.0509031198686372, + "grad_norm": 0.3589441478252411, + "learning_rate": 0.0002, + "loss": 1.1645, + "step": 960 + }, + { + "epoch": 1.0618500273672686, + "grad_norm": 0.38593578338623047, + "learning_rate": 0.0002, + "loss": 1.0317, + "step": 970 + }, + { + "epoch": 1.0727969348659003, + "grad_norm": 0.39694955945014954, + "learning_rate": 0.0002, + "loss": 0.9913, + "step": 980 + }, + { + "epoch": 1.083743842364532, + "grad_norm": 0.469817191362381, + "learning_rate": 0.0002, + "loss": 1.0893, + "step": 990 + }, + { + "epoch": 1.0946907498631637, + "grad_norm": 0.2634755074977875, + "learning_rate": 0.0002, + "loss": 0.983, + "step": 1000 + }, + { + "epoch": 1.1056376573617952, + "grad_norm": 0.43189436197280884, + "learning_rate": 0.0002, + "loss": 1.0144, + "step": 1010 + }, + { + "epoch": 1.116584564860427, + "grad_norm": 0.5559977889060974, + "learning_rate": 0.0002, + "loss": 0.9663, + "step": 1020 + }, + { + "epoch": 1.1275314723590586, + "grad_norm": 0.32100191712379456, + "learning_rate": 0.0002, + "loss": 1.0481, + "step": 1030 + }, + { + "epoch": 1.1384783798576903, + "grad_norm": 0.40179768204689026, + "learning_rate": 0.0002, + "loss": 1.1012, + "step": 1040 + }, + { + "epoch": 1.1494252873563218, + "grad_norm": 0.3659493029117584, + "learning_rate": 0.0002, + "loss": 1.0029, + "step": 1050 + }, + { + "epoch": 1.1603721948549535, + "grad_norm": 0.701704204082489, + "learning_rate": 0.0002, + "loss": 0.9597, + "step": 1060 + }, + { + "epoch": 1.1713191023535852, + "grad_norm": 0.3650563359260559, + "learning_rate": 0.0002, + "loss": 1.0204, + "step": 1070 + }, + { + "epoch": 1.1822660098522166, + "grad_norm": 0.3191976249217987, + "learning_rate": 0.0002, + "loss": 0.907, + "step": 1080 + }, + { + "epoch": 1.1932129173508483, + "grad_norm": 0.3615441918373108, + "learning_rate": 0.0002, + "loss": 1.0648, + "step": 1090 + }, + { + "epoch": 1.20415982484948, + "grad_norm": 0.39474231004714966, + "learning_rate": 0.0002, + "loss": 1.0067, + "step": 1100 + }, + { + "epoch": 1.2151067323481117, + "grad_norm": 0.3752822279930115, + "learning_rate": 0.0002, + "loss": 0.9852, + "step": 1110 + }, + { + "epoch": 1.2260536398467432, + "grad_norm": 0.4165991246700287, + "learning_rate": 0.0002, + "loss": 1.0067, + "step": 1120 + }, + { + "epoch": 1.237000547345375, + "grad_norm": 0.5326506495475769, + "learning_rate": 0.0002, + "loss": 1.0244, + "step": 1130 + }, + { + "epoch": 1.2479474548440066, + "grad_norm": 0.48845794796943665, + "learning_rate": 0.0002, + "loss": 1.0542, + "step": 1140 + }, + { + "epoch": 1.2588943623426383, + "grad_norm": 0.29910150170326233, + "learning_rate": 0.0002, + "loss": 1.0885, + "step": 1150 + }, + { + "epoch": 1.2698412698412698, + "grad_norm": 0.5069725513458252, + "learning_rate": 0.0002, + "loss": 1.2233, + "step": 1160 + }, + { + "epoch": 1.2807881773399015, + "grad_norm": 0.29500406980514526, + "learning_rate": 0.0002, + "loss": 1.0992, + "step": 1170 + }, + { + "epoch": 1.2917350848385332, + "grad_norm": 0.4711538851261139, + "learning_rate": 0.0002, + "loss": 1.0291, + "step": 1180 + }, + { + "epoch": 1.3026819923371646, + "grad_norm": 0.4203340709209442, + "learning_rate": 0.0002, + "loss": 1.0513, + "step": 1190 + }, + { + "epoch": 1.3136288998357963, + "grad_norm": 0.36101874709129333, + "learning_rate": 0.0002, + "loss": 1.1274, + "step": 1200 + }, + { + "epoch": 1.324575807334428, + "grad_norm": 0.4608800411224365, + "learning_rate": 0.0002, + "loss": 1.085, + "step": 1210 + }, + { + "epoch": 1.3355227148330597, + "grad_norm": 0.6570906639099121, + "learning_rate": 0.0002, + "loss": 1.0695, + "step": 1220 + }, + { + "epoch": 1.3464696223316914, + "grad_norm": 0.5352164506912231, + "learning_rate": 0.0002, + "loss": 0.9786, + "step": 1230 + }, + { + "epoch": 1.357416529830323, + "grad_norm": 0.3885001242160797, + "learning_rate": 0.0002, + "loss": 1.0513, + "step": 1240 + }, + { + "epoch": 1.3683634373289546, + "grad_norm": 0.2987913489341736, + "learning_rate": 0.0002, + "loss": 1.0611, + "step": 1250 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 0.42070427536964417, + "learning_rate": 0.0002, + "loss": 1.109, + "step": 1260 + }, + { + "epoch": 1.3902572523262178, + "grad_norm": 0.5957782864570618, + "learning_rate": 0.0002, + "loss": 1.1092, + "step": 1270 + }, + { + "epoch": 1.4012041598248495, + "grad_norm": 0.32898882031440735, + "learning_rate": 0.0002, + "loss": 1.0554, + "step": 1280 + }, + { + "epoch": 1.4121510673234812, + "grad_norm": 0.27624452114105225, + "learning_rate": 0.0002, + "loss": 0.909, + "step": 1290 + }, + { + "epoch": 1.4230979748221126, + "grad_norm": 0.49570828676223755, + "learning_rate": 0.0002, + "loss": 1.024, + "step": 1300 + }, + { + "epoch": 1.4340448823207443, + "grad_norm": 0.26191383600234985, + "learning_rate": 0.0002, + "loss": 1.0471, + "step": 1310 + }, + { + "epoch": 1.444991789819376, + "grad_norm": 0.35664042830467224, + "learning_rate": 0.0002, + "loss": 1.0788, + "step": 1320 + }, + { + "epoch": 1.4559386973180077, + "grad_norm": 0.45126354694366455, + "learning_rate": 0.0002, + "loss": 1.0108, + "step": 1330 + }, + { + "epoch": 1.4668856048166394, + "grad_norm": 0.37318357825279236, + "learning_rate": 0.0002, + "loss": 0.9571, + "step": 1340 + }, + { + "epoch": 1.477832512315271, + "grad_norm": 0.6428970098495483, + "learning_rate": 0.0002, + "loss": 1.0507, + "step": 1350 + }, + { + "epoch": 1.4887794198139026, + "grad_norm": 0.43256187438964844, + "learning_rate": 0.0002, + "loss": 1.0367, + "step": 1360 + }, + { + "epoch": 1.4997263273125343, + "grad_norm": 0.5343793630599976, + "learning_rate": 0.0002, + "loss": 1.1321, + "step": 1370 + }, + { + "epoch": 1.5106732348111658, + "grad_norm": 0.315437376499176, + "learning_rate": 0.0002, + "loss": 1.1054, + "step": 1380 + }, + { + "epoch": 1.5216201423097975, + "grad_norm": 0.41561153531074524, + "learning_rate": 0.0002, + "loss": 0.8916, + "step": 1390 + }, + { + "epoch": 1.5325670498084292, + "grad_norm": 0.3201070725917816, + "learning_rate": 0.0002, + "loss": 1.0391, + "step": 1400 + }, + { + "epoch": 1.5435139573070606, + "grad_norm": 0.505537211894989, + "learning_rate": 0.0002, + "loss": 1.14, + "step": 1410 + }, + { + "epoch": 1.5544608648056923, + "grad_norm": 0.3747410178184509, + "learning_rate": 0.0002, + "loss": 1.0775, + "step": 1420 + }, + { + "epoch": 1.565407772304324, + "grad_norm": 0.49385908246040344, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 1430 + }, + { + "epoch": 1.5763546798029555, + "grad_norm": 0.49831628799438477, + "learning_rate": 0.0002, + "loss": 1.1182, + "step": 1440 + }, + { + "epoch": 1.5873015873015874, + "grad_norm": 0.372127890586853, + "learning_rate": 0.0002, + "loss": 1.0079, + "step": 1450 + }, + { + "epoch": 1.598248494800219, + "grad_norm": 0.40070840716362, + "learning_rate": 0.0002, + "loss": 0.9931, + "step": 1460 + }, + { + "epoch": 1.6091954022988506, + "grad_norm": 0.34907400608062744, + "learning_rate": 0.0002, + "loss": 0.8954, + "step": 1470 + }, + { + "epoch": 1.6201423097974823, + "grad_norm": 0.4632418751716614, + "learning_rate": 0.0002, + "loss": 0.9743, + "step": 1480 + }, + { + "epoch": 1.6310892172961138, + "grad_norm": 0.40164515376091003, + "learning_rate": 0.0002, + "loss": 1.0103, + "step": 1490 + }, + { + "epoch": 1.6420361247947455, + "grad_norm": 0.3214994966983795, + "learning_rate": 0.0002, + "loss": 0.9523, + "step": 1500 + }, + { + "epoch": 1.6529830322933772, + "grad_norm": 0.3727897107601166, + "learning_rate": 0.0002, + "loss": 1.0161, + "step": 1510 + }, + { + "epoch": 1.6639299397920086, + "grad_norm": 0.3817640542984009, + "learning_rate": 0.0002, + "loss": 1.0443, + "step": 1520 + }, + { + "epoch": 1.6748768472906403, + "grad_norm": 0.5592136979103088, + "learning_rate": 0.0002, + "loss": 1.0511, + "step": 1530 + }, + { + "epoch": 1.685823754789272, + "grad_norm": 0.44636598229408264, + "learning_rate": 0.0002, + "loss": 0.9682, + "step": 1540 + }, + { + "epoch": 1.6967706622879035, + "grad_norm": 0.40441709756851196, + "learning_rate": 0.0002, + "loss": 1.033, + "step": 1550 + }, + { + "epoch": 1.7077175697865354, + "grad_norm": 0.3243522644042969, + "learning_rate": 0.0002, + "loss": 1.0857, + "step": 1560 + }, + { + "epoch": 1.718664477285167, + "grad_norm": 0.34277570247650146, + "learning_rate": 0.0002, + "loss": 0.9482, + "step": 1570 + }, + { + "epoch": 1.7296113847837986, + "grad_norm": 0.3279995024204254, + "learning_rate": 0.0002, + "loss": 0.974, + "step": 1580 + }, + { + "epoch": 1.7405582922824303, + "grad_norm": 0.41968777775764465, + "learning_rate": 0.0002, + "loss": 0.9414, + "step": 1590 + }, + { + "epoch": 1.7515051997810618, + "grad_norm": 0.39464613795280457, + "learning_rate": 0.0002, + "loss": 0.9768, + "step": 1600 + }, + { + "epoch": 1.7624521072796935, + "grad_norm": 0.3839009404182434, + "learning_rate": 0.0002, + "loss": 1.0347, + "step": 1610 + }, + { + "epoch": 1.7733990147783252, + "grad_norm": 0.3250715434551239, + "learning_rate": 0.0002, + "loss": 0.9195, + "step": 1620 + }, + { + "epoch": 1.7843459222769567, + "grad_norm": 0.5166561007499695, + "learning_rate": 0.0002, + "loss": 1.0049, + "step": 1630 + }, + { + "epoch": 1.7952928297755884, + "grad_norm": 0.4115183353424072, + "learning_rate": 0.0002, + "loss": 1.0364, + "step": 1640 + }, + { + "epoch": 1.80623973727422, + "grad_norm": 0.373780220746994, + "learning_rate": 0.0002, + "loss": 1.0248, + "step": 1650 + }, + { + "epoch": 1.8171866447728515, + "grad_norm": 0.49697014689445496, + "learning_rate": 0.0002, + "loss": 1.0984, + "step": 1660 + }, + { + "epoch": 1.8281335522714834, + "grad_norm": 1.0308938026428223, + "learning_rate": 0.0002, + "loss": 1.0089, + "step": 1670 + }, + { + "epoch": 1.839080459770115, + "grad_norm": 0.4851366877555847, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 1680 + }, + { + "epoch": 1.8500273672687466, + "grad_norm": 0.3262481391429901, + "learning_rate": 0.0002, + "loss": 0.9533, + "step": 1690 + }, + { + "epoch": 1.8609742747673783, + "grad_norm": 0.6904496550559998, + "learning_rate": 0.0002, + "loss": 1.048, + "step": 1700 + }, + { + "epoch": 1.8719211822660098, + "grad_norm": 0.49789851903915405, + "learning_rate": 0.0002, + "loss": 1.0577, + "step": 1710 + }, + { + "epoch": 1.8828680897646415, + "grad_norm": 0.3035794198513031, + "learning_rate": 0.0002, + "loss": 1.0258, + "step": 1720 + }, + { + "epoch": 1.8938149972632732, + "grad_norm": 0.4588414430618286, + "learning_rate": 0.0002, + "loss": 0.9916, + "step": 1730 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 0.4313034117221832, + "learning_rate": 0.0002, + "loss": 0.9526, + "step": 1740 + }, + { + "epoch": 1.9157088122605364, + "grad_norm": 0.38562044501304626, + "learning_rate": 0.0002, + "loss": 1.0857, + "step": 1750 + }, + { + "epoch": 1.926655719759168, + "grad_norm": 0.46947410702705383, + "learning_rate": 0.0002, + "loss": 0.9763, + "step": 1760 + }, + { + "epoch": 1.9376026272577995, + "grad_norm": 0.3848404884338379, + "learning_rate": 0.0002, + "loss": 0.9967, + "step": 1770 + }, + { + "epoch": 1.9485495347564314, + "grad_norm": 0.30422744154930115, + "learning_rate": 0.0002, + "loss": 1.0474, + "step": 1780 + }, + { + "epoch": 1.959496442255063, + "grad_norm": 0.41100990772247314, + "learning_rate": 0.0002, + "loss": 1.022, + "step": 1790 + }, + { + "epoch": 1.9704433497536946, + "grad_norm": 0.3492335081100464, + "learning_rate": 0.0002, + "loss": 1.003, + "step": 1800 + }, + { + "epoch": 1.9813902572523263, + "grad_norm": 0.364577978849411, + "learning_rate": 0.0002, + "loss": 0.9891, + "step": 1810 + }, + { + "epoch": 1.9923371647509578, + "grad_norm": 0.4312075674533844, + "learning_rate": 0.0002, + "loss": 1.0699, + "step": 1820 + }, + { + "epoch": 2.0, + "eval_loss": 1.14472496509552, + "eval_runtime": 46.0786, + "eval_samples_per_second": 9.462, + "eval_steps_per_second": 1.194, + "step": 1827 + }, + { + "epoch": 2.0032840722495897, + "grad_norm": 0.5989689230918884, + "learning_rate": 0.0002, + "loss": 1.0218, + "step": 1830 + }, + { + "epoch": 2.014230979748221, + "grad_norm": 0.49720922112464905, + "learning_rate": 0.0002, + "loss": 0.9384, + "step": 1840 + }, + { + "epoch": 2.0251778872468527, + "grad_norm": 0.42675456404685974, + "learning_rate": 0.0002, + "loss": 0.8482, + "step": 1850 + }, + { + "epoch": 2.0361247947454846, + "grad_norm": 0.4637208580970764, + "learning_rate": 0.0002, + "loss": 0.8487, + "step": 1860 + }, + { + "epoch": 2.047071702244116, + "grad_norm": 0.8329976797103882, + "learning_rate": 0.0002, + "loss": 0.8212, + "step": 1870 + }, + { + "epoch": 2.0580186097427475, + "grad_norm": 0.7869427800178528, + "learning_rate": 0.0002, + "loss": 0.9547, + "step": 1880 + }, + { + "epoch": 2.0689655172413794, + "grad_norm": 0.4927455186843872, + "learning_rate": 0.0002, + "loss": 0.9351, + "step": 1890 + }, + { + "epoch": 2.079912424740011, + "grad_norm": 0.6264246702194214, + "learning_rate": 0.0002, + "loss": 0.8652, + "step": 1900 + }, + { + "epoch": 2.0908593322386424, + "grad_norm": 1.1164122819900513, + "learning_rate": 0.0002, + "loss": 0.9095, + "step": 1910 + }, + { + "epoch": 2.1018062397372743, + "grad_norm": 0.5283981561660767, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 1920 + }, + { + "epoch": 2.112753147235906, + "grad_norm": 0.45621731877326965, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 1930 + }, + { + "epoch": 2.1237000547345373, + "grad_norm": 1.381791591644287, + "learning_rate": 0.0002, + "loss": 0.9184, + "step": 1940 + }, + { + "epoch": 2.134646962233169, + "grad_norm": 0.5151259899139404, + "learning_rate": 0.0002, + "loss": 0.9006, + "step": 1950 + }, + { + "epoch": 2.1455938697318007, + "grad_norm": 0.9806339740753174, + "learning_rate": 0.0002, + "loss": 0.8436, + "step": 1960 + }, + { + "epoch": 2.1565407772304326, + "grad_norm": 0.4734154939651489, + "learning_rate": 0.0002, + "loss": 0.8749, + "step": 1970 + }, + { + "epoch": 2.167487684729064, + "grad_norm": 0.9553168416023254, + "learning_rate": 0.0002, + "loss": 0.9172, + "step": 1980 + }, + { + "epoch": 2.1784345922276955, + "grad_norm": 0.5895838141441345, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1990 + }, + { + "epoch": 2.1893814997263275, + "grad_norm": 0.4488855302333832, + "learning_rate": 0.0002, + "loss": 0.7841, + "step": 2000 + }, + { + "epoch": 2.200328407224959, + "grad_norm": 1.0760235786437988, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 2010 + }, + { + "epoch": 2.2112753147235904, + "grad_norm": 0.5038785338401794, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 2020 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.59819495677948, + "learning_rate": 0.0002, + "loss": 0.8973, + "step": 2030 + }, + { + "epoch": 2.233169129720854, + "grad_norm": 0.5012075304985046, + "learning_rate": 0.0002, + "loss": 0.8009, + "step": 2040 + }, + { + "epoch": 2.2441160372194853, + "grad_norm": 0.44978439807891846, + "learning_rate": 0.0002, + "loss": 0.9087, + "step": 2050 + }, + { + "epoch": 2.255062944718117, + "grad_norm": 0.5350462198257446, + "learning_rate": 0.0002, + "loss": 0.823, + "step": 2060 + }, + { + "epoch": 2.2660098522167487, + "grad_norm": 0.6020669937133789, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 2070 + }, + { + "epoch": 2.2769567597153806, + "grad_norm": 0.5246821045875549, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 2080 + }, + { + "epoch": 2.287903667214012, + "grad_norm": 0.5711268782615662, + "learning_rate": 0.0002, + "loss": 0.8984, + "step": 2090 + }, + { + "epoch": 2.2988505747126435, + "grad_norm": 0.617317259311676, + "learning_rate": 0.0002, + "loss": 0.9093, + "step": 2100 + }, + { + "epoch": 2.3097974822112755, + "grad_norm": 0.8608947396278381, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 2110 + }, + { + "epoch": 2.320744389709907, + "grad_norm": 0.4739076793193817, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 2120 + }, + { + "epoch": 2.3316912972085384, + "grad_norm": 0.5538856983184814, + "learning_rate": 0.0002, + "loss": 0.84, + "step": 2130 + }, + { + "epoch": 2.3426382047071703, + "grad_norm": 0.6064935326576233, + "learning_rate": 0.0002, + "loss": 0.8994, + "step": 2140 + }, + { + "epoch": 2.353585112205802, + "grad_norm": 0.5019068121910095, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 2150 + }, + { + "epoch": 2.3645320197044333, + "grad_norm": 0.45340514183044434, + "learning_rate": 0.0002, + "loss": 0.9576, + "step": 2160 + }, + { + "epoch": 2.375478927203065, + "grad_norm": 0.7347203493118286, + "learning_rate": 0.0002, + "loss": 0.9254, + "step": 2170 + }, + { + "epoch": 2.3864258347016967, + "grad_norm": 0.46922534704208374, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 2180 + }, + { + "epoch": 2.3973727422003286, + "grad_norm": 0.5507845878601074, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 2190 + }, + { + "epoch": 2.40831964969896, + "grad_norm": 0.5621911883354187, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 2200 + }, + { + "epoch": 2.4192665571975915, + "grad_norm": 0.5023514032363892, + "learning_rate": 0.0002, + "loss": 0.8569, + "step": 2210 + }, + { + "epoch": 2.4302134646962235, + "grad_norm": 0.6124861240386963, + "learning_rate": 0.0002, + "loss": 0.8355, + "step": 2220 + }, + { + "epoch": 2.441160372194855, + "grad_norm": 0.49614205956459045, + "learning_rate": 0.0002, + "loss": 0.885, + "step": 2230 + }, + { + "epoch": 2.4521072796934864, + "grad_norm": 0.6477900743484497, + "learning_rate": 0.0002, + "loss": 0.8008, + "step": 2240 + }, + { + "epoch": 2.4630541871921183, + "grad_norm": 0.5868843793869019, + "learning_rate": 0.0002, + "loss": 0.8622, + "step": 2250 + }, + { + "epoch": 2.47400109469075, + "grad_norm": 0.4364610016345978, + "learning_rate": 0.0002, + "loss": 0.8498, + "step": 2260 + }, + { + "epoch": 2.4849480021893813, + "grad_norm": 0.5792964696884155, + "learning_rate": 0.0002, + "loss": 0.8378, + "step": 2270 + }, + { + "epoch": 2.495894909688013, + "grad_norm": 0.5421269536018372, + "learning_rate": 0.0002, + "loss": 0.8743, + "step": 2280 + }, + { + "epoch": 2.5068418171866447, + "grad_norm": 0.5525493025779724, + "learning_rate": 0.0002, + "loss": 0.9637, + "step": 2290 + }, + { + "epoch": 2.5177887246852766, + "grad_norm": 0.6463850140571594, + "learning_rate": 0.0002, + "loss": 0.8075, + "step": 2300 + }, + { + "epoch": 2.528735632183908, + "grad_norm": 0.6861311793327332, + "learning_rate": 0.0002, + "loss": 0.8591, + "step": 2310 + }, + { + "epoch": 2.5396825396825395, + "grad_norm": 0.5563555359840393, + "learning_rate": 0.0002, + "loss": 0.9287, + "step": 2320 + }, + { + "epoch": 2.5506294471811715, + "grad_norm": 0.5721169114112854, + "learning_rate": 0.0002, + "loss": 0.945, + "step": 2330 + }, + { + "epoch": 2.561576354679803, + "grad_norm": 0.5258274674415588, + "learning_rate": 0.0002, + "loss": 0.8271, + "step": 2340 + }, + { + "epoch": 2.572523262178435, + "grad_norm": 0.7057380676269531, + "learning_rate": 0.0002, + "loss": 0.8515, + "step": 2350 + }, + { + "epoch": 2.5834701696770663, + "grad_norm": 0.6869027614593506, + "learning_rate": 0.0002, + "loss": 0.8615, + "step": 2360 + }, + { + "epoch": 2.594417077175698, + "grad_norm": 0.4960809648036957, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 2370 + }, + { + "epoch": 2.6053639846743293, + "grad_norm": 0.9288380146026611, + "learning_rate": 0.0002, + "loss": 0.8476, + "step": 2380 + }, + { + "epoch": 2.616310892172961, + "grad_norm": 0.3765334188938141, + "learning_rate": 0.0002, + "loss": 0.873, + "step": 2390 + }, + { + "epoch": 2.6272577996715927, + "grad_norm": 0.7487865686416626, + "learning_rate": 0.0002, + "loss": 0.8764, + "step": 2400 + }, + { + "epoch": 2.6382047071702246, + "grad_norm": 0.6141156554222107, + "learning_rate": 0.0002, + "loss": 0.7577, + "step": 2410 + }, + { + "epoch": 2.649151614668856, + "grad_norm": 0.8420507907867432, + "learning_rate": 0.0002, + "loss": 0.8534, + "step": 2420 + }, + { + "epoch": 2.6600985221674875, + "grad_norm": 0.53386390209198, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 2430 + }, + { + "epoch": 2.6710454296661195, + "grad_norm": 0.5520607233047485, + "learning_rate": 0.0002, + "loss": 0.8486, + "step": 2440 + }, + { + "epoch": 2.681992337164751, + "grad_norm": 0.5337599515914917, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 2450 + }, + { + "epoch": 2.692939244663383, + "grad_norm": 0.48790836334228516, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 2460 + }, + { + "epoch": 2.7038861521620143, + "grad_norm": 0.8287786245346069, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 2470 + }, + { + "epoch": 2.714833059660646, + "grad_norm": 0.5876168608665466, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 2480 + }, + { + "epoch": 2.7257799671592773, + "grad_norm": 0.5206760764122009, + "learning_rate": 0.0002, + "loss": 0.8773, + "step": 2490 + }, + { + "epoch": 2.736726874657909, + "grad_norm": 0.5619136691093445, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 2500 + }, + { + "epoch": 2.7476737821565407, + "grad_norm": 0.5614883899688721, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 2510 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 0.6157700419425964, + "learning_rate": 0.0002, + "loss": 0.8817, + "step": 2520 + }, + { + "epoch": 2.769567597153804, + "grad_norm": 0.5529953837394714, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 2530 + }, + { + "epoch": 2.7805145046524355, + "grad_norm": 0.6731224060058594, + "learning_rate": 0.0002, + "loss": 0.8881, + "step": 2540 + }, + { + "epoch": 2.7914614121510675, + "grad_norm": 0.6960386633872986, + "learning_rate": 0.0002, + "loss": 0.9125, + "step": 2550 + }, + { + "epoch": 2.802408319649699, + "grad_norm": 0.5203493237495422, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 2560 + }, + { + "epoch": 2.813355227148331, + "grad_norm": 1.036837100982666, + "learning_rate": 0.0002, + "loss": 0.7951, + "step": 2570 + }, + { + "epoch": 2.8243021346469623, + "grad_norm": 0.6125805377960205, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 2580 + }, + { + "epoch": 2.835249042145594, + "grad_norm": 0.6298092603683472, + "learning_rate": 0.0002, + "loss": 0.8939, + "step": 2590 + }, + { + "epoch": 2.8461959496442253, + "grad_norm": 0.5882203578948975, + "learning_rate": 0.0002, + "loss": 0.8845, + "step": 2600 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.8619399666786194, + "learning_rate": 0.0002, + "loss": 0.8921, + "step": 2610 + }, + { + "epoch": 2.8680897646414887, + "grad_norm": 0.4722687304019928, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 2620 + }, + { + "epoch": 2.8790366721401206, + "grad_norm": 0.47399574518203735, + "learning_rate": 0.0002, + "loss": 0.9147, + "step": 2630 + }, + { + "epoch": 2.889983579638752, + "grad_norm": 0.5639172792434692, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 2640 + }, + { + "epoch": 2.9009304871373836, + "grad_norm": 0.4676816761493683, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 2650 + }, + { + "epoch": 2.9118773946360155, + "grad_norm": 0.6906291246414185, + "learning_rate": 0.0002, + "loss": 0.8966, + "step": 2660 + }, + { + "epoch": 2.922824302134647, + "grad_norm": 0.4369746148586273, + "learning_rate": 0.0002, + "loss": 0.8966, + "step": 2670 + }, + { + "epoch": 2.933771209633279, + "grad_norm": 0.46423083543777466, + "learning_rate": 0.0002, + "loss": 0.9173, + "step": 2680 + }, + { + "epoch": 2.9447181171319103, + "grad_norm": 0.5700525045394897, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 2690 + }, + { + "epoch": 2.955665024630542, + "grad_norm": 0.6221476793289185, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 2700 + }, + { + "epoch": 2.9666119321291733, + "grad_norm": 0.6102682948112488, + "learning_rate": 0.0002, + "loss": 0.8538, + "step": 2710 + }, + { + "epoch": 2.977558839627805, + "grad_norm": 0.5317878723144531, + "learning_rate": 0.0002, + "loss": 0.9779, + "step": 2720 + }, + { + "epoch": 2.9885057471264367, + "grad_norm": 0.4438510835170746, + "learning_rate": 0.0002, + "loss": 0.9314, + "step": 2730 + }, + { + "epoch": 2.9994526546250686, + "grad_norm": 0.5022130012512207, + "learning_rate": 0.0002, + "loss": 0.8553, + "step": 2740 + }, + { + "epoch": 2.9994526546250686, + "eval_loss": 1.1722838878631592, + "eval_runtime": 46.0829, + "eval_samples_per_second": 9.461, + "eval_steps_per_second": 1.194, + "step": 2740 + }, + { + "epoch": 3.0103995621237, + "grad_norm": 0.6384502053260803, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 2750 + }, + { + "epoch": 3.0213464696223316, + "grad_norm": 0.9928722381591797, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 2760 + }, + { + "epoch": 3.0322933771209635, + "grad_norm": 0.7813051342964172, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 2770 + }, + { + "epoch": 3.043240284619595, + "grad_norm": 1.0202556848526, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 2780 + }, + { + "epoch": 3.0541871921182264, + "grad_norm": 0.7581062316894531, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 2790 + }, + { + "epoch": 3.0651340996168583, + "grad_norm": 0.6252710223197937, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 2800 + }, + { + "epoch": 3.07608100711549, + "grad_norm": 0.7738662958145142, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 2810 + }, + { + "epoch": 3.0870279146141213, + "grad_norm": 0.7381885051727295, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 2820 + }, + { + "epoch": 3.097974822112753, + "grad_norm": 0.9197564721107483, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 2830 + }, + { + "epoch": 3.1089217296113847, + "grad_norm": 1.000976800918579, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 2840 + }, + { + "epoch": 3.1198686371100166, + "grad_norm": 0.7559131383895874, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 2850 + }, + { + "epoch": 3.130815544608648, + "grad_norm": 0.7213780879974365, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 2860 + }, + { + "epoch": 3.1417624521072796, + "grad_norm": 0.945939838886261, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 2870 + }, + { + "epoch": 3.1527093596059115, + "grad_norm": 0.7277454137802124, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 2880 + }, + { + "epoch": 3.163656267104543, + "grad_norm": 0.762026846408844, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 2890 + }, + { + "epoch": 3.1746031746031744, + "grad_norm": 0.6471221446990967, + "learning_rate": 0.0002, + "loss": 0.5332, + "step": 2900 + }, + { + "epoch": 3.1855500821018063, + "grad_norm": 0.6018978357315063, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 2910 + }, + { + "epoch": 3.196496989600438, + "grad_norm": 0.8607320785522461, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 2920 + }, + { + "epoch": 3.2074438970990693, + "grad_norm": 0.8854126334190369, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 2930 + }, + { + "epoch": 3.218390804597701, + "grad_norm": 0.6620870232582092, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 2940 + }, + { + "epoch": 3.2293377120963327, + "grad_norm": 0.7377511858940125, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 2950 + }, + { + "epoch": 3.2402846195949646, + "grad_norm": 0.7803301811218262, + "learning_rate": 0.0002, + "loss": 0.6798, + "step": 2960 + }, + { + "epoch": 3.251231527093596, + "grad_norm": 0.834061861038208, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 2970 + }, + { + "epoch": 3.2621784345922276, + "grad_norm": 0.8496041893959045, + "learning_rate": 0.0002, + "loss": 0.6894, + "step": 2980 + }, + { + "epoch": 3.2731253420908595, + "grad_norm": 0.7967984676361084, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 2990 + }, + { + "epoch": 3.284072249589491, + "grad_norm": 1.0207016468048096, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 3000 + }, + { + "epoch": 3.2950191570881224, + "grad_norm": 0.7855771780014038, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 3010 + }, + { + "epoch": 3.3059660645867543, + "grad_norm": 0.9427546858787537, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 3020 + }, + { + "epoch": 3.316912972085386, + "grad_norm": 0.823542594909668, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 3030 + }, + { + "epoch": 3.3278598795840173, + "grad_norm": 0.9826635122299194, + "learning_rate": 0.0002, + "loss": 0.7109, + "step": 3040 + }, + { + "epoch": 3.338806787082649, + "grad_norm": 0.7259827852249146, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 3050 + }, + { + "epoch": 3.3497536945812807, + "grad_norm": 0.7774739861488342, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 3060 + }, + { + "epoch": 3.3607006020799126, + "grad_norm": 0.7394293546676636, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 3070 + }, + { + "epoch": 3.371647509578544, + "grad_norm": 0.9017578959465027, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 3080 + }, + { + "epoch": 3.3825944170771756, + "grad_norm": 0.7451054453849792, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 3090 + }, + { + "epoch": 3.3935413245758075, + "grad_norm": 0.7321506142616272, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 3100 + }, + { + "epoch": 3.404488232074439, + "grad_norm": 0.6721828579902649, + "learning_rate": 0.0002, + "loss": 0.6555, + "step": 3110 + }, + { + "epoch": 3.4154351395730704, + "grad_norm": 0.774022102355957, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 3120 + }, + { + "epoch": 3.4263820470717024, + "grad_norm": 0.9143537282943726, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 3130 + }, + { + "epoch": 3.437328954570334, + "grad_norm": 1.226087212562561, + "learning_rate": 0.0002, + "loss": 0.6899, + "step": 3140 + }, + { + "epoch": 3.4482758620689653, + "grad_norm": 0.7545496225357056, + "learning_rate": 0.0002, + "loss": 0.6719, + "step": 3150 + }, + { + "epoch": 3.4592227695675972, + "grad_norm": 0.6515635848045349, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 3160 + }, + { + "epoch": 3.4701696770662287, + "grad_norm": 0.9297090172767639, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 3170 + }, + { + "epoch": 3.4811165845648606, + "grad_norm": 1.0130730867385864, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 3180 + }, + { + "epoch": 3.492063492063492, + "grad_norm": 0.7654589414596558, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 3190 + }, + { + "epoch": 3.5030103995621236, + "grad_norm": 0.9954977631568909, + "learning_rate": 0.0002, + "loss": 0.7401, + "step": 3200 + }, + { + "epoch": 3.5139573070607555, + "grad_norm": 0.6027487516403198, + "learning_rate": 0.0002, + "loss": 0.6661, + "step": 3210 + }, + { + "epoch": 3.524904214559387, + "grad_norm": 0.741770327091217, + "learning_rate": 0.0002, + "loss": 0.6963, + "step": 3220 + }, + { + "epoch": 3.535851122058019, + "grad_norm": 1.0534909963607788, + "learning_rate": 0.0002, + "loss": 0.8112, + "step": 3230 + }, + { + "epoch": 3.5467980295566504, + "grad_norm": 0.937772274017334, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 3240 + }, + { + "epoch": 3.557744937055282, + "grad_norm": 0.8504213690757751, + "learning_rate": 0.0002, + "loss": 0.6681, + "step": 3250 + }, + { + "epoch": 3.5686918445539133, + "grad_norm": 0.7755007147789001, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 3260 + }, + { + "epoch": 3.5796387520525452, + "grad_norm": 1.0193358659744263, + "learning_rate": 0.0002, + "loss": 0.6213, + "step": 3270 + }, + { + "epoch": 3.5905856595511767, + "grad_norm": 0.8440536856651306, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 3280 + }, + { + "epoch": 3.6015325670498086, + "grad_norm": 0.6195939183235168, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 3290 + }, + { + "epoch": 3.61247947454844, + "grad_norm": 0.8608590960502625, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 3300 + }, + { + "epoch": 3.6234263820470716, + "grad_norm": 0.6772327423095703, + "learning_rate": 0.0002, + "loss": 0.7301, + "step": 3310 + }, + { + "epoch": 3.6343732895457035, + "grad_norm": 0.8031839728355408, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 3320 + }, + { + "epoch": 3.645320197044335, + "grad_norm": 0.6080502271652222, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 3330 + }, + { + "epoch": 3.656267104542967, + "grad_norm": 0.8007240891456604, + "learning_rate": 0.0002, + "loss": 0.7431, + "step": 3340 + }, + { + "epoch": 3.6672140120415984, + "grad_norm": 0.8060704469680786, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 3350 + }, + { + "epoch": 3.67816091954023, + "grad_norm": 0.7547586560249329, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 3360 + }, + { + "epoch": 3.6891078270388613, + "grad_norm": 0.686851978302002, + "learning_rate": 0.0002, + "loss": 0.7066, + "step": 3370 + }, + { + "epoch": 3.7000547345374932, + "grad_norm": 0.9429075717926025, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 3380 + }, + { + "epoch": 3.7110016420361247, + "grad_norm": 0.7283591032028198, + "learning_rate": 0.0002, + "loss": 0.6673, + "step": 3390 + }, + { + "epoch": 3.7219485495347566, + "grad_norm": 0.8323085904121399, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 3400 + }, + { + "epoch": 3.732895457033388, + "grad_norm": 0.8529590964317322, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 3410 + }, + { + "epoch": 3.7438423645320196, + "grad_norm": 0.731752872467041, + "learning_rate": 0.0002, + "loss": 0.6555, + "step": 3420 + }, + { + "epoch": 3.7547892720306515, + "grad_norm": 0.8572278618812561, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 3430 + }, + { + "epoch": 3.765736179529283, + "grad_norm": 0.7408691048622131, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 3440 + }, + { + "epoch": 3.776683087027915, + "grad_norm": 0.7470445036888123, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 3450 + }, + { + "epoch": 3.7876299945265464, + "grad_norm": 0.6806244254112244, + "learning_rate": 0.0002, + "loss": 0.7241, + "step": 3460 + }, + { + "epoch": 3.798576902025178, + "grad_norm": 0.9129069447517395, + "learning_rate": 0.0002, + "loss": 0.7739, + "step": 3470 + }, + { + "epoch": 3.8095238095238093, + "grad_norm": 0.8717501759529114, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 3480 + }, + { + "epoch": 3.8204707170224412, + "grad_norm": 0.6761979460716248, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 3490 + }, + { + "epoch": 3.8314176245210727, + "grad_norm": 1.0054380893707275, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 3500 + }, + { + "epoch": 3.8423645320197046, + "grad_norm": 1.1224009990692139, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 3510 + }, + { + "epoch": 3.853311439518336, + "grad_norm": 0.8997692465782166, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 3520 + }, + { + "epoch": 3.8642583470169676, + "grad_norm": 1.0086902379989624, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 3530 + }, + { + "epoch": 3.8752052545155995, + "grad_norm": 0.772739589214325, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 3540 + }, + { + "epoch": 3.886152162014231, + "grad_norm": 1.211774230003357, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 3550 + }, + { + "epoch": 3.897099069512863, + "grad_norm": 0.9572356939315796, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 3560 + }, + { + "epoch": 3.9080459770114944, + "grad_norm": 0.7887842655181885, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 3570 + }, + { + "epoch": 3.918992884510126, + "grad_norm": 0.7308389544487, + "learning_rate": 0.0002, + "loss": 0.7576, + "step": 3580 + }, + { + "epoch": 3.9299397920087573, + "grad_norm": 1.0182650089263916, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 3590 + }, + { + "epoch": 3.9408866995073892, + "grad_norm": 0.8000147342681885, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 3600 + }, + { + "epoch": 3.9518336070060207, + "grad_norm": 0.7385728359222412, + "learning_rate": 0.0002, + "loss": 0.6244, + "step": 3610 + }, + { + "epoch": 3.9627805145046526, + "grad_norm": 0.9233261942863464, + "learning_rate": 0.0002, + "loss": 0.6718, + "step": 3620 + }, + { + "epoch": 3.973727422003284, + "grad_norm": 0.8486751914024353, + "learning_rate": 0.0002, + "loss": 0.6508, + "step": 3630 + }, + { + "epoch": 3.9846743295019156, + "grad_norm": 0.7593663334846497, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 3640 + }, + { + "epoch": 3.9956212370005475, + "grad_norm": 0.7885415554046631, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 3650 + }, + { + "epoch": 4.0, + "eval_loss": 1.250312328338623, + "eval_runtime": 46.0842, + "eval_samples_per_second": 9.461, + "eval_steps_per_second": 1.193, + "step": 3654 + }, + { + "epoch": 4.006568144499179, + "grad_norm": 0.6591703295707703, + "learning_rate": 0.0002, + "loss": 0.5547, + "step": 3660 + }, + { + "epoch": 4.017515051997811, + "grad_norm": 1.36927330493927, + "learning_rate": 0.0002, + "loss": 0.5301, + "step": 3670 + }, + { + "epoch": 4.028461959496442, + "grad_norm": 0.8106328845024109, + "learning_rate": 0.0002, + "loss": 0.4466, + "step": 3680 + }, + { + "epoch": 4.039408866995074, + "grad_norm": 0.7592712044715881, + "learning_rate": 0.0002, + "loss": 0.4861, + "step": 3690 + }, + { + "epoch": 4.050355774493705, + "grad_norm": 0.9518909454345703, + "learning_rate": 0.0002, + "loss": 0.5103, + "step": 3700 + }, + { + "epoch": 4.061302681992337, + "grad_norm": 0.7805967330932617, + "learning_rate": 0.0002, + "loss": 0.4638, + "step": 3710 + }, + { + "epoch": 4.072249589490969, + "grad_norm": 1.3146334886550903, + "learning_rate": 0.0002, + "loss": 0.4556, + "step": 3720 + }, + { + "epoch": 4.083196496989601, + "grad_norm": 1.1611138582229614, + "learning_rate": 0.0002, + "loss": 0.5635, + "step": 3730 + }, + { + "epoch": 4.094143404488232, + "grad_norm": 0.8173232078552246, + "learning_rate": 0.0002, + "loss": 0.3845, + "step": 3740 + }, + { + "epoch": 4.105090311986864, + "grad_norm": 0.7848323583602905, + "learning_rate": 0.0002, + "loss": 0.4911, + "step": 3750 + }, + { + "epoch": 4.116037219485495, + "grad_norm": 1.3183201551437378, + "learning_rate": 0.0002, + "loss": 0.4519, + "step": 3760 + }, + { + "epoch": 4.1269841269841265, + "grad_norm": 1.1936529874801636, + "learning_rate": 0.0002, + "loss": 0.5083, + "step": 3770 + }, + { + "epoch": 4.137931034482759, + "grad_norm": 1.1078993082046509, + "learning_rate": 0.0002, + "loss": 0.5208, + "step": 3780 + }, + { + "epoch": 4.14887794198139, + "grad_norm": 1.107743263244629, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 3790 + }, + { + "epoch": 4.159824849480022, + "grad_norm": 0.7801875472068787, + "learning_rate": 0.0002, + "loss": 0.5112, + "step": 3800 + }, + { + "epoch": 4.170771756978653, + "grad_norm": 1.1328117847442627, + "learning_rate": 0.0002, + "loss": 0.4896, + "step": 3810 + }, + { + "epoch": 4.181718664477285, + "grad_norm": 1.4232193231582642, + "learning_rate": 0.0002, + "loss": 0.5645, + "step": 3820 + }, + { + "epoch": 4.192665571975917, + "grad_norm": 1.557416558265686, + "learning_rate": 0.0002, + "loss": 0.5049, + "step": 3830 + }, + { + "epoch": 4.203612479474549, + "grad_norm": 1.042923092842102, + "learning_rate": 0.0002, + "loss": 0.4863, + "step": 3840 + }, + { + "epoch": 4.21455938697318, + "grad_norm": 1.1801949739456177, + "learning_rate": 0.0002, + "loss": 0.3751, + "step": 3850 + }, + { + "epoch": 4.225506294471812, + "grad_norm": 0.9273753762245178, + "learning_rate": 0.0002, + "loss": 0.5063, + "step": 3860 + }, + { + "epoch": 4.236453201970443, + "grad_norm": 0.7681763768196106, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 3870 + }, + { + "epoch": 4.2474001094690745, + "grad_norm": 0.9840841293334961, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 3880 + }, + { + "epoch": 4.258347016967707, + "grad_norm": 1.0290725231170654, + "learning_rate": 0.0002, + "loss": 0.4648, + "step": 3890 + }, + { + "epoch": 4.269293924466338, + "grad_norm": 0.8059597611427307, + "learning_rate": 0.0002, + "loss": 0.4288, + "step": 3900 + }, + { + "epoch": 4.28024083196497, + "grad_norm": 0.9847467541694641, + "learning_rate": 0.0002, + "loss": 0.5103, + "step": 3910 + }, + { + "epoch": 4.291187739463601, + "grad_norm": 1.344044804573059, + "learning_rate": 0.0002, + "loss": 0.4952, + "step": 3920 + }, + { + "epoch": 4.302134646962233, + "grad_norm": 0.9174224138259888, + "learning_rate": 0.0002, + "loss": 0.4966, + "step": 3930 + }, + { + "epoch": 4.313081554460865, + "grad_norm": 1.1199711561203003, + "learning_rate": 0.0002, + "loss": 0.4944, + "step": 3940 + }, + { + "epoch": 4.324028461959497, + "grad_norm": 1.0120296478271484, + "learning_rate": 0.0002, + "loss": 0.4641, + "step": 3950 + }, + { + "epoch": 4.334975369458128, + "grad_norm": 1.091811180114746, + "learning_rate": 0.0002, + "loss": 0.4723, + "step": 3960 + }, + { + "epoch": 4.34592227695676, + "grad_norm": 1.0332133769989014, + "learning_rate": 0.0002, + "loss": 0.4627, + "step": 3970 + }, + { + "epoch": 4.356869184455391, + "grad_norm": 1.0785295963287354, + "learning_rate": 0.0002, + "loss": 0.4646, + "step": 3980 + }, + { + "epoch": 4.3678160919540225, + "grad_norm": 1.0506969690322876, + "learning_rate": 0.0002, + "loss": 0.4909, + "step": 3990 + }, + { + "epoch": 4.378762999452655, + "grad_norm": 1.047560691833496, + "learning_rate": 0.0002, + "loss": 0.4776, + "step": 4000 + }, + { + "epoch": 4.389709906951286, + "grad_norm": 0.9348800778388977, + "learning_rate": 0.0002, + "loss": 0.4549, + "step": 4010 + }, + { + "epoch": 4.400656814449918, + "grad_norm": 1.1563059091567993, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 4020 + }, + { + "epoch": 4.411603721948549, + "grad_norm": 1.001470923423767, + "learning_rate": 0.0002, + "loss": 0.4952, + "step": 4030 + }, + { + "epoch": 4.422550629447181, + "grad_norm": 1.309012532234192, + "learning_rate": 0.0002, + "loss": 0.4972, + "step": 4040 + }, + { + "epoch": 4.433497536945813, + "grad_norm": 0.7338925004005432, + "learning_rate": 0.0002, + "loss": 0.5078, + "step": 4050 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 1.0398834943771362, + "learning_rate": 0.0002, + "loss": 0.4632, + "step": 4060 + }, + { + "epoch": 4.455391351943076, + "grad_norm": 0.9728689193725586, + "learning_rate": 0.0002, + "loss": 0.6285, + "step": 4070 + }, + { + "epoch": 4.466338259441708, + "grad_norm": 1.247475028038025, + "learning_rate": 0.0002, + "loss": 0.4778, + "step": 4080 + }, + { + "epoch": 4.477285166940339, + "grad_norm": 1.1084578037261963, + "learning_rate": 0.0002, + "loss": 0.4813, + "step": 4090 + }, + { + "epoch": 4.4882320744389705, + "grad_norm": 1.1619318723678589, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 4100 + }, + { + "epoch": 4.499178981937603, + "grad_norm": 1.3456498384475708, + "learning_rate": 0.0002, + "loss": 0.5207, + "step": 4110 + }, + { + "epoch": 4.510125889436234, + "grad_norm": 0.9372991323471069, + "learning_rate": 0.0002, + "loss": 0.4876, + "step": 4120 + }, + { + "epoch": 4.521072796934866, + "grad_norm": 1.0071815252304077, + "learning_rate": 0.0002, + "loss": 0.5456, + "step": 4130 + }, + { + "epoch": 4.532019704433497, + "grad_norm": 1.190344214439392, + "learning_rate": 0.0002, + "loss": 0.5589, + "step": 4140 + }, + { + "epoch": 4.542966611932129, + "grad_norm": 0.9480887055397034, + "learning_rate": 0.0002, + "loss": 0.4852, + "step": 4150 + }, + { + "epoch": 4.553913519430761, + "grad_norm": 1.0252189636230469, + "learning_rate": 0.0002, + "loss": 0.5229, + "step": 4160 + }, + { + "epoch": 4.564860426929393, + "grad_norm": 0.7142013311386108, + "learning_rate": 0.0002, + "loss": 0.5253, + "step": 4170 + }, + { + "epoch": 4.575807334428024, + "grad_norm": 0.8937426805496216, + "learning_rate": 0.0002, + "loss": 0.4861, + "step": 4180 + }, + { + "epoch": 4.586754241926656, + "grad_norm": 0.8885005116462708, + "learning_rate": 0.0002, + "loss": 0.4773, + "step": 4190 + }, + { + "epoch": 4.597701149425287, + "grad_norm": 1.337663173675537, + "learning_rate": 0.0002, + "loss": 0.4858, + "step": 4200 + }, + { + "epoch": 4.6086480569239185, + "grad_norm": 1.0475375652313232, + "learning_rate": 0.0002, + "loss": 0.5247, + "step": 4210 + }, + { + "epoch": 4.619594964422551, + "grad_norm": 1.0081088542938232, + "learning_rate": 0.0002, + "loss": 0.5298, + "step": 4220 + }, + { + "epoch": 4.630541871921182, + "grad_norm": 0.7527595162391663, + "learning_rate": 0.0002, + "loss": 0.5042, + "step": 4230 + }, + { + "epoch": 4.641488779419814, + "grad_norm": 1.55559241771698, + "learning_rate": 0.0002, + "loss": 0.5207, + "step": 4240 + }, + { + "epoch": 4.652435686918445, + "grad_norm": 0.7967379689216614, + "learning_rate": 0.0002, + "loss": 0.5468, + "step": 4250 + }, + { + "epoch": 4.663382594417077, + "grad_norm": 0.898368775844574, + "learning_rate": 0.0002, + "loss": 0.5328, + "step": 4260 + }, + { + "epoch": 4.674329501915709, + "grad_norm": 1.1940776109695435, + "learning_rate": 0.0002, + "loss": 0.4706, + "step": 4270 + }, + { + "epoch": 4.685276409414341, + "grad_norm": 1.1817092895507812, + "learning_rate": 0.0002, + "loss": 0.5121, + "step": 4280 + }, + { + "epoch": 4.696223316912972, + "grad_norm": 0.9041520357131958, + "learning_rate": 0.0002, + "loss": 0.5758, + "step": 4290 + }, + { + "epoch": 4.707170224411604, + "grad_norm": 1.1280102729797363, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 4300 + }, + { + "epoch": 4.718117131910235, + "grad_norm": 1.357689619064331, + "learning_rate": 0.0002, + "loss": 0.4891, + "step": 4310 + }, + { + "epoch": 4.7290640394088665, + "grad_norm": 1.056633472442627, + "learning_rate": 0.0002, + "loss": 0.4704, + "step": 4320 + }, + { + "epoch": 4.740010946907499, + "grad_norm": 1.6520427465438843, + "learning_rate": 0.0002, + "loss": 0.5488, + "step": 4330 + }, + { + "epoch": 4.75095785440613, + "grad_norm": 1.153200626373291, + "learning_rate": 0.0002, + "loss": 0.5131, + "step": 4340 + }, + { + "epoch": 4.761904761904762, + "grad_norm": 0.9346241354942322, + "learning_rate": 0.0002, + "loss": 0.539, + "step": 4350 + }, + { + "epoch": 4.772851669403393, + "grad_norm": 0.8628455996513367, + "learning_rate": 0.0002, + "loss": 0.4941, + "step": 4360 + }, + { + "epoch": 4.783798576902025, + "grad_norm": 1.3843916654586792, + "learning_rate": 0.0002, + "loss": 0.5167, + "step": 4370 + }, + { + "epoch": 4.794745484400657, + "grad_norm": 1.035574197769165, + "learning_rate": 0.0002, + "loss": 0.4683, + "step": 4380 + }, + { + "epoch": 4.805692391899289, + "grad_norm": 1.1868361234664917, + "learning_rate": 0.0002, + "loss": 0.5162, + "step": 4390 + }, + { + "epoch": 4.81663929939792, + "grad_norm": 1.1307647228240967, + "learning_rate": 0.0002, + "loss": 0.534, + "step": 4400 + }, + { + "epoch": 4.827586206896552, + "grad_norm": 0.9787724614143372, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 4410 + }, + { + "epoch": 4.838533114395183, + "grad_norm": 1.0473824739456177, + "learning_rate": 0.0002, + "loss": 0.5185, + "step": 4420 + }, + { + "epoch": 4.8494800218938146, + "grad_norm": 1.069069504737854, + "learning_rate": 0.0002, + "loss": 0.6285, + "step": 4430 + }, + { + "epoch": 4.860426929392447, + "grad_norm": 1.4305680990219116, + "learning_rate": 0.0002, + "loss": 0.5267, + "step": 4440 + }, + { + "epoch": 4.871373836891078, + "grad_norm": 1.3679203987121582, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 4450 + }, + { + "epoch": 4.88232074438971, + "grad_norm": 0.8997844457626343, + "learning_rate": 0.0002, + "loss": 0.5135, + "step": 4460 + }, + { + "epoch": 4.893267651888341, + "grad_norm": 1.2758110761642456, + "learning_rate": 0.0002, + "loss": 0.5312, + "step": 4470 + }, + { + "epoch": 4.904214559386973, + "grad_norm": 0.8819465637207031, + "learning_rate": 0.0002, + "loss": 0.4914, + "step": 4480 + }, + { + "epoch": 4.915161466885605, + "grad_norm": 1.08329439163208, + "learning_rate": 0.0002, + "loss": 0.5147, + "step": 4490 + }, + { + "epoch": 4.926108374384237, + "grad_norm": 1.083461046218872, + "learning_rate": 0.0002, + "loss": 0.5404, + "step": 4500 + }, + { + "epoch": 4.937055281882868, + "grad_norm": 1.2387723922729492, + "learning_rate": 0.0002, + "loss": 0.5433, + "step": 4510 + }, + { + "epoch": 4.9480021893815, + "grad_norm": 0.8262293934822083, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 4520 + }, + { + "epoch": 4.958949096880131, + "grad_norm": 1.2325191497802734, + "learning_rate": 0.0002, + "loss": 0.504, + "step": 4530 + }, + { + "epoch": 4.9698960043787626, + "grad_norm": 1.024614930152893, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 4540 + }, + { + "epoch": 4.980842911877395, + "grad_norm": 1.3007521629333496, + "learning_rate": 0.0002, + "loss": 0.4752, + "step": 4550 + }, + { + "epoch": 4.991789819376026, + "grad_norm": 0.9823828339576721, + "learning_rate": 0.0002, + "loss": 0.4943, + "step": 4560 + }, + { + "epoch": 4.999452654625069, + "eval_loss": 1.3920727968215942, + "eval_runtime": 46.0764, + "eval_samples_per_second": 9.463, + "eval_steps_per_second": 1.194, + "step": 4567 + }, + { + "epoch": 5.002736726874658, + "grad_norm": 1.1478906869888306, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 4570 + }, + { + "epoch": 5.013683634373289, + "grad_norm": 1.0533705949783325, + "learning_rate": 0.0002, + "loss": 0.372, + "step": 4580 + }, + { + "epoch": 5.024630541871921, + "grad_norm": 1.268900752067566, + "learning_rate": 0.0002, + "loss": 0.3313, + "step": 4590 + }, + { + "epoch": 5.035577449370553, + "grad_norm": 1.222652554512024, + "learning_rate": 0.0002, + "loss": 0.3482, + "step": 4600 + }, + { + "epoch": 5.046524356869185, + "grad_norm": 1.5093127489089966, + "learning_rate": 0.0002, + "loss": 0.3195, + "step": 4610 + }, + { + "epoch": 5.057471264367816, + "grad_norm": 1.2372499704360962, + "learning_rate": 0.0002, + "loss": 0.3569, + "step": 4620 + }, + { + "epoch": 5.068418171866448, + "grad_norm": 0.8422666192054749, + "learning_rate": 0.0002, + "loss": 0.3206, + "step": 4630 + }, + { + "epoch": 5.079365079365079, + "grad_norm": 1.1451770067214966, + "learning_rate": 0.0002, + "loss": 0.3115, + "step": 4640 + }, + { + "epoch": 5.090311986863711, + "grad_norm": 1.2074557542800903, + "learning_rate": 0.0002, + "loss": 0.3305, + "step": 4650 + }, + { + "epoch": 5.101258894362343, + "grad_norm": 1.429150104522705, + "learning_rate": 0.0002, + "loss": 0.3012, + "step": 4660 + }, + { + "epoch": 5.112205801860974, + "grad_norm": 1.0353610515594482, + "learning_rate": 0.0002, + "loss": 0.3229, + "step": 4670 + }, + { + "epoch": 5.123152709359606, + "grad_norm": 1.2845979928970337, + "learning_rate": 0.0002, + "loss": 0.402, + "step": 4680 + }, + { + "epoch": 5.134099616858237, + "grad_norm": 1.3790186643600464, + "learning_rate": 0.0002, + "loss": 0.383, + "step": 4690 + }, + { + "epoch": 5.145046524356869, + "grad_norm": 1.3182239532470703, + "learning_rate": 0.0002, + "loss": 0.2951, + "step": 4700 + }, + { + "epoch": 5.155993431855501, + "grad_norm": 1.5249626636505127, + "learning_rate": 0.0002, + "loss": 0.4074, + "step": 4710 + }, + { + "epoch": 5.166940339354133, + "grad_norm": 1.2492733001708984, + "learning_rate": 0.0002, + "loss": 0.3703, + "step": 4720 + }, + { + "epoch": 5.177887246852764, + "grad_norm": 1.4455480575561523, + "learning_rate": 0.0002, + "loss": 0.3411, + "step": 4730 + }, + { + "epoch": 5.188834154351396, + "grad_norm": 1.2191482782363892, + "learning_rate": 0.0002, + "loss": 0.3996, + "step": 4740 + }, + { + "epoch": 5.199781061850027, + "grad_norm": 1.4707951545715332, + "learning_rate": 0.0002, + "loss": 0.3785, + "step": 4750 + }, + { + "epoch": 5.210727969348659, + "grad_norm": 1.3473678827285767, + "learning_rate": 0.0002, + "loss": 0.3516, + "step": 4760 + }, + { + "epoch": 5.221674876847291, + "grad_norm": 1.0479670763015747, + "learning_rate": 0.0002, + "loss": 0.3266, + "step": 4770 + }, + { + "epoch": 5.232621784345922, + "grad_norm": 1.299096703529358, + "learning_rate": 0.0002, + "loss": 0.3976, + "step": 4780 + }, + { + "epoch": 5.243568691844554, + "grad_norm": 1.2820168733596802, + "learning_rate": 0.0002, + "loss": 0.3266, + "step": 4790 + }, + { + "epoch": 5.254515599343185, + "grad_norm": 1.3818004131317139, + "learning_rate": 0.0002, + "loss": 0.3347, + "step": 4800 + }, + { + "epoch": 5.265462506841817, + "grad_norm": 1.2898736000061035, + "learning_rate": 0.0002, + "loss": 0.3761, + "step": 4810 + }, + { + "epoch": 5.276409414340449, + "grad_norm": 1.1761468648910522, + "learning_rate": 0.0002, + "loss": 0.3694, + "step": 4820 + }, + { + "epoch": 5.287356321839081, + "grad_norm": 1.7155952453613281, + "learning_rate": 0.0002, + "loss": 0.3806, + "step": 4830 + }, + { + "epoch": 5.298303229337712, + "grad_norm": 0.9103642106056213, + "learning_rate": 0.0002, + "loss": 0.322, + "step": 4840 + }, + { + "epoch": 5.309250136836344, + "grad_norm": 1.013015627861023, + "learning_rate": 0.0002, + "loss": 0.3516, + "step": 4850 + }, + { + "epoch": 5.320197044334975, + "grad_norm": 1.390471339225769, + "learning_rate": 0.0002, + "loss": 0.4297, + "step": 4860 + }, + { + "epoch": 5.331143951833607, + "grad_norm": 1.129770278930664, + "learning_rate": 0.0002, + "loss": 0.4098, + "step": 4870 + }, + { + "epoch": 5.342090859332239, + "grad_norm": 1.1461067199707031, + "learning_rate": 0.0002, + "loss": 0.4227, + "step": 4880 + }, + { + "epoch": 5.35303776683087, + "grad_norm": 1.3587424755096436, + "learning_rate": 0.0002, + "loss": 0.288, + "step": 4890 + }, + { + "epoch": 5.363984674329502, + "grad_norm": 1.6897879838943481, + "learning_rate": 0.0002, + "loss": 0.3604, + "step": 4900 + }, + { + "epoch": 5.374931581828133, + "grad_norm": 0.9298055768013, + "learning_rate": 0.0002, + "loss": 0.3887, + "step": 4910 + }, + { + "epoch": 5.385878489326765, + "grad_norm": 1.0006917715072632, + "learning_rate": 0.0002, + "loss": 0.3371, + "step": 4920 + }, + { + "epoch": 5.396825396825397, + "grad_norm": 1.232581377029419, + "learning_rate": 0.0002, + "loss": 0.3992, + "step": 4930 + }, + { + "epoch": 5.407772304324029, + "grad_norm": 1.0822620391845703, + "learning_rate": 0.0002, + "loss": 0.3456, + "step": 4940 + }, + { + "epoch": 5.41871921182266, + "grad_norm": 1.3648720979690552, + "learning_rate": 0.0002, + "loss": 0.3806, + "step": 4950 + }, + { + "epoch": 5.429666119321292, + "grad_norm": 1.3220354318618774, + "learning_rate": 0.0002, + "loss": 0.3959, + "step": 4960 + }, + { + "epoch": 5.440613026819923, + "grad_norm": 1.1106271743774414, + "learning_rate": 0.0002, + "loss": 0.3278, + "step": 4970 + }, + { + "epoch": 5.451559934318555, + "grad_norm": 1.6058908700942993, + "learning_rate": 0.0002, + "loss": 0.3812, + "step": 4980 + }, + { + "epoch": 5.462506841817187, + "grad_norm": 1.1065930128097534, + "learning_rate": 0.0002, + "loss": 0.3905, + "step": 4990 + }, + { + "epoch": 5.473453749315818, + "grad_norm": 1.3896466493606567, + "learning_rate": 0.0002, + "loss": 0.4058, + "step": 5000 + }, + { + "epoch": 5.48440065681445, + "grad_norm": 1.0437148809432983, + "learning_rate": 0.0002, + "loss": 0.4122, + "step": 5010 + }, + { + "epoch": 5.495347564313081, + "grad_norm": 1.2347718477249146, + "learning_rate": 0.0002, + "loss": 0.4065, + "step": 5020 + }, + { + "epoch": 5.506294471811713, + "grad_norm": 1.1174284219741821, + "learning_rate": 0.0002, + "loss": 0.3586, + "step": 5030 + }, + { + "epoch": 5.517241379310345, + "grad_norm": 1.2580941915512085, + "learning_rate": 0.0002, + "loss": 0.3576, + "step": 5040 + }, + { + "epoch": 5.528188286808977, + "grad_norm": 1.451090931892395, + "learning_rate": 0.0002, + "loss": 0.3809, + "step": 5050 + }, + { + "epoch": 5.539135194307608, + "grad_norm": 1.4688365459442139, + "learning_rate": 0.0002, + "loss": 0.3645, + "step": 5060 + }, + { + "epoch": 5.55008210180624, + "grad_norm": 1.1625734567642212, + "learning_rate": 0.0002, + "loss": 0.4431, + "step": 5070 + }, + { + "epoch": 5.561029009304871, + "grad_norm": 0.9332265257835388, + "learning_rate": 0.0002, + "loss": 0.3972, + "step": 5080 + }, + { + "epoch": 5.571975916803503, + "grad_norm": 1.5635273456573486, + "learning_rate": 0.0002, + "loss": 0.4, + "step": 5090 + }, + { + "epoch": 5.582922824302135, + "grad_norm": 1.3420509099960327, + "learning_rate": 0.0002, + "loss": 0.3651, + "step": 5100 + }, + { + "epoch": 5.593869731800766, + "grad_norm": 1.5826557874679565, + "learning_rate": 0.0002, + "loss": 0.3717, + "step": 5110 + }, + { + "epoch": 5.604816639299398, + "grad_norm": 1.5737065076828003, + "learning_rate": 0.0002, + "loss": 0.4256, + "step": 5120 + }, + { + "epoch": 5.615763546798029, + "grad_norm": 1.3812499046325684, + "learning_rate": 0.0002, + "loss": 0.39, + "step": 5130 + }, + { + "epoch": 5.626710454296661, + "grad_norm": 1.362833023071289, + "learning_rate": 0.0002, + "loss": 0.3891, + "step": 5140 + }, + { + "epoch": 5.637657361795293, + "grad_norm": 1.7667874097824097, + "learning_rate": 0.0002, + "loss": 0.455, + "step": 5150 + }, + { + "epoch": 5.648604269293925, + "grad_norm": 1.2661789655685425, + "learning_rate": 0.0002, + "loss": 0.4264, + "step": 5160 + }, + { + "epoch": 5.659551176792556, + "grad_norm": 1.2076870203018188, + "learning_rate": 0.0002, + "loss": 0.3261, + "step": 5170 + }, + { + "epoch": 5.670498084291188, + "grad_norm": 1.2431524991989136, + "learning_rate": 0.0002, + "loss": 0.372, + "step": 5180 + }, + { + "epoch": 5.681444991789819, + "grad_norm": 1.2216639518737793, + "learning_rate": 0.0002, + "loss": 0.4092, + "step": 5190 + }, + { + "epoch": 5.692391899288451, + "grad_norm": 0.9259352684020996, + "learning_rate": 0.0002, + "loss": 0.4171, + "step": 5200 + }, + { + "epoch": 5.703338806787083, + "grad_norm": 1.7929338216781616, + "learning_rate": 0.0002, + "loss": 0.3875, + "step": 5210 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 1.4048460721969604, + "learning_rate": 0.0002, + "loss": 0.4424, + "step": 5220 + }, + { + "epoch": 5.725232621784346, + "grad_norm": 1.306874394416809, + "learning_rate": 0.0002, + "loss": 0.3758, + "step": 5230 + }, + { + "epoch": 5.736179529282977, + "grad_norm": 1.3137940168380737, + "learning_rate": 0.0002, + "loss": 0.3889, + "step": 5240 + }, + { + "epoch": 5.747126436781609, + "grad_norm": 1.1376476287841797, + "learning_rate": 0.0002, + "loss": 0.4804, + "step": 5250 + }, + { + "epoch": 5.758073344280241, + "grad_norm": 1.450939416885376, + "learning_rate": 0.0002, + "loss": 0.377, + "step": 5260 + }, + { + "epoch": 5.769020251778873, + "grad_norm": 0.983195960521698, + "learning_rate": 0.0002, + "loss": 0.4732, + "step": 5270 + }, + { + "epoch": 5.779967159277504, + "grad_norm": 1.66558837890625, + "learning_rate": 0.0002, + "loss": 0.4041, + "step": 5280 + }, + { + "epoch": 5.790914066776136, + "grad_norm": 0.9789204597473145, + "learning_rate": 0.0002, + "loss": 0.3643, + "step": 5290 + }, + { + "epoch": 5.801860974274767, + "grad_norm": 1.2110556364059448, + "learning_rate": 0.0002, + "loss": 0.3776, + "step": 5300 + }, + { + "epoch": 5.812807881773399, + "grad_norm": 1.3799304962158203, + "learning_rate": 0.0002, + "loss": 0.4049, + "step": 5310 + }, + { + "epoch": 5.823754789272031, + "grad_norm": 1.0570626258850098, + "learning_rate": 0.0002, + "loss": 0.4362, + "step": 5320 + }, + { + "epoch": 5.834701696770662, + "grad_norm": 1.4654436111450195, + "learning_rate": 0.0002, + "loss": 0.4716, + "step": 5330 + }, + { + "epoch": 5.845648604269294, + "grad_norm": 1.5216940641403198, + "learning_rate": 0.0002, + "loss": 0.4048, + "step": 5340 + }, + { + "epoch": 5.856595511767925, + "grad_norm": 1.018646001815796, + "learning_rate": 0.0002, + "loss": 0.3848, + "step": 5350 + }, + { + "epoch": 5.867542419266557, + "grad_norm": 1.028951644897461, + "learning_rate": 0.0002, + "loss": 0.3705, + "step": 5360 + }, + { + "epoch": 5.878489326765189, + "grad_norm": 2.571263313293457, + "learning_rate": 0.0002, + "loss": 0.4213, + "step": 5370 + }, + { + "epoch": 5.889436234263821, + "grad_norm": 1.3323984146118164, + "learning_rate": 0.0002, + "loss": 0.3647, + "step": 5380 + }, + { + "epoch": 5.900383141762452, + "grad_norm": 1.4317777156829834, + "learning_rate": 0.0002, + "loss": 0.4085, + "step": 5390 + }, + { + "epoch": 5.911330049261084, + "grad_norm": 1.4289140701293945, + "learning_rate": 0.0002, + "loss": 0.4254, + "step": 5400 + }, + { + "epoch": 5.922276956759715, + "grad_norm": 1.3130780458450317, + "learning_rate": 0.0002, + "loss": 0.3993, + "step": 5410 + }, + { + "epoch": 5.933223864258347, + "grad_norm": 1.3979902267456055, + "learning_rate": 0.0002, + "loss": 0.4025, + "step": 5420 + }, + { + "epoch": 5.944170771756979, + "grad_norm": 1.1827352046966553, + "learning_rate": 0.0002, + "loss": 0.3997, + "step": 5430 + }, + { + "epoch": 5.95511767925561, + "grad_norm": 1.1672080755233765, + "learning_rate": 0.0002, + "loss": 0.4163, + "step": 5440 + }, + { + "epoch": 5.966064586754242, + "grad_norm": 1.0949620008468628, + "learning_rate": 0.0002, + "loss": 0.4425, + "step": 5450 + }, + { + "epoch": 5.977011494252873, + "grad_norm": 1.3183925151824951, + "learning_rate": 0.0002, + "loss": 0.4219, + "step": 5460 + }, + { + "epoch": 5.987958401751505, + "grad_norm": 1.096198320388794, + "learning_rate": 0.0002, + "loss": 0.4171, + "step": 5470 + }, + { + "epoch": 5.998905309250137, + "grad_norm": 1.2601423263549805, + "learning_rate": 0.0002, + "loss": 0.3886, + "step": 5480 + }, + { + "epoch": 6.0, + "eval_loss": 1.611358880996704, + "eval_runtime": 46.0638, + "eval_samples_per_second": 9.465, + "eval_steps_per_second": 1.194, + "step": 5481 + } + ], + "logging_steps": 10, + "max_steps": 7304, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.8152468978794496e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..895406585c61ed7f5a6760976fc06a40a6fd8b3b --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-5481/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a158bca77bc6d58bfffd2acd2e9e641d481e1857fcbdd68fee875b74797abbc4 +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..963c74622eea838273ac83d283aa57de83708ce3 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb59499751e431b269a158e9577079b7719b9d361e6a6d75b69339e4bd45608f +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/optimizer.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6507afe6b978b0d95da2941b604aef718ef82e8a --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe43d0b9cd92304f1825951897180e94734f0f9154fb25ea7086284cba22f982 +size 72886650 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/rng_state.pth b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..257580bde1c6d381934a137c263206d24659d999 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:701709a539d7e9af65f1575bb9d1da237851fa2992973f27c5eb8eacb440fc72 +size 14244 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/scheduler.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e23496075950221d743aae990e9f723697f33446 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b1ec7cfbdc44640c90cecdfe9354667f6017ab26314148e5d93c1ae755b81ed +size 1064 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/trainer_state.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..68803fbe4da3716ec3d1df6fae2887a2e349a160 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/trainer_state.json @@ -0,0 +1,4562 @@ +{ + "best_metric": 1.14472496509552, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827", + "epoch": 6.999452654625069, + "eval_steps": 10, + "global_step": 6394, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.010946907498631636, + "grad_norm": 0.7611560821533203, + "learning_rate": 0.0002, + "loss": 2.4039, + "step": 10 + }, + { + "epoch": 0.021893814997263273, + "grad_norm": 0.4633193612098694, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 20 + }, + { + "epoch": 0.03284072249589491, + "grad_norm": 0.49326154589653015, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 30 + }, + { + "epoch": 0.043787629994526546, + "grad_norm": 0.3943138122558594, + "learning_rate": 0.0002, + "loss": 1.3859, + "step": 40 + }, + { + "epoch": 0.05473453749315818, + "grad_norm": 0.43292930722236633, + "learning_rate": 0.0002, + "loss": 1.3036, + "step": 50 + }, + { + "epoch": 0.06568144499178982, + "grad_norm": 0.3431817591190338, + "learning_rate": 0.0002, + "loss": 1.2427, + "step": 60 + }, + { + "epoch": 0.07662835249042145, + "grad_norm": 0.38011446595191956, + "learning_rate": 0.0002, + "loss": 1.3076, + "step": 70 + }, + { + "epoch": 0.08757525998905309, + "grad_norm": 0.7441071271896362, + "learning_rate": 0.0002, + "loss": 1.1641, + "step": 80 + }, + { + "epoch": 0.09852216748768473, + "grad_norm": 0.3359833061695099, + "learning_rate": 0.0002, + "loss": 1.1721, + "step": 90 + }, + { + "epoch": 0.10946907498631636, + "grad_norm": 0.3724392354488373, + "learning_rate": 0.0002, + "loss": 1.2774, + "step": 100 + }, + { + "epoch": 0.120415982484948, + "grad_norm": 0.40673762559890747, + "learning_rate": 0.0002, + "loss": 1.216, + "step": 110 + }, + { + "epoch": 0.13136288998357964, + "grad_norm": 0.40036800503730774, + "learning_rate": 0.0002, + "loss": 1.2611, + "step": 120 + }, + { + "epoch": 0.1423097974822113, + "grad_norm": 2.844191312789917, + "learning_rate": 0.0002, + "loss": 1.2436, + "step": 130 + }, + { + "epoch": 0.1532567049808429, + "grad_norm": 0.3104734420776367, + "learning_rate": 0.0002, + "loss": 1.2254, + "step": 140 + }, + { + "epoch": 0.16420361247947454, + "grad_norm": 0.3266797959804535, + "learning_rate": 0.0002, + "loss": 1.0498, + "step": 150 + }, + { + "epoch": 0.17515051997810618, + "grad_norm": 0.3079199194908142, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 160 + }, + { + "epoch": 0.18609742747673783, + "grad_norm": 0.3872479498386383, + "learning_rate": 0.0002, + "loss": 1.2923, + "step": 170 + }, + { + "epoch": 0.19704433497536947, + "grad_norm": 0.38654500246047974, + "learning_rate": 0.0002, + "loss": 1.1809, + "step": 180 + }, + { + "epoch": 0.20799124247400108, + "grad_norm": 0.2913552522659302, + "learning_rate": 0.0002, + "loss": 1.066, + "step": 190 + }, + { + "epoch": 0.21893814997263272, + "grad_norm": 0.2960572838783264, + "learning_rate": 0.0002, + "loss": 1.0868, + "step": 200 + }, + { + "epoch": 0.22988505747126436, + "grad_norm": 0.5175501108169556, + "learning_rate": 0.0002, + "loss": 1.136, + "step": 210 + }, + { + "epoch": 0.240831964969896, + "grad_norm": 1.2921574115753174, + "learning_rate": 0.0002, + "loss": 1.1556, + "step": 220 + }, + { + "epoch": 0.25177887246852765, + "grad_norm": 0.2675512135028839, + "learning_rate": 0.0002, + "loss": 1.1488, + "step": 230 + }, + { + "epoch": 0.2627257799671593, + "grad_norm": 0.3956190049648285, + "learning_rate": 0.0002, + "loss": 1.2764, + "step": 240 + }, + { + "epoch": 0.27367268746579093, + "grad_norm": 0.6022581458091736, + "learning_rate": 0.0002, + "loss": 1.1889, + "step": 250 + }, + { + "epoch": 0.2846195949644226, + "grad_norm": 1.1949563026428223, + "learning_rate": 0.0002, + "loss": 1.1981, + "step": 260 + }, + { + "epoch": 0.2955665024630542, + "grad_norm": 0.31173548102378845, + "learning_rate": 0.0002, + "loss": 1.1877, + "step": 270 + }, + { + "epoch": 0.3065134099616858, + "grad_norm": 0.2808472812175751, + "learning_rate": 0.0002, + "loss": 1.06, + "step": 280 + }, + { + "epoch": 0.31746031746031744, + "grad_norm": 0.3042023777961731, + "learning_rate": 0.0002, + "loss": 1.1752, + "step": 290 + }, + { + "epoch": 0.3284072249589491, + "grad_norm": 0.39915043115615845, + "learning_rate": 0.0002, + "loss": 1.3147, + "step": 300 + }, + { + "epoch": 0.3393541324575807, + "grad_norm": 0.39118197560310364, + "learning_rate": 0.0002, + "loss": 1.2425, + "step": 310 + }, + { + "epoch": 0.35030103995621237, + "grad_norm": 0.355010986328125, + "learning_rate": 0.0002, + "loss": 1.1363, + "step": 320 + }, + { + "epoch": 0.361247947454844, + "grad_norm": 0.29734086990356445, + "learning_rate": 0.0002, + "loss": 1.1925, + "step": 330 + }, + { + "epoch": 0.37219485495347565, + "grad_norm": 0.346096009016037, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 340 + }, + { + "epoch": 0.3831417624521073, + "grad_norm": 0.4829643666744232, + "learning_rate": 0.0002, + "loss": 1.1641, + "step": 350 + }, + { + "epoch": 0.39408866995073893, + "grad_norm": 0.4726872742176056, + "learning_rate": 0.0002, + "loss": 1.2808, + "step": 360 + }, + { + "epoch": 0.4050355774493706, + "grad_norm": 0.3130153715610504, + "learning_rate": 0.0002, + "loss": 1.1532, + "step": 370 + }, + { + "epoch": 0.41598248494800216, + "grad_norm": 0.5123590230941772, + "learning_rate": 0.0002, + "loss": 1.1842, + "step": 380 + }, + { + "epoch": 0.4269293924466338, + "grad_norm": 0.3444574773311615, + "learning_rate": 0.0002, + "loss": 1.1539, + "step": 390 + }, + { + "epoch": 0.43787629994526545, + "grad_norm": 0.5302175283432007, + "learning_rate": 0.0002, + "loss": 1.1756, + "step": 400 + }, + { + "epoch": 0.4488232074438971, + "grad_norm": 0.2713572680950165, + "learning_rate": 0.0002, + "loss": 1.1138, + "step": 410 + }, + { + "epoch": 0.45977011494252873, + "grad_norm": 0.29530611634254456, + "learning_rate": 0.0002, + "loss": 1.1281, + "step": 420 + }, + { + "epoch": 0.47071702244116037, + "grad_norm": 0.27282455563545227, + "learning_rate": 0.0002, + "loss": 1.1721, + "step": 430 + }, + { + "epoch": 0.481663929939792, + "grad_norm": 0.2647949755191803, + "learning_rate": 0.0002, + "loss": 1.1213, + "step": 440 + }, + { + "epoch": 0.49261083743842365, + "grad_norm": 0.35509347915649414, + "learning_rate": 0.0002, + "loss": 1.1656, + "step": 450 + }, + { + "epoch": 0.5035577449370553, + "grad_norm": 0.1959609091281891, + "learning_rate": 0.0002, + "loss": 1.1251, + "step": 460 + }, + { + "epoch": 0.5145046524356869, + "grad_norm": 0.40090155601501465, + "learning_rate": 0.0002, + "loss": 1.1889, + "step": 470 + }, + { + "epoch": 0.5254515599343186, + "grad_norm": 0.3354604244232178, + "learning_rate": 0.0002, + "loss": 1.156, + "step": 480 + }, + { + "epoch": 0.5363984674329502, + "grad_norm": 0.2758506238460541, + "learning_rate": 0.0002, + "loss": 1.1024, + "step": 490 + }, + { + "epoch": 0.5473453749315819, + "grad_norm": 0.3629051744937897, + "learning_rate": 0.0002, + "loss": 1.1108, + "step": 500 + }, + { + "epoch": 0.5582922824302134, + "grad_norm": 0.30802229046821594, + "learning_rate": 0.0002, + "loss": 1.2236, + "step": 510 + }, + { + "epoch": 0.5692391899288451, + "grad_norm": 0.3099463880062103, + "learning_rate": 0.0002, + "loss": 1.0424, + "step": 520 + }, + { + "epoch": 0.5801860974274767, + "grad_norm": 0.42299067974090576, + "learning_rate": 0.0002, + "loss": 1.255, + "step": 530 + }, + { + "epoch": 0.5911330049261084, + "grad_norm": 0.5392252802848816, + "learning_rate": 0.0002, + "loss": 1.1698, + "step": 540 + }, + { + "epoch": 0.60207991242474, + "grad_norm": 0.34768250584602356, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 550 + }, + { + "epoch": 0.6130268199233716, + "grad_norm": 0.28490015864372253, + "learning_rate": 0.0002, + "loss": 1.102, + "step": 560 + }, + { + "epoch": 0.6239737274220033, + "grad_norm": 0.34787994623184204, + "learning_rate": 0.0002, + "loss": 1.1111, + "step": 570 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 0.29058772325515747, + "learning_rate": 0.0002, + "loss": 1.0759, + "step": 580 + }, + { + "epoch": 0.6458675424192666, + "grad_norm": 0.4063778817653656, + "learning_rate": 0.0002, + "loss": 1.1157, + "step": 590 + }, + { + "epoch": 0.6568144499178982, + "grad_norm": 0.9244267344474792, + "learning_rate": 0.0002, + "loss": 1.1432, + "step": 600 + }, + { + "epoch": 0.6677613574165299, + "grad_norm": 0.27605190873146057, + "learning_rate": 0.0002, + "loss": 1.0591, + "step": 610 + }, + { + "epoch": 0.6787082649151615, + "grad_norm": 0.34346821904182434, + "learning_rate": 0.0002, + "loss": 1.2123, + "step": 620 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 0.3093279302120209, + "learning_rate": 0.0002, + "loss": 1.2195, + "step": 630 + }, + { + "epoch": 0.7006020799124247, + "grad_norm": 1.0069009065628052, + "learning_rate": 0.0002, + "loss": 1.2461, + "step": 640 + }, + { + "epoch": 0.7115489874110563, + "grad_norm": 0.5049130916595459, + "learning_rate": 0.0002, + "loss": 1.0892, + "step": 650 + }, + { + "epoch": 0.722495894909688, + "grad_norm": 0.3748924732208252, + "learning_rate": 0.0002, + "loss": 1.1404, + "step": 660 + }, + { + "epoch": 0.7334428024083196, + "grad_norm": 0.2964959144592285, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 670 + }, + { + "epoch": 0.7443897099069513, + "grad_norm": 0.4599986970424652, + "learning_rate": 0.0002, + "loss": 1.2617, + "step": 680 + }, + { + "epoch": 0.7553366174055829, + "grad_norm": 0.27292951941490173, + "learning_rate": 0.0002, + "loss": 1.088, + "step": 690 + }, + { + "epoch": 0.7662835249042146, + "grad_norm": 0.3123566806316376, + "learning_rate": 0.0002, + "loss": 1.2047, + "step": 700 + }, + { + "epoch": 0.7772304324028462, + "grad_norm": 0.28310710191726685, + "learning_rate": 0.0002, + "loss": 1.0021, + "step": 710 + }, + { + "epoch": 0.7881773399014779, + "grad_norm": 0.3279992341995239, + "learning_rate": 0.0002, + "loss": 1.2281, + "step": 720 + }, + { + "epoch": 0.7991242474001095, + "grad_norm": 0.28179168701171875, + "learning_rate": 0.0002, + "loss": 1.25, + "step": 730 + }, + { + "epoch": 0.8100711548987412, + "grad_norm": 0.31492987275123596, + "learning_rate": 0.0002, + "loss": 1.0602, + "step": 740 + }, + { + "epoch": 0.8210180623973727, + "grad_norm": 0.41821011900901794, + "learning_rate": 0.0002, + "loss": 1.2518, + "step": 750 + }, + { + "epoch": 0.8319649698960043, + "grad_norm": 0.325235515832901, + "learning_rate": 0.0002, + "loss": 1.1612, + "step": 760 + }, + { + "epoch": 0.842911877394636, + "grad_norm": 0.5366070866584778, + "learning_rate": 0.0002, + "loss": 1.27, + "step": 770 + }, + { + "epoch": 0.8538587848932676, + "grad_norm": 0.32570579648017883, + "learning_rate": 0.0002, + "loss": 1.0921, + "step": 780 + }, + { + "epoch": 0.8648056923918993, + "grad_norm": 0.3642968237400055, + "learning_rate": 0.0002, + "loss": 1.1032, + "step": 790 + }, + { + "epoch": 0.8757525998905309, + "grad_norm": 0.29713448882102966, + "learning_rate": 0.0002, + "loss": 1.1234, + "step": 800 + }, + { + "epoch": 0.8866995073891626, + "grad_norm": 0.23599444329738617, + "learning_rate": 0.0002, + "loss": 1.0978, + "step": 810 + }, + { + "epoch": 0.8976464148877942, + "grad_norm": 0.31522464752197266, + "learning_rate": 0.0002, + "loss": 1.1867, + "step": 820 + }, + { + "epoch": 0.9085933223864259, + "grad_norm": 0.32754790782928467, + "learning_rate": 0.0002, + "loss": 1.0208, + "step": 830 + }, + { + "epoch": 0.9195402298850575, + "grad_norm": 0.22741089761257172, + "learning_rate": 0.0002, + "loss": 0.9786, + "step": 840 + }, + { + "epoch": 0.9304871373836892, + "grad_norm": 0.3089679777622223, + "learning_rate": 0.0002, + "loss": 1.0689, + "step": 850 + }, + { + "epoch": 0.9414340448823207, + "grad_norm": 0.27440521121025085, + "learning_rate": 0.0002, + "loss": 1.0354, + "step": 860 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.3498363792896271, + "learning_rate": 0.0002, + "loss": 1.0417, + "step": 870 + }, + { + "epoch": 0.963327859879584, + "grad_norm": 0.47151854634284973, + "learning_rate": 0.0002, + "loss": 1.269, + "step": 880 + }, + { + "epoch": 0.9742747673782156, + "grad_norm": 0.24756591022014618, + "learning_rate": 0.0002, + "loss": 1.1174, + "step": 890 + }, + { + "epoch": 0.9852216748768473, + "grad_norm": 0.2600938677787781, + "learning_rate": 0.0002, + "loss": 1.0622, + "step": 900 + }, + { + "epoch": 0.9961685823754789, + "grad_norm": 0.2934586703777313, + "learning_rate": 0.0002, + "loss": 1.0968, + "step": 910 + }, + { + "epoch": 0.9994526546250684, + "eval_loss": 1.158464789390564, + "eval_runtime": 46.0774, + "eval_samples_per_second": 9.462, + "eval_steps_per_second": 1.194, + "step": 913 + }, + { + "epoch": 1.0071154898741106, + "grad_norm": 0.37776654958724976, + "learning_rate": 0.0002, + "loss": 1.0322, + "step": 920 + }, + { + "epoch": 1.0180623973727423, + "grad_norm": 0.31784629821777344, + "learning_rate": 0.0002, + "loss": 1.1346, + "step": 930 + }, + { + "epoch": 1.0290093048713738, + "grad_norm": 0.24244336783885956, + "learning_rate": 0.0002, + "loss": 0.8976, + "step": 940 + }, + { + "epoch": 1.0399562123700055, + "grad_norm": 0.3185454308986664, + "learning_rate": 0.0002, + "loss": 1.0172, + "step": 950 + }, + { + "epoch": 1.0509031198686372, + "grad_norm": 0.3589441478252411, + "learning_rate": 0.0002, + "loss": 1.1645, + "step": 960 + }, + { + "epoch": 1.0618500273672686, + "grad_norm": 0.38593578338623047, + "learning_rate": 0.0002, + "loss": 1.0317, + "step": 970 + }, + { + "epoch": 1.0727969348659003, + "grad_norm": 0.39694955945014954, + "learning_rate": 0.0002, + "loss": 0.9913, + "step": 980 + }, + { + "epoch": 1.083743842364532, + "grad_norm": 0.469817191362381, + "learning_rate": 0.0002, + "loss": 1.0893, + "step": 990 + }, + { + "epoch": 1.0946907498631637, + "grad_norm": 0.2634755074977875, + "learning_rate": 0.0002, + "loss": 0.983, + "step": 1000 + }, + { + "epoch": 1.1056376573617952, + "grad_norm": 0.43189436197280884, + "learning_rate": 0.0002, + "loss": 1.0144, + "step": 1010 + }, + { + "epoch": 1.116584564860427, + "grad_norm": 0.5559977889060974, + "learning_rate": 0.0002, + "loss": 0.9663, + "step": 1020 + }, + { + "epoch": 1.1275314723590586, + "grad_norm": 0.32100191712379456, + "learning_rate": 0.0002, + "loss": 1.0481, + "step": 1030 + }, + { + "epoch": 1.1384783798576903, + "grad_norm": 0.40179768204689026, + "learning_rate": 0.0002, + "loss": 1.1012, + "step": 1040 + }, + { + "epoch": 1.1494252873563218, + "grad_norm": 0.3659493029117584, + "learning_rate": 0.0002, + "loss": 1.0029, + "step": 1050 + }, + { + "epoch": 1.1603721948549535, + "grad_norm": 0.701704204082489, + "learning_rate": 0.0002, + "loss": 0.9597, + "step": 1060 + }, + { + "epoch": 1.1713191023535852, + "grad_norm": 0.3650563359260559, + "learning_rate": 0.0002, + "loss": 1.0204, + "step": 1070 + }, + { + "epoch": 1.1822660098522166, + "grad_norm": 0.3191976249217987, + "learning_rate": 0.0002, + "loss": 0.907, + "step": 1080 + }, + { + "epoch": 1.1932129173508483, + "grad_norm": 0.3615441918373108, + "learning_rate": 0.0002, + "loss": 1.0648, + "step": 1090 + }, + { + "epoch": 1.20415982484948, + "grad_norm": 0.39474231004714966, + "learning_rate": 0.0002, + "loss": 1.0067, + "step": 1100 + }, + { + "epoch": 1.2151067323481117, + "grad_norm": 0.3752822279930115, + "learning_rate": 0.0002, + "loss": 0.9852, + "step": 1110 + }, + { + "epoch": 1.2260536398467432, + "grad_norm": 0.4165991246700287, + "learning_rate": 0.0002, + "loss": 1.0067, + "step": 1120 + }, + { + "epoch": 1.237000547345375, + "grad_norm": 0.5326506495475769, + "learning_rate": 0.0002, + "loss": 1.0244, + "step": 1130 + }, + { + "epoch": 1.2479474548440066, + "grad_norm": 0.48845794796943665, + "learning_rate": 0.0002, + "loss": 1.0542, + "step": 1140 + }, + { + "epoch": 1.2588943623426383, + "grad_norm": 0.29910150170326233, + "learning_rate": 0.0002, + "loss": 1.0885, + "step": 1150 + }, + { + "epoch": 1.2698412698412698, + "grad_norm": 0.5069725513458252, + "learning_rate": 0.0002, + "loss": 1.2233, + "step": 1160 + }, + { + "epoch": 1.2807881773399015, + "grad_norm": 0.29500406980514526, + "learning_rate": 0.0002, + "loss": 1.0992, + "step": 1170 + }, + { + "epoch": 1.2917350848385332, + "grad_norm": 0.4711538851261139, + "learning_rate": 0.0002, + "loss": 1.0291, + "step": 1180 + }, + { + "epoch": 1.3026819923371646, + "grad_norm": 0.4203340709209442, + "learning_rate": 0.0002, + "loss": 1.0513, + "step": 1190 + }, + { + "epoch": 1.3136288998357963, + "grad_norm": 0.36101874709129333, + "learning_rate": 0.0002, + "loss": 1.1274, + "step": 1200 + }, + { + "epoch": 1.324575807334428, + "grad_norm": 0.4608800411224365, + "learning_rate": 0.0002, + "loss": 1.085, + "step": 1210 + }, + { + "epoch": 1.3355227148330597, + "grad_norm": 0.6570906639099121, + "learning_rate": 0.0002, + "loss": 1.0695, + "step": 1220 + }, + { + "epoch": 1.3464696223316914, + "grad_norm": 0.5352164506912231, + "learning_rate": 0.0002, + "loss": 0.9786, + "step": 1230 + }, + { + "epoch": 1.357416529830323, + "grad_norm": 0.3885001242160797, + "learning_rate": 0.0002, + "loss": 1.0513, + "step": 1240 + }, + { + "epoch": 1.3683634373289546, + "grad_norm": 0.2987913489341736, + "learning_rate": 0.0002, + "loss": 1.0611, + "step": 1250 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 0.42070427536964417, + "learning_rate": 0.0002, + "loss": 1.109, + "step": 1260 + }, + { + "epoch": 1.3902572523262178, + "grad_norm": 0.5957782864570618, + "learning_rate": 0.0002, + "loss": 1.1092, + "step": 1270 + }, + { + "epoch": 1.4012041598248495, + "grad_norm": 0.32898882031440735, + "learning_rate": 0.0002, + "loss": 1.0554, + "step": 1280 + }, + { + "epoch": 1.4121510673234812, + "grad_norm": 0.27624452114105225, + "learning_rate": 0.0002, + "loss": 0.909, + "step": 1290 + }, + { + "epoch": 1.4230979748221126, + "grad_norm": 0.49570828676223755, + "learning_rate": 0.0002, + "loss": 1.024, + "step": 1300 + }, + { + "epoch": 1.4340448823207443, + "grad_norm": 0.26191383600234985, + "learning_rate": 0.0002, + "loss": 1.0471, + "step": 1310 + }, + { + "epoch": 1.444991789819376, + "grad_norm": 0.35664042830467224, + "learning_rate": 0.0002, + "loss": 1.0788, + "step": 1320 + }, + { + "epoch": 1.4559386973180077, + "grad_norm": 0.45126354694366455, + "learning_rate": 0.0002, + "loss": 1.0108, + "step": 1330 + }, + { + "epoch": 1.4668856048166394, + "grad_norm": 0.37318357825279236, + "learning_rate": 0.0002, + "loss": 0.9571, + "step": 1340 + }, + { + "epoch": 1.477832512315271, + "grad_norm": 0.6428970098495483, + "learning_rate": 0.0002, + "loss": 1.0507, + "step": 1350 + }, + { + "epoch": 1.4887794198139026, + "grad_norm": 0.43256187438964844, + "learning_rate": 0.0002, + "loss": 1.0367, + "step": 1360 + }, + { + "epoch": 1.4997263273125343, + "grad_norm": 0.5343793630599976, + "learning_rate": 0.0002, + "loss": 1.1321, + "step": 1370 + }, + { + "epoch": 1.5106732348111658, + "grad_norm": 0.315437376499176, + "learning_rate": 0.0002, + "loss": 1.1054, + "step": 1380 + }, + { + "epoch": 1.5216201423097975, + "grad_norm": 0.41561153531074524, + "learning_rate": 0.0002, + "loss": 0.8916, + "step": 1390 + }, + { + "epoch": 1.5325670498084292, + "grad_norm": 0.3201070725917816, + "learning_rate": 0.0002, + "loss": 1.0391, + "step": 1400 + }, + { + "epoch": 1.5435139573070606, + "grad_norm": 0.505537211894989, + "learning_rate": 0.0002, + "loss": 1.14, + "step": 1410 + }, + { + "epoch": 1.5544608648056923, + "grad_norm": 0.3747410178184509, + "learning_rate": 0.0002, + "loss": 1.0775, + "step": 1420 + }, + { + "epoch": 1.565407772304324, + "grad_norm": 0.49385908246040344, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 1430 + }, + { + "epoch": 1.5763546798029555, + "grad_norm": 0.49831628799438477, + "learning_rate": 0.0002, + "loss": 1.1182, + "step": 1440 + }, + { + "epoch": 1.5873015873015874, + "grad_norm": 0.372127890586853, + "learning_rate": 0.0002, + "loss": 1.0079, + "step": 1450 + }, + { + "epoch": 1.598248494800219, + "grad_norm": 0.40070840716362, + "learning_rate": 0.0002, + "loss": 0.9931, + "step": 1460 + }, + { + "epoch": 1.6091954022988506, + "grad_norm": 0.34907400608062744, + "learning_rate": 0.0002, + "loss": 0.8954, + "step": 1470 + }, + { + "epoch": 1.6201423097974823, + "grad_norm": 0.4632418751716614, + "learning_rate": 0.0002, + "loss": 0.9743, + "step": 1480 + }, + { + "epoch": 1.6310892172961138, + "grad_norm": 0.40164515376091003, + "learning_rate": 0.0002, + "loss": 1.0103, + "step": 1490 + }, + { + "epoch": 1.6420361247947455, + "grad_norm": 0.3214994966983795, + "learning_rate": 0.0002, + "loss": 0.9523, + "step": 1500 + }, + { + "epoch": 1.6529830322933772, + "grad_norm": 0.3727897107601166, + "learning_rate": 0.0002, + "loss": 1.0161, + "step": 1510 + }, + { + "epoch": 1.6639299397920086, + "grad_norm": 0.3817640542984009, + "learning_rate": 0.0002, + "loss": 1.0443, + "step": 1520 + }, + { + "epoch": 1.6748768472906403, + "grad_norm": 0.5592136979103088, + "learning_rate": 0.0002, + "loss": 1.0511, + "step": 1530 + }, + { + "epoch": 1.685823754789272, + "grad_norm": 0.44636598229408264, + "learning_rate": 0.0002, + "loss": 0.9682, + "step": 1540 + }, + { + "epoch": 1.6967706622879035, + "grad_norm": 0.40441709756851196, + "learning_rate": 0.0002, + "loss": 1.033, + "step": 1550 + }, + { + "epoch": 1.7077175697865354, + "grad_norm": 0.3243522644042969, + "learning_rate": 0.0002, + "loss": 1.0857, + "step": 1560 + }, + { + "epoch": 1.718664477285167, + "grad_norm": 0.34277570247650146, + "learning_rate": 0.0002, + "loss": 0.9482, + "step": 1570 + }, + { + "epoch": 1.7296113847837986, + "grad_norm": 0.3279995024204254, + "learning_rate": 0.0002, + "loss": 0.974, + "step": 1580 + }, + { + "epoch": 1.7405582922824303, + "grad_norm": 0.41968777775764465, + "learning_rate": 0.0002, + "loss": 0.9414, + "step": 1590 + }, + { + "epoch": 1.7515051997810618, + "grad_norm": 0.39464613795280457, + "learning_rate": 0.0002, + "loss": 0.9768, + "step": 1600 + }, + { + "epoch": 1.7624521072796935, + "grad_norm": 0.3839009404182434, + "learning_rate": 0.0002, + "loss": 1.0347, + "step": 1610 + }, + { + "epoch": 1.7733990147783252, + "grad_norm": 0.3250715434551239, + "learning_rate": 0.0002, + "loss": 0.9195, + "step": 1620 + }, + { + "epoch": 1.7843459222769567, + "grad_norm": 0.5166561007499695, + "learning_rate": 0.0002, + "loss": 1.0049, + "step": 1630 + }, + { + "epoch": 1.7952928297755884, + "grad_norm": 0.4115183353424072, + "learning_rate": 0.0002, + "loss": 1.0364, + "step": 1640 + }, + { + "epoch": 1.80623973727422, + "grad_norm": 0.373780220746994, + "learning_rate": 0.0002, + "loss": 1.0248, + "step": 1650 + }, + { + "epoch": 1.8171866447728515, + "grad_norm": 0.49697014689445496, + "learning_rate": 0.0002, + "loss": 1.0984, + "step": 1660 + }, + { + "epoch": 1.8281335522714834, + "grad_norm": 1.0308938026428223, + "learning_rate": 0.0002, + "loss": 1.0089, + "step": 1670 + }, + { + "epoch": 1.839080459770115, + "grad_norm": 0.4851366877555847, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 1680 + }, + { + "epoch": 1.8500273672687466, + "grad_norm": 0.3262481391429901, + "learning_rate": 0.0002, + "loss": 0.9533, + "step": 1690 + }, + { + "epoch": 1.8609742747673783, + "grad_norm": 0.6904496550559998, + "learning_rate": 0.0002, + "loss": 1.048, + "step": 1700 + }, + { + "epoch": 1.8719211822660098, + "grad_norm": 0.49789851903915405, + "learning_rate": 0.0002, + "loss": 1.0577, + "step": 1710 + }, + { + "epoch": 1.8828680897646415, + "grad_norm": 0.3035794198513031, + "learning_rate": 0.0002, + "loss": 1.0258, + "step": 1720 + }, + { + "epoch": 1.8938149972632732, + "grad_norm": 0.4588414430618286, + "learning_rate": 0.0002, + "loss": 0.9916, + "step": 1730 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 0.4313034117221832, + "learning_rate": 0.0002, + "loss": 0.9526, + "step": 1740 + }, + { + "epoch": 1.9157088122605364, + "grad_norm": 0.38562044501304626, + "learning_rate": 0.0002, + "loss": 1.0857, + "step": 1750 + }, + { + "epoch": 1.926655719759168, + "grad_norm": 0.46947410702705383, + "learning_rate": 0.0002, + "loss": 0.9763, + "step": 1760 + }, + { + "epoch": 1.9376026272577995, + "grad_norm": 0.3848404884338379, + "learning_rate": 0.0002, + "loss": 0.9967, + "step": 1770 + }, + { + "epoch": 1.9485495347564314, + "grad_norm": 0.30422744154930115, + "learning_rate": 0.0002, + "loss": 1.0474, + "step": 1780 + }, + { + "epoch": 1.959496442255063, + "grad_norm": 0.41100990772247314, + "learning_rate": 0.0002, + "loss": 1.022, + "step": 1790 + }, + { + "epoch": 1.9704433497536946, + "grad_norm": 0.3492335081100464, + "learning_rate": 0.0002, + "loss": 1.003, + "step": 1800 + }, + { + "epoch": 1.9813902572523263, + "grad_norm": 0.364577978849411, + "learning_rate": 0.0002, + "loss": 0.9891, + "step": 1810 + }, + { + "epoch": 1.9923371647509578, + "grad_norm": 0.4312075674533844, + "learning_rate": 0.0002, + "loss": 1.0699, + "step": 1820 + }, + { + "epoch": 2.0, + "eval_loss": 1.14472496509552, + "eval_runtime": 46.0786, + "eval_samples_per_second": 9.462, + "eval_steps_per_second": 1.194, + "step": 1827 + }, + { + "epoch": 2.0032840722495897, + "grad_norm": 0.5989689230918884, + "learning_rate": 0.0002, + "loss": 1.0218, + "step": 1830 + }, + { + "epoch": 2.014230979748221, + "grad_norm": 0.49720922112464905, + "learning_rate": 0.0002, + "loss": 0.9384, + "step": 1840 + }, + { + "epoch": 2.0251778872468527, + "grad_norm": 0.42675456404685974, + "learning_rate": 0.0002, + "loss": 0.8482, + "step": 1850 + }, + { + "epoch": 2.0361247947454846, + "grad_norm": 0.4637208580970764, + "learning_rate": 0.0002, + "loss": 0.8487, + "step": 1860 + }, + { + "epoch": 2.047071702244116, + "grad_norm": 0.8329976797103882, + "learning_rate": 0.0002, + "loss": 0.8212, + "step": 1870 + }, + { + "epoch": 2.0580186097427475, + "grad_norm": 0.7869427800178528, + "learning_rate": 0.0002, + "loss": 0.9547, + "step": 1880 + }, + { + "epoch": 2.0689655172413794, + "grad_norm": 0.4927455186843872, + "learning_rate": 0.0002, + "loss": 0.9351, + "step": 1890 + }, + { + "epoch": 2.079912424740011, + "grad_norm": 0.6264246702194214, + "learning_rate": 0.0002, + "loss": 0.8652, + "step": 1900 + }, + { + "epoch": 2.0908593322386424, + "grad_norm": 1.1164122819900513, + "learning_rate": 0.0002, + "loss": 0.9095, + "step": 1910 + }, + { + "epoch": 2.1018062397372743, + "grad_norm": 0.5283981561660767, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 1920 + }, + { + "epoch": 2.112753147235906, + "grad_norm": 0.45621731877326965, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 1930 + }, + { + "epoch": 2.1237000547345373, + "grad_norm": 1.381791591644287, + "learning_rate": 0.0002, + "loss": 0.9184, + "step": 1940 + }, + { + "epoch": 2.134646962233169, + "grad_norm": 0.5151259899139404, + "learning_rate": 0.0002, + "loss": 0.9006, + "step": 1950 + }, + { + "epoch": 2.1455938697318007, + "grad_norm": 0.9806339740753174, + "learning_rate": 0.0002, + "loss": 0.8436, + "step": 1960 + }, + { + "epoch": 2.1565407772304326, + "grad_norm": 0.4734154939651489, + "learning_rate": 0.0002, + "loss": 0.8749, + "step": 1970 + }, + { + "epoch": 2.167487684729064, + "grad_norm": 0.9553168416023254, + "learning_rate": 0.0002, + "loss": 0.9172, + "step": 1980 + }, + { + "epoch": 2.1784345922276955, + "grad_norm": 0.5895838141441345, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1990 + }, + { + "epoch": 2.1893814997263275, + "grad_norm": 0.4488855302333832, + "learning_rate": 0.0002, + "loss": 0.7841, + "step": 2000 + }, + { + "epoch": 2.200328407224959, + "grad_norm": 1.0760235786437988, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 2010 + }, + { + "epoch": 2.2112753147235904, + "grad_norm": 0.5038785338401794, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 2020 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.59819495677948, + "learning_rate": 0.0002, + "loss": 0.8973, + "step": 2030 + }, + { + "epoch": 2.233169129720854, + "grad_norm": 0.5012075304985046, + "learning_rate": 0.0002, + "loss": 0.8009, + "step": 2040 + }, + { + "epoch": 2.2441160372194853, + "grad_norm": 0.44978439807891846, + "learning_rate": 0.0002, + "loss": 0.9087, + "step": 2050 + }, + { + "epoch": 2.255062944718117, + "grad_norm": 0.5350462198257446, + "learning_rate": 0.0002, + "loss": 0.823, + "step": 2060 + }, + { + "epoch": 2.2660098522167487, + "grad_norm": 0.6020669937133789, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 2070 + }, + { + "epoch": 2.2769567597153806, + "grad_norm": 0.5246821045875549, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 2080 + }, + { + "epoch": 2.287903667214012, + "grad_norm": 0.5711268782615662, + "learning_rate": 0.0002, + "loss": 0.8984, + "step": 2090 + }, + { + "epoch": 2.2988505747126435, + "grad_norm": 0.617317259311676, + "learning_rate": 0.0002, + "loss": 0.9093, + "step": 2100 + }, + { + "epoch": 2.3097974822112755, + "grad_norm": 0.8608947396278381, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 2110 + }, + { + "epoch": 2.320744389709907, + "grad_norm": 0.4739076793193817, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 2120 + }, + { + "epoch": 2.3316912972085384, + "grad_norm": 0.5538856983184814, + "learning_rate": 0.0002, + "loss": 0.84, + "step": 2130 + }, + { + "epoch": 2.3426382047071703, + "grad_norm": 0.6064935326576233, + "learning_rate": 0.0002, + "loss": 0.8994, + "step": 2140 + }, + { + "epoch": 2.353585112205802, + "grad_norm": 0.5019068121910095, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 2150 + }, + { + "epoch": 2.3645320197044333, + "grad_norm": 0.45340514183044434, + "learning_rate": 0.0002, + "loss": 0.9576, + "step": 2160 + }, + { + "epoch": 2.375478927203065, + "grad_norm": 0.7347203493118286, + "learning_rate": 0.0002, + "loss": 0.9254, + "step": 2170 + }, + { + "epoch": 2.3864258347016967, + "grad_norm": 0.46922534704208374, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 2180 + }, + { + "epoch": 2.3973727422003286, + "grad_norm": 0.5507845878601074, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 2190 + }, + { + "epoch": 2.40831964969896, + "grad_norm": 0.5621911883354187, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 2200 + }, + { + "epoch": 2.4192665571975915, + "grad_norm": 0.5023514032363892, + "learning_rate": 0.0002, + "loss": 0.8569, + "step": 2210 + }, + { + "epoch": 2.4302134646962235, + "grad_norm": 0.6124861240386963, + "learning_rate": 0.0002, + "loss": 0.8355, + "step": 2220 + }, + { + "epoch": 2.441160372194855, + "grad_norm": 0.49614205956459045, + "learning_rate": 0.0002, + "loss": 0.885, + "step": 2230 + }, + { + "epoch": 2.4521072796934864, + "grad_norm": 0.6477900743484497, + "learning_rate": 0.0002, + "loss": 0.8008, + "step": 2240 + }, + { + "epoch": 2.4630541871921183, + "grad_norm": 0.5868843793869019, + "learning_rate": 0.0002, + "loss": 0.8622, + "step": 2250 + }, + { + "epoch": 2.47400109469075, + "grad_norm": 0.4364610016345978, + "learning_rate": 0.0002, + "loss": 0.8498, + "step": 2260 + }, + { + "epoch": 2.4849480021893813, + "grad_norm": 0.5792964696884155, + "learning_rate": 0.0002, + "loss": 0.8378, + "step": 2270 + }, + { + "epoch": 2.495894909688013, + "grad_norm": 0.5421269536018372, + "learning_rate": 0.0002, + "loss": 0.8743, + "step": 2280 + }, + { + "epoch": 2.5068418171866447, + "grad_norm": 0.5525493025779724, + "learning_rate": 0.0002, + "loss": 0.9637, + "step": 2290 + }, + { + "epoch": 2.5177887246852766, + "grad_norm": 0.6463850140571594, + "learning_rate": 0.0002, + "loss": 0.8075, + "step": 2300 + }, + { + "epoch": 2.528735632183908, + "grad_norm": 0.6861311793327332, + "learning_rate": 0.0002, + "loss": 0.8591, + "step": 2310 + }, + { + "epoch": 2.5396825396825395, + "grad_norm": 0.5563555359840393, + "learning_rate": 0.0002, + "loss": 0.9287, + "step": 2320 + }, + { + "epoch": 2.5506294471811715, + "grad_norm": 0.5721169114112854, + "learning_rate": 0.0002, + "loss": 0.945, + "step": 2330 + }, + { + "epoch": 2.561576354679803, + "grad_norm": 0.5258274674415588, + "learning_rate": 0.0002, + "loss": 0.8271, + "step": 2340 + }, + { + "epoch": 2.572523262178435, + "grad_norm": 0.7057380676269531, + "learning_rate": 0.0002, + "loss": 0.8515, + "step": 2350 + }, + { + "epoch": 2.5834701696770663, + "grad_norm": 0.6869027614593506, + "learning_rate": 0.0002, + "loss": 0.8615, + "step": 2360 + }, + { + "epoch": 2.594417077175698, + "grad_norm": 0.4960809648036957, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 2370 + }, + { + "epoch": 2.6053639846743293, + "grad_norm": 0.9288380146026611, + "learning_rate": 0.0002, + "loss": 0.8476, + "step": 2380 + }, + { + "epoch": 2.616310892172961, + "grad_norm": 0.3765334188938141, + "learning_rate": 0.0002, + "loss": 0.873, + "step": 2390 + }, + { + "epoch": 2.6272577996715927, + "grad_norm": 0.7487865686416626, + "learning_rate": 0.0002, + "loss": 0.8764, + "step": 2400 + }, + { + "epoch": 2.6382047071702246, + "grad_norm": 0.6141156554222107, + "learning_rate": 0.0002, + "loss": 0.7577, + "step": 2410 + }, + { + "epoch": 2.649151614668856, + "grad_norm": 0.8420507907867432, + "learning_rate": 0.0002, + "loss": 0.8534, + "step": 2420 + }, + { + "epoch": 2.6600985221674875, + "grad_norm": 0.53386390209198, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 2430 + }, + { + "epoch": 2.6710454296661195, + "grad_norm": 0.5520607233047485, + "learning_rate": 0.0002, + "loss": 0.8486, + "step": 2440 + }, + { + "epoch": 2.681992337164751, + "grad_norm": 0.5337599515914917, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 2450 + }, + { + "epoch": 2.692939244663383, + "grad_norm": 0.48790836334228516, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 2460 + }, + { + "epoch": 2.7038861521620143, + "grad_norm": 0.8287786245346069, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 2470 + }, + { + "epoch": 2.714833059660646, + "grad_norm": 0.5876168608665466, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 2480 + }, + { + "epoch": 2.7257799671592773, + "grad_norm": 0.5206760764122009, + "learning_rate": 0.0002, + "loss": 0.8773, + "step": 2490 + }, + { + "epoch": 2.736726874657909, + "grad_norm": 0.5619136691093445, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 2500 + }, + { + "epoch": 2.7476737821565407, + "grad_norm": 0.5614883899688721, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 2510 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 0.6157700419425964, + "learning_rate": 0.0002, + "loss": 0.8817, + "step": 2520 + }, + { + "epoch": 2.769567597153804, + "grad_norm": 0.5529953837394714, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 2530 + }, + { + "epoch": 2.7805145046524355, + "grad_norm": 0.6731224060058594, + "learning_rate": 0.0002, + "loss": 0.8881, + "step": 2540 + }, + { + "epoch": 2.7914614121510675, + "grad_norm": 0.6960386633872986, + "learning_rate": 0.0002, + "loss": 0.9125, + "step": 2550 + }, + { + "epoch": 2.802408319649699, + "grad_norm": 0.5203493237495422, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 2560 + }, + { + "epoch": 2.813355227148331, + "grad_norm": 1.036837100982666, + "learning_rate": 0.0002, + "loss": 0.7951, + "step": 2570 + }, + { + "epoch": 2.8243021346469623, + "grad_norm": 0.6125805377960205, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 2580 + }, + { + "epoch": 2.835249042145594, + "grad_norm": 0.6298092603683472, + "learning_rate": 0.0002, + "loss": 0.8939, + "step": 2590 + }, + { + "epoch": 2.8461959496442253, + "grad_norm": 0.5882203578948975, + "learning_rate": 0.0002, + "loss": 0.8845, + "step": 2600 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.8619399666786194, + "learning_rate": 0.0002, + "loss": 0.8921, + "step": 2610 + }, + { + "epoch": 2.8680897646414887, + "grad_norm": 0.4722687304019928, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 2620 + }, + { + "epoch": 2.8790366721401206, + "grad_norm": 0.47399574518203735, + "learning_rate": 0.0002, + "loss": 0.9147, + "step": 2630 + }, + { + "epoch": 2.889983579638752, + "grad_norm": 0.5639172792434692, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 2640 + }, + { + "epoch": 2.9009304871373836, + "grad_norm": 0.4676816761493683, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 2650 + }, + { + "epoch": 2.9118773946360155, + "grad_norm": 0.6906291246414185, + "learning_rate": 0.0002, + "loss": 0.8966, + "step": 2660 + }, + { + "epoch": 2.922824302134647, + "grad_norm": 0.4369746148586273, + "learning_rate": 0.0002, + "loss": 0.8966, + "step": 2670 + }, + { + "epoch": 2.933771209633279, + "grad_norm": 0.46423083543777466, + "learning_rate": 0.0002, + "loss": 0.9173, + "step": 2680 + }, + { + "epoch": 2.9447181171319103, + "grad_norm": 0.5700525045394897, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 2690 + }, + { + "epoch": 2.955665024630542, + "grad_norm": 0.6221476793289185, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 2700 + }, + { + "epoch": 2.9666119321291733, + "grad_norm": 0.6102682948112488, + "learning_rate": 0.0002, + "loss": 0.8538, + "step": 2710 + }, + { + "epoch": 2.977558839627805, + "grad_norm": 0.5317878723144531, + "learning_rate": 0.0002, + "loss": 0.9779, + "step": 2720 + }, + { + "epoch": 2.9885057471264367, + "grad_norm": 0.4438510835170746, + "learning_rate": 0.0002, + "loss": 0.9314, + "step": 2730 + }, + { + "epoch": 2.9994526546250686, + "grad_norm": 0.5022130012512207, + "learning_rate": 0.0002, + "loss": 0.8553, + "step": 2740 + }, + { + "epoch": 2.9994526546250686, + "eval_loss": 1.1722838878631592, + "eval_runtime": 46.0829, + "eval_samples_per_second": 9.461, + "eval_steps_per_second": 1.194, + "step": 2740 + }, + { + "epoch": 3.0103995621237, + "grad_norm": 0.6384502053260803, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 2750 + }, + { + "epoch": 3.0213464696223316, + "grad_norm": 0.9928722381591797, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 2760 + }, + { + "epoch": 3.0322933771209635, + "grad_norm": 0.7813051342964172, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 2770 + }, + { + "epoch": 3.043240284619595, + "grad_norm": 1.0202556848526, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 2780 + }, + { + "epoch": 3.0541871921182264, + "grad_norm": 0.7581062316894531, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 2790 + }, + { + "epoch": 3.0651340996168583, + "grad_norm": 0.6252710223197937, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 2800 + }, + { + "epoch": 3.07608100711549, + "grad_norm": 0.7738662958145142, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 2810 + }, + { + "epoch": 3.0870279146141213, + "grad_norm": 0.7381885051727295, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 2820 + }, + { + "epoch": 3.097974822112753, + "grad_norm": 0.9197564721107483, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 2830 + }, + { + "epoch": 3.1089217296113847, + "grad_norm": 1.000976800918579, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 2840 + }, + { + "epoch": 3.1198686371100166, + "grad_norm": 0.7559131383895874, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 2850 + }, + { + "epoch": 3.130815544608648, + "grad_norm": 0.7213780879974365, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 2860 + }, + { + "epoch": 3.1417624521072796, + "grad_norm": 0.945939838886261, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 2870 + }, + { + "epoch": 3.1527093596059115, + "grad_norm": 0.7277454137802124, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 2880 + }, + { + "epoch": 3.163656267104543, + "grad_norm": 0.762026846408844, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 2890 + }, + { + "epoch": 3.1746031746031744, + "grad_norm": 0.6471221446990967, + "learning_rate": 0.0002, + "loss": 0.5332, + "step": 2900 + }, + { + "epoch": 3.1855500821018063, + "grad_norm": 0.6018978357315063, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 2910 + }, + { + "epoch": 3.196496989600438, + "grad_norm": 0.8607320785522461, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 2920 + }, + { + "epoch": 3.2074438970990693, + "grad_norm": 0.8854126334190369, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 2930 + }, + { + "epoch": 3.218390804597701, + "grad_norm": 0.6620870232582092, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 2940 + }, + { + "epoch": 3.2293377120963327, + "grad_norm": 0.7377511858940125, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 2950 + }, + { + "epoch": 3.2402846195949646, + "grad_norm": 0.7803301811218262, + "learning_rate": 0.0002, + "loss": 0.6798, + "step": 2960 + }, + { + "epoch": 3.251231527093596, + "grad_norm": 0.834061861038208, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 2970 + }, + { + "epoch": 3.2621784345922276, + "grad_norm": 0.8496041893959045, + "learning_rate": 0.0002, + "loss": 0.6894, + "step": 2980 + }, + { + "epoch": 3.2731253420908595, + "grad_norm": 0.7967984676361084, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 2990 + }, + { + "epoch": 3.284072249589491, + "grad_norm": 1.0207016468048096, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 3000 + }, + { + "epoch": 3.2950191570881224, + "grad_norm": 0.7855771780014038, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 3010 + }, + { + "epoch": 3.3059660645867543, + "grad_norm": 0.9427546858787537, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 3020 + }, + { + "epoch": 3.316912972085386, + "grad_norm": 0.823542594909668, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 3030 + }, + { + "epoch": 3.3278598795840173, + "grad_norm": 0.9826635122299194, + "learning_rate": 0.0002, + "loss": 0.7109, + "step": 3040 + }, + { + "epoch": 3.338806787082649, + "grad_norm": 0.7259827852249146, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 3050 + }, + { + "epoch": 3.3497536945812807, + "grad_norm": 0.7774739861488342, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 3060 + }, + { + "epoch": 3.3607006020799126, + "grad_norm": 0.7394293546676636, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 3070 + }, + { + "epoch": 3.371647509578544, + "grad_norm": 0.9017578959465027, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 3080 + }, + { + "epoch": 3.3825944170771756, + "grad_norm": 0.7451054453849792, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 3090 + }, + { + "epoch": 3.3935413245758075, + "grad_norm": 0.7321506142616272, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 3100 + }, + { + "epoch": 3.404488232074439, + "grad_norm": 0.6721828579902649, + "learning_rate": 0.0002, + "loss": 0.6555, + "step": 3110 + }, + { + "epoch": 3.4154351395730704, + "grad_norm": 0.774022102355957, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 3120 + }, + { + "epoch": 3.4263820470717024, + "grad_norm": 0.9143537282943726, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 3130 + }, + { + "epoch": 3.437328954570334, + "grad_norm": 1.226087212562561, + "learning_rate": 0.0002, + "loss": 0.6899, + "step": 3140 + }, + { + "epoch": 3.4482758620689653, + "grad_norm": 0.7545496225357056, + "learning_rate": 0.0002, + "loss": 0.6719, + "step": 3150 + }, + { + "epoch": 3.4592227695675972, + "grad_norm": 0.6515635848045349, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 3160 + }, + { + "epoch": 3.4701696770662287, + "grad_norm": 0.9297090172767639, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 3170 + }, + { + "epoch": 3.4811165845648606, + "grad_norm": 1.0130730867385864, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 3180 + }, + { + "epoch": 3.492063492063492, + "grad_norm": 0.7654589414596558, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 3190 + }, + { + "epoch": 3.5030103995621236, + "grad_norm": 0.9954977631568909, + "learning_rate": 0.0002, + "loss": 0.7401, + "step": 3200 + }, + { + "epoch": 3.5139573070607555, + "grad_norm": 0.6027487516403198, + "learning_rate": 0.0002, + "loss": 0.6661, + "step": 3210 + }, + { + "epoch": 3.524904214559387, + "grad_norm": 0.741770327091217, + "learning_rate": 0.0002, + "loss": 0.6963, + "step": 3220 + }, + { + "epoch": 3.535851122058019, + "grad_norm": 1.0534909963607788, + "learning_rate": 0.0002, + "loss": 0.8112, + "step": 3230 + }, + { + "epoch": 3.5467980295566504, + "grad_norm": 0.937772274017334, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 3240 + }, + { + "epoch": 3.557744937055282, + "grad_norm": 0.8504213690757751, + "learning_rate": 0.0002, + "loss": 0.6681, + "step": 3250 + }, + { + "epoch": 3.5686918445539133, + "grad_norm": 0.7755007147789001, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 3260 + }, + { + "epoch": 3.5796387520525452, + "grad_norm": 1.0193358659744263, + "learning_rate": 0.0002, + "loss": 0.6213, + "step": 3270 + }, + { + "epoch": 3.5905856595511767, + "grad_norm": 0.8440536856651306, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 3280 + }, + { + "epoch": 3.6015325670498086, + "grad_norm": 0.6195939183235168, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 3290 + }, + { + "epoch": 3.61247947454844, + "grad_norm": 0.8608590960502625, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 3300 + }, + { + "epoch": 3.6234263820470716, + "grad_norm": 0.6772327423095703, + "learning_rate": 0.0002, + "loss": 0.7301, + "step": 3310 + }, + { + "epoch": 3.6343732895457035, + "grad_norm": 0.8031839728355408, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 3320 + }, + { + "epoch": 3.645320197044335, + "grad_norm": 0.6080502271652222, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 3330 + }, + { + "epoch": 3.656267104542967, + "grad_norm": 0.8007240891456604, + "learning_rate": 0.0002, + "loss": 0.7431, + "step": 3340 + }, + { + "epoch": 3.6672140120415984, + "grad_norm": 0.8060704469680786, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 3350 + }, + { + "epoch": 3.67816091954023, + "grad_norm": 0.7547586560249329, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 3360 + }, + { + "epoch": 3.6891078270388613, + "grad_norm": 0.686851978302002, + "learning_rate": 0.0002, + "loss": 0.7066, + "step": 3370 + }, + { + "epoch": 3.7000547345374932, + "grad_norm": 0.9429075717926025, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 3380 + }, + { + "epoch": 3.7110016420361247, + "grad_norm": 0.7283591032028198, + "learning_rate": 0.0002, + "loss": 0.6673, + "step": 3390 + }, + { + "epoch": 3.7219485495347566, + "grad_norm": 0.8323085904121399, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 3400 + }, + { + "epoch": 3.732895457033388, + "grad_norm": 0.8529590964317322, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 3410 + }, + { + "epoch": 3.7438423645320196, + "grad_norm": 0.731752872467041, + "learning_rate": 0.0002, + "loss": 0.6555, + "step": 3420 + }, + { + "epoch": 3.7547892720306515, + "grad_norm": 0.8572278618812561, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 3430 + }, + { + "epoch": 3.765736179529283, + "grad_norm": 0.7408691048622131, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 3440 + }, + { + "epoch": 3.776683087027915, + "grad_norm": 0.7470445036888123, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 3450 + }, + { + "epoch": 3.7876299945265464, + "grad_norm": 0.6806244254112244, + "learning_rate": 0.0002, + "loss": 0.7241, + "step": 3460 + }, + { + "epoch": 3.798576902025178, + "grad_norm": 0.9129069447517395, + "learning_rate": 0.0002, + "loss": 0.7739, + "step": 3470 + }, + { + "epoch": 3.8095238095238093, + "grad_norm": 0.8717501759529114, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 3480 + }, + { + "epoch": 3.8204707170224412, + "grad_norm": 0.6761979460716248, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 3490 + }, + { + "epoch": 3.8314176245210727, + "grad_norm": 1.0054380893707275, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 3500 + }, + { + "epoch": 3.8423645320197046, + "grad_norm": 1.1224009990692139, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 3510 + }, + { + "epoch": 3.853311439518336, + "grad_norm": 0.8997692465782166, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 3520 + }, + { + "epoch": 3.8642583470169676, + "grad_norm": 1.0086902379989624, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 3530 + }, + { + "epoch": 3.8752052545155995, + "grad_norm": 0.772739589214325, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 3540 + }, + { + "epoch": 3.886152162014231, + "grad_norm": 1.211774230003357, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 3550 + }, + { + "epoch": 3.897099069512863, + "grad_norm": 0.9572356939315796, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 3560 + }, + { + "epoch": 3.9080459770114944, + "grad_norm": 0.7887842655181885, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 3570 + }, + { + "epoch": 3.918992884510126, + "grad_norm": 0.7308389544487, + "learning_rate": 0.0002, + "loss": 0.7576, + "step": 3580 + }, + { + "epoch": 3.9299397920087573, + "grad_norm": 1.0182650089263916, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 3590 + }, + { + "epoch": 3.9408866995073892, + "grad_norm": 0.8000147342681885, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 3600 + }, + { + "epoch": 3.9518336070060207, + "grad_norm": 0.7385728359222412, + "learning_rate": 0.0002, + "loss": 0.6244, + "step": 3610 + }, + { + "epoch": 3.9627805145046526, + "grad_norm": 0.9233261942863464, + "learning_rate": 0.0002, + "loss": 0.6718, + "step": 3620 + }, + { + "epoch": 3.973727422003284, + "grad_norm": 0.8486751914024353, + "learning_rate": 0.0002, + "loss": 0.6508, + "step": 3630 + }, + { + "epoch": 3.9846743295019156, + "grad_norm": 0.7593663334846497, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 3640 + }, + { + "epoch": 3.9956212370005475, + "grad_norm": 0.7885415554046631, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 3650 + }, + { + "epoch": 4.0, + "eval_loss": 1.250312328338623, + "eval_runtime": 46.0842, + "eval_samples_per_second": 9.461, + "eval_steps_per_second": 1.193, + "step": 3654 + }, + { + "epoch": 4.006568144499179, + "grad_norm": 0.6591703295707703, + "learning_rate": 0.0002, + "loss": 0.5547, + "step": 3660 + }, + { + "epoch": 4.017515051997811, + "grad_norm": 1.36927330493927, + "learning_rate": 0.0002, + "loss": 0.5301, + "step": 3670 + }, + { + "epoch": 4.028461959496442, + "grad_norm": 0.8106328845024109, + "learning_rate": 0.0002, + "loss": 0.4466, + "step": 3680 + }, + { + "epoch": 4.039408866995074, + "grad_norm": 0.7592712044715881, + "learning_rate": 0.0002, + "loss": 0.4861, + "step": 3690 + }, + { + "epoch": 4.050355774493705, + "grad_norm": 0.9518909454345703, + "learning_rate": 0.0002, + "loss": 0.5103, + "step": 3700 + }, + { + "epoch": 4.061302681992337, + "grad_norm": 0.7805967330932617, + "learning_rate": 0.0002, + "loss": 0.4638, + "step": 3710 + }, + { + "epoch": 4.072249589490969, + "grad_norm": 1.3146334886550903, + "learning_rate": 0.0002, + "loss": 0.4556, + "step": 3720 + }, + { + "epoch": 4.083196496989601, + "grad_norm": 1.1611138582229614, + "learning_rate": 0.0002, + "loss": 0.5635, + "step": 3730 + }, + { + "epoch": 4.094143404488232, + "grad_norm": 0.8173232078552246, + "learning_rate": 0.0002, + "loss": 0.3845, + "step": 3740 + }, + { + "epoch": 4.105090311986864, + "grad_norm": 0.7848323583602905, + "learning_rate": 0.0002, + "loss": 0.4911, + "step": 3750 + }, + { + "epoch": 4.116037219485495, + "grad_norm": 1.3183201551437378, + "learning_rate": 0.0002, + "loss": 0.4519, + "step": 3760 + }, + { + "epoch": 4.1269841269841265, + "grad_norm": 1.1936529874801636, + "learning_rate": 0.0002, + "loss": 0.5083, + "step": 3770 + }, + { + "epoch": 4.137931034482759, + "grad_norm": 1.1078993082046509, + "learning_rate": 0.0002, + "loss": 0.5208, + "step": 3780 + }, + { + "epoch": 4.14887794198139, + "grad_norm": 1.107743263244629, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 3790 + }, + { + "epoch": 4.159824849480022, + "grad_norm": 0.7801875472068787, + "learning_rate": 0.0002, + "loss": 0.5112, + "step": 3800 + }, + { + "epoch": 4.170771756978653, + "grad_norm": 1.1328117847442627, + "learning_rate": 0.0002, + "loss": 0.4896, + "step": 3810 + }, + { + "epoch": 4.181718664477285, + "grad_norm": 1.4232193231582642, + "learning_rate": 0.0002, + "loss": 0.5645, + "step": 3820 + }, + { + "epoch": 4.192665571975917, + "grad_norm": 1.557416558265686, + "learning_rate": 0.0002, + "loss": 0.5049, + "step": 3830 + }, + { + "epoch": 4.203612479474549, + "grad_norm": 1.042923092842102, + "learning_rate": 0.0002, + "loss": 0.4863, + "step": 3840 + }, + { + "epoch": 4.21455938697318, + "grad_norm": 1.1801949739456177, + "learning_rate": 0.0002, + "loss": 0.3751, + "step": 3850 + }, + { + "epoch": 4.225506294471812, + "grad_norm": 0.9273753762245178, + "learning_rate": 0.0002, + "loss": 0.5063, + "step": 3860 + }, + { + "epoch": 4.236453201970443, + "grad_norm": 0.7681763768196106, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 3870 + }, + { + "epoch": 4.2474001094690745, + "grad_norm": 0.9840841293334961, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 3880 + }, + { + "epoch": 4.258347016967707, + "grad_norm": 1.0290725231170654, + "learning_rate": 0.0002, + "loss": 0.4648, + "step": 3890 + }, + { + "epoch": 4.269293924466338, + "grad_norm": 0.8059597611427307, + "learning_rate": 0.0002, + "loss": 0.4288, + "step": 3900 + }, + { + "epoch": 4.28024083196497, + "grad_norm": 0.9847467541694641, + "learning_rate": 0.0002, + "loss": 0.5103, + "step": 3910 + }, + { + "epoch": 4.291187739463601, + "grad_norm": 1.344044804573059, + "learning_rate": 0.0002, + "loss": 0.4952, + "step": 3920 + }, + { + "epoch": 4.302134646962233, + "grad_norm": 0.9174224138259888, + "learning_rate": 0.0002, + "loss": 0.4966, + "step": 3930 + }, + { + "epoch": 4.313081554460865, + "grad_norm": 1.1199711561203003, + "learning_rate": 0.0002, + "loss": 0.4944, + "step": 3940 + }, + { + "epoch": 4.324028461959497, + "grad_norm": 1.0120296478271484, + "learning_rate": 0.0002, + "loss": 0.4641, + "step": 3950 + }, + { + "epoch": 4.334975369458128, + "grad_norm": 1.091811180114746, + "learning_rate": 0.0002, + "loss": 0.4723, + "step": 3960 + }, + { + "epoch": 4.34592227695676, + "grad_norm": 1.0332133769989014, + "learning_rate": 0.0002, + "loss": 0.4627, + "step": 3970 + }, + { + "epoch": 4.356869184455391, + "grad_norm": 1.0785295963287354, + "learning_rate": 0.0002, + "loss": 0.4646, + "step": 3980 + }, + { + "epoch": 4.3678160919540225, + "grad_norm": 1.0506969690322876, + "learning_rate": 0.0002, + "loss": 0.4909, + "step": 3990 + }, + { + "epoch": 4.378762999452655, + "grad_norm": 1.047560691833496, + "learning_rate": 0.0002, + "loss": 0.4776, + "step": 4000 + }, + { + "epoch": 4.389709906951286, + "grad_norm": 0.9348800778388977, + "learning_rate": 0.0002, + "loss": 0.4549, + "step": 4010 + }, + { + "epoch": 4.400656814449918, + "grad_norm": 1.1563059091567993, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 4020 + }, + { + "epoch": 4.411603721948549, + "grad_norm": 1.001470923423767, + "learning_rate": 0.0002, + "loss": 0.4952, + "step": 4030 + }, + { + "epoch": 4.422550629447181, + "grad_norm": 1.309012532234192, + "learning_rate": 0.0002, + "loss": 0.4972, + "step": 4040 + }, + { + "epoch": 4.433497536945813, + "grad_norm": 0.7338925004005432, + "learning_rate": 0.0002, + "loss": 0.5078, + "step": 4050 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 1.0398834943771362, + "learning_rate": 0.0002, + "loss": 0.4632, + "step": 4060 + }, + { + "epoch": 4.455391351943076, + "grad_norm": 0.9728689193725586, + "learning_rate": 0.0002, + "loss": 0.6285, + "step": 4070 + }, + { + "epoch": 4.466338259441708, + "grad_norm": 1.247475028038025, + "learning_rate": 0.0002, + "loss": 0.4778, + "step": 4080 + }, + { + "epoch": 4.477285166940339, + "grad_norm": 1.1084578037261963, + "learning_rate": 0.0002, + "loss": 0.4813, + "step": 4090 + }, + { + "epoch": 4.4882320744389705, + "grad_norm": 1.1619318723678589, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 4100 + }, + { + "epoch": 4.499178981937603, + "grad_norm": 1.3456498384475708, + "learning_rate": 0.0002, + "loss": 0.5207, + "step": 4110 + }, + { + "epoch": 4.510125889436234, + "grad_norm": 0.9372991323471069, + "learning_rate": 0.0002, + "loss": 0.4876, + "step": 4120 + }, + { + "epoch": 4.521072796934866, + "grad_norm": 1.0071815252304077, + "learning_rate": 0.0002, + "loss": 0.5456, + "step": 4130 + }, + { + "epoch": 4.532019704433497, + "grad_norm": 1.190344214439392, + "learning_rate": 0.0002, + "loss": 0.5589, + "step": 4140 + }, + { + "epoch": 4.542966611932129, + "grad_norm": 0.9480887055397034, + "learning_rate": 0.0002, + "loss": 0.4852, + "step": 4150 + }, + { + "epoch": 4.553913519430761, + "grad_norm": 1.0252189636230469, + "learning_rate": 0.0002, + "loss": 0.5229, + "step": 4160 + }, + { + "epoch": 4.564860426929393, + "grad_norm": 0.7142013311386108, + "learning_rate": 0.0002, + "loss": 0.5253, + "step": 4170 + }, + { + "epoch": 4.575807334428024, + "grad_norm": 0.8937426805496216, + "learning_rate": 0.0002, + "loss": 0.4861, + "step": 4180 + }, + { + "epoch": 4.586754241926656, + "grad_norm": 0.8885005116462708, + "learning_rate": 0.0002, + "loss": 0.4773, + "step": 4190 + }, + { + "epoch": 4.597701149425287, + "grad_norm": 1.337663173675537, + "learning_rate": 0.0002, + "loss": 0.4858, + "step": 4200 + }, + { + "epoch": 4.6086480569239185, + "grad_norm": 1.0475375652313232, + "learning_rate": 0.0002, + "loss": 0.5247, + "step": 4210 + }, + { + "epoch": 4.619594964422551, + "grad_norm": 1.0081088542938232, + "learning_rate": 0.0002, + "loss": 0.5298, + "step": 4220 + }, + { + "epoch": 4.630541871921182, + "grad_norm": 0.7527595162391663, + "learning_rate": 0.0002, + "loss": 0.5042, + "step": 4230 + }, + { + "epoch": 4.641488779419814, + "grad_norm": 1.55559241771698, + "learning_rate": 0.0002, + "loss": 0.5207, + "step": 4240 + }, + { + "epoch": 4.652435686918445, + "grad_norm": 0.7967379689216614, + "learning_rate": 0.0002, + "loss": 0.5468, + "step": 4250 + }, + { + "epoch": 4.663382594417077, + "grad_norm": 0.898368775844574, + "learning_rate": 0.0002, + "loss": 0.5328, + "step": 4260 + }, + { + "epoch": 4.674329501915709, + "grad_norm": 1.1940776109695435, + "learning_rate": 0.0002, + "loss": 0.4706, + "step": 4270 + }, + { + "epoch": 4.685276409414341, + "grad_norm": 1.1817092895507812, + "learning_rate": 0.0002, + "loss": 0.5121, + "step": 4280 + }, + { + "epoch": 4.696223316912972, + "grad_norm": 0.9041520357131958, + "learning_rate": 0.0002, + "loss": 0.5758, + "step": 4290 + }, + { + "epoch": 4.707170224411604, + "grad_norm": 1.1280102729797363, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 4300 + }, + { + "epoch": 4.718117131910235, + "grad_norm": 1.357689619064331, + "learning_rate": 0.0002, + "loss": 0.4891, + "step": 4310 + }, + { + "epoch": 4.7290640394088665, + "grad_norm": 1.056633472442627, + "learning_rate": 0.0002, + "loss": 0.4704, + "step": 4320 + }, + { + "epoch": 4.740010946907499, + "grad_norm": 1.6520427465438843, + "learning_rate": 0.0002, + "loss": 0.5488, + "step": 4330 + }, + { + "epoch": 4.75095785440613, + "grad_norm": 1.153200626373291, + "learning_rate": 0.0002, + "loss": 0.5131, + "step": 4340 + }, + { + "epoch": 4.761904761904762, + "grad_norm": 0.9346241354942322, + "learning_rate": 0.0002, + "loss": 0.539, + "step": 4350 + }, + { + "epoch": 4.772851669403393, + "grad_norm": 0.8628455996513367, + "learning_rate": 0.0002, + "loss": 0.4941, + "step": 4360 + }, + { + "epoch": 4.783798576902025, + "grad_norm": 1.3843916654586792, + "learning_rate": 0.0002, + "loss": 0.5167, + "step": 4370 + }, + { + "epoch": 4.794745484400657, + "grad_norm": 1.035574197769165, + "learning_rate": 0.0002, + "loss": 0.4683, + "step": 4380 + }, + { + "epoch": 4.805692391899289, + "grad_norm": 1.1868361234664917, + "learning_rate": 0.0002, + "loss": 0.5162, + "step": 4390 + }, + { + "epoch": 4.81663929939792, + "grad_norm": 1.1307647228240967, + "learning_rate": 0.0002, + "loss": 0.534, + "step": 4400 + }, + { + "epoch": 4.827586206896552, + "grad_norm": 0.9787724614143372, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 4410 + }, + { + "epoch": 4.838533114395183, + "grad_norm": 1.0473824739456177, + "learning_rate": 0.0002, + "loss": 0.5185, + "step": 4420 + }, + { + "epoch": 4.8494800218938146, + "grad_norm": 1.069069504737854, + "learning_rate": 0.0002, + "loss": 0.6285, + "step": 4430 + }, + { + "epoch": 4.860426929392447, + "grad_norm": 1.4305680990219116, + "learning_rate": 0.0002, + "loss": 0.5267, + "step": 4440 + }, + { + "epoch": 4.871373836891078, + "grad_norm": 1.3679203987121582, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 4450 + }, + { + "epoch": 4.88232074438971, + "grad_norm": 0.8997844457626343, + "learning_rate": 0.0002, + "loss": 0.5135, + "step": 4460 + }, + { + "epoch": 4.893267651888341, + "grad_norm": 1.2758110761642456, + "learning_rate": 0.0002, + "loss": 0.5312, + "step": 4470 + }, + { + "epoch": 4.904214559386973, + "grad_norm": 0.8819465637207031, + "learning_rate": 0.0002, + "loss": 0.4914, + "step": 4480 + }, + { + "epoch": 4.915161466885605, + "grad_norm": 1.08329439163208, + "learning_rate": 0.0002, + "loss": 0.5147, + "step": 4490 + }, + { + "epoch": 4.926108374384237, + "grad_norm": 1.083461046218872, + "learning_rate": 0.0002, + "loss": 0.5404, + "step": 4500 + }, + { + "epoch": 4.937055281882868, + "grad_norm": 1.2387723922729492, + "learning_rate": 0.0002, + "loss": 0.5433, + "step": 4510 + }, + { + "epoch": 4.9480021893815, + "grad_norm": 0.8262293934822083, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 4520 + }, + { + "epoch": 4.958949096880131, + "grad_norm": 1.2325191497802734, + "learning_rate": 0.0002, + "loss": 0.504, + "step": 4530 + }, + { + "epoch": 4.9698960043787626, + "grad_norm": 1.024614930152893, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 4540 + }, + { + "epoch": 4.980842911877395, + "grad_norm": 1.3007521629333496, + "learning_rate": 0.0002, + "loss": 0.4752, + "step": 4550 + }, + { + "epoch": 4.991789819376026, + "grad_norm": 0.9823828339576721, + "learning_rate": 0.0002, + "loss": 0.4943, + "step": 4560 + }, + { + "epoch": 4.999452654625069, + "eval_loss": 1.3920727968215942, + "eval_runtime": 46.0764, + "eval_samples_per_second": 9.463, + "eval_steps_per_second": 1.194, + "step": 4567 + }, + { + "epoch": 5.002736726874658, + "grad_norm": 1.1478906869888306, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 4570 + }, + { + "epoch": 5.013683634373289, + "grad_norm": 1.0533705949783325, + "learning_rate": 0.0002, + "loss": 0.372, + "step": 4580 + }, + { + "epoch": 5.024630541871921, + "grad_norm": 1.268900752067566, + "learning_rate": 0.0002, + "loss": 0.3313, + "step": 4590 + }, + { + "epoch": 5.035577449370553, + "grad_norm": 1.222652554512024, + "learning_rate": 0.0002, + "loss": 0.3482, + "step": 4600 + }, + { + "epoch": 5.046524356869185, + "grad_norm": 1.5093127489089966, + "learning_rate": 0.0002, + "loss": 0.3195, + "step": 4610 + }, + { + "epoch": 5.057471264367816, + "grad_norm": 1.2372499704360962, + "learning_rate": 0.0002, + "loss": 0.3569, + "step": 4620 + }, + { + "epoch": 5.068418171866448, + "grad_norm": 0.8422666192054749, + "learning_rate": 0.0002, + "loss": 0.3206, + "step": 4630 + }, + { + "epoch": 5.079365079365079, + "grad_norm": 1.1451770067214966, + "learning_rate": 0.0002, + "loss": 0.3115, + "step": 4640 + }, + { + "epoch": 5.090311986863711, + "grad_norm": 1.2074557542800903, + "learning_rate": 0.0002, + "loss": 0.3305, + "step": 4650 + }, + { + "epoch": 5.101258894362343, + "grad_norm": 1.429150104522705, + "learning_rate": 0.0002, + "loss": 0.3012, + "step": 4660 + }, + { + "epoch": 5.112205801860974, + "grad_norm": 1.0353610515594482, + "learning_rate": 0.0002, + "loss": 0.3229, + "step": 4670 + }, + { + "epoch": 5.123152709359606, + "grad_norm": 1.2845979928970337, + "learning_rate": 0.0002, + "loss": 0.402, + "step": 4680 + }, + { + "epoch": 5.134099616858237, + "grad_norm": 1.3790186643600464, + "learning_rate": 0.0002, + "loss": 0.383, + "step": 4690 + }, + { + "epoch": 5.145046524356869, + "grad_norm": 1.3182239532470703, + "learning_rate": 0.0002, + "loss": 0.2951, + "step": 4700 + }, + { + "epoch": 5.155993431855501, + "grad_norm": 1.5249626636505127, + "learning_rate": 0.0002, + "loss": 0.4074, + "step": 4710 + }, + { + "epoch": 5.166940339354133, + "grad_norm": 1.2492733001708984, + "learning_rate": 0.0002, + "loss": 0.3703, + "step": 4720 + }, + { + "epoch": 5.177887246852764, + "grad_norm": 1.4455480575561523, + "learning_rate": 0.0002, + "loss": 0.3411, + "step": 4730 + }, + { + "epoch": 5.188834154351396, + "grad_norm": 1.2191482782363892, + "learning_rate": 0.0002, + "loss": 0.3996, + "step": 4740 + }, + { + "epoch": 5.199781061850027, + "grad_norm": 1.4707951545715332, + "learning_rate": 0.0002, + "loss": 0.3785, + "step": 4750 + }, + { + "epoch": 5.210727969348659, + "grad_norm": 1.3473678827285767, + "learning_rate": 0.0002, + "loss": 0.3516, + "step": 4760 + }, + { + "epoch": 5.221674876847291, + "grad_norm": 1.0479670763015747, + "learning_rate": 0.0002, + "loss": 0.3266, + "step": 4770 + }, + { + "epoch": 5.232621784345922, + "grad_norm": 1.299096703529358, + "learning_rate": 0.0002, + "loss": 0.3976, + "step": 4780 + }, + { + "epoch": 5.243568691844554, + "grad_norm": 1.2820168733596802, + "learning_rate": 0.0002, + "loss": 0.3266, + "step": 4790 + }, + { + "epoch": 5.254515599343185, + "grad_norm": 1.3818004131317139, + "learning_rate": 0.0002, + "loss": 0.3347, + "step": 4800 + }, + { + "epoch": 5.265462506841817, + "grad_norm": 1.2898736000061035, + "learning_rate": 0.0002, + "loss": 0.3761, + "step": 4810 + }, + { + "epoch": 5.276409414340449, + "grad_norm": 1.1761468648910522, + "learning_rate": 0.0002, + "loss": 0.3694, + "step": 4820 + }, + { + "epoch": 5.287356321839081, + "grad_norm": 1.7155952453613281, + "learning_rate": 0.0002, + "loss": 0.3806, + "step": 4830 + }, + { + "epoch": 5.298303229337712, + "grad_norm": 0.9103642106056213, + "learning_rate": 0.0002, + "loss": 0.322, + "step": 4840 + }, + { + "epoch": 5.309250136836344, + "grad_norm": 1.013015627861023, + "learning_rate": 0.0002, + "loss": 0.3516, + "step": 4850 + }, + { + "epoch": 5.320197044334975, + "grad_norm": 1.390471339225769, + "learning_rate": 0.0002, + "loss": 0.4297, + "step": 4860 + }, + { + "epoch": 5.331143951833607, + "grad_norm": 1.129770278930664, + "learning_rate": 0.0002, + "loss": 0.4098, + "step": 4870 + }, + { + "epoch": 5.342090859332239, + "grad_norm": 1.1461067199707031, + "learning_rate": 0.0002, + "loss": 0.4227, + "step": 4880 + }, + { + "epoch": 5.35303776683087, + "grad_norm": 1.3587424755096436, + "learning_rate": 0.0002, + "loss": 0.288, + "step": 4890 + }, + { + "epoch": 5.363984674329502, + "grad_norm": 1.6897879838943481, + "learning_rate": 0.0002, + "loss": 0.3604, + "step": 4900 + }, + { + "epoch": 5.374931581828133, + "grad_norm": 0.9298055768013, + "learning_rate": 0.0002, + "loss": 0.3887, + "step": 4910 + }, + { + "epoch": 5.385878489326765, + "grad_norm": 1.0006917715072632, + "learning_rate": 0.0002, + "loss": 0.3371, + "step": 4920 + }, + { + "epoch": 5.396825396825397, + "grad_norm": 1.232581377029419, + "learning_rate": 0.0002, + "loss": 0.3992, + "step": 4930 + }, + { + "epoch": 5.407772304324029, + "grad_norm": 1.0822620391845703, + "learning_rate": 0.0002, + "loss": 0.3456, + "step": 4940 + }, + { + "epoch": 5.41871921182266, + "grad_norm": 1.3648720979690552, + "learning_rate": 0.0002, + "loss": 0.3806, + "step": 4950 + }, + { + "epoch": 5.429666119321292, + "grad_norm": 1.3220354318618774, + "learning_rate": 0.0002, + "loss": 0.3959, + "step": 4960 + }, + { + "epoch": 5.440613026819923, + "grad_norm": 1.1106271743774414, + "learning_rate": 0.0002, + "loss": 0.3278, + "step": 4970 + }, + { + "epoch": 5.451559934318555, + "grad_norm": 1.6058908700942993, + "learning_rate": 0.0002, + "loss": 0.3812, + "step": 4980 + }, + { + "epoch": 5.462506841817187, + "grad_norm": 1.1065930128097534, + "learning_rate": 0.0002, + "loss": 0.3905, + "step": 4990 + }, + { + "epoch": 5.473453749315818, + "grad_norm": 1.3896466493606567, + "learning_rate": 0.0002, + "loss": 0.4058, + "step": 5000 + }, + { + "epoch": 5.48440065681445, + "grad_norm": 1.0437148809432983, + "learning_rate": 0.0002, + "loss": 0.4122, + "step": 5010 + }, + { + "epoch": 5.495347564313081, + "grad_norm": 1.2347718477249146, + "learning_rate": 0.0002, + "loss": 0.4065, + "step": 5020 + }, + { + "epoch": 5.506294471811713, + "grad_norm": 1.1174284219741821, + "learning_rate": 0.0002, + "loss": 0.3586, + "step": 5030 + }, + { + "epoch": 5.517241379310345, + "grad_norm": 1.2580941915512085, + "learning_rate": 0.0002, + "loss": 0.3576, + "step": 5040 + }, + { + "epoch": 5.528188286808977, + "grad_norm": 1.451090931892395, + "learning_rate": 0.0002, + "loss": 0.3809, + "step": 5050 + }, + { + "epoch": 5.539135194307608, + "grad_norm": 1.4688365459442139, + "learning_rate": 0.0002, + "loss": 0.3645, + "step": 5060 + }, + { + "epoch": 5.55008210180624, + "grad_norm": 1.1625734567642212, + "learning_rate": 0.0002, + "loss": 0.4431, + "step": 5070 + }, + { + "epoch": 5.561029009304871, + "grad_norm": 0.9332265257835388, + "learning_rate": 0.0002, + "loss": 0.3972, + "step": 5080 + }, + { + "epoch": 5.571975916803503, + "grad_norm": 1.5635273456573486, + "learning_rate": 0.0002, + "loss": 0.4, + "step": 5090 + }, + { + "epoch": 5.582922824302135, + "grad_norm": 1.3420509099960327, + "learning_rate": 0.0002, + "loss": 0.3651, + "step": 5100 + }, + { + "epoch": 5.593869731800766, + "grad_norm": 1.5826557874679565, + "learning_rate": 0.0002, + "loss": 0.3717, + "step": 5110 + }, + { + "epoch": 5.604816639299398, + "grad_norm": 1.5737065076828003, + "learning_rate": 0.0002, + "loss": 0.4256, + "step": 5120 + }, + { + "epoch": 5.615763546798029, + "grad_norm": 1.3812499046325684, + "learning_rate": 0.0002, + "loss": 0.39, + "step": 5130 + }, + { + "epoch": 5.626710454296661, + "grad_norm": 1.362833023071289, + "learning_rate": 0.0002, + "loss": 0.3891, + "step": 5140 + }, + { + "epoch": 5.637657361795293, + "grad_norm": 1.7667874097824097, + "learning_rate": 0.0002, + "loss": 0.455, + "step": 5150 + }, + { + "epoch": 5.648604269293925, + "grad_norm": 1.2661789655685425, + "learning_rate": 0.0002, + "loss": 0.4264, + "step": 5160 + }, + { + "epoch": 5.659551176792556, + "grad_norm": 1.2076870203018188, + "learning_rate": 0.0002, + "loss": 0.3261, + "step": 5170 + }, + { + "epoch": 5.670498084291188, + "grad_norm": 1.2431524991989136, + "learning_rate": 0.0002, + "loss": 0.372, + "step": 5180 + }, + { + "epoch": 5.681444991789819, + "grad_norm": 1.2216639518737793, + "learning_rate": 0.0002, + "loss": 0.4092, + "step": 5190 + }, + { + "epoch": 5.692391899288451, + "grad_norm": 0.9259352684020996, + "learning_rate": 0.0002, + "loss": 0.4171, + "step": 5200 + }, + { + "epoch": 5.703338806787083, + "grad_norm": 1.7929338216781616, + "learning_rate": 0.0002, + "loss": 0.3875, + "step": 5210 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 1.4048460721969604, + "learning_rate": 0.0002, + "loss": 0.4424, + "step": 5220 + }, + { + "epoch": 5.725232621784346, + "grad_norm": 1.306874394416809, + "learning_rate": 0.0002, + "loss": 0.3758, + "step": 5230 + }, + { + "epoch": 5.736179529282977, + "grad_norm": 1.3137940168380737, + "learning_rate": 0.0002, + "loss": 0.3889, + "step": 5240 + }, + { + "epoch": 5.747126436781609, + "grad_norm": 1.1376476287841797, + "learning_rate": 0.0002, + "loss": 0.4804, + "step": 5250 + }, + { + "epoch": 5.758073344280241, + "grad_norm": 1.450939416885376, + "learning_rate": 0.0002, + "loss": 0.377, + "step": 5260 + }, + { + "epoch": 5.769020251778873, + "grad_norm": 0.983195960521698, + "learning_rate": 0.0002, + "loss": 0.4732, + "step": 5270 + }, + { + "epoch": 5.779967159277504, + "grad_norm": 1.66558837890625, + "learning_rate": 0.0002, + "loss": 0.4041, + "step": 5280 + }, + { + "epoch": 5.790914066776136, + "grad_norm": 0.9789204597473145, + "learning_rate": 0.0002, + "loss": 0.3643, + "step": 5290 + }, + { + "epoch": 5.801860974274767, + "grad_norm": 1.2110556364059448, + "learning_rate": 0.0002, + "loss": 0.3776, + "step": 5300 + }, + { + "epoch": 5.812807881773399, + "grad_norm": 1.3799304962158203, + "learning_rate": 0.0002, + "loss": 0.4049, + "step": 5310 + }, + { + "epoch": 5.823754789272031, + "grad_norm": 1.0570626258850098, + "learning_rate": 0.0002, + "loss": 0.4362, + "step": 5320 + }, + { + "epoch": 5.834701696770662, + "grad_norm": 1.4654436111450195, + "learning_rate": 0.0002, + "loss": 0.4716, + "step": 5330 + }, + { + "epoch": 5.845648604269294, + "grad_norm": 1.5216940641403198, + "learning_rate": 0.0002, + "loss": 0.4048, + "step": 5340 + }, + { + "epoch": 5.856595511767925, + "grad_norm": 1.018646001815796, + "learning_rate": 0.0002, + "loss": 0.3848, + "step": 5350 + }, + { + "epoch": 5.867542419266557, + "grad_norm": 1.028951644897461, + "learning_rate": 0.0002, + "loss": 0.3705, + "step": 5360 + }, + { + "epoch": 5.878489326765189, + "grad_norm": 2.571263313293457, + "learning_rate": 0.0002, + "loss": 0.4213, + "step": 5370 + }, + { + "epoch": 5.889436234263821, + "grad_norm": 1.3323984146118164, + "learning_rate": 0.0002, + "loss": 0.3647, + "step": 5380 + }, + { + "epoch": 5.900383141762452, + "grad_norm": 1.4317777156829834, + "learning_rate": 0.0002, + "loss": 0.4085, + "step": 5390 + }, + { + "epoch": 5.911330049261084, + "grad_norm": 1.4289140701293945, + "learning_rate": 0.0002, + "loss": 0.4254, + "step": 5400 + }, + { + "epoch": 5.922276956759715, + "grad_norm": 1.3130780458450317, + "learning_rate": 0.0002, + "loss": 0.3993, + "step": 5410 + }, + { + "epoch": 5.933223864258347, + "grad_norm": 1.3979902267456055, + "learning_rate": 0.0002, + "loss": 0.4025, + "step": 5420 + }, + { + "epoch": 5.944170771756979, + "grad_norm": 1.1827352046966553, + "learning_rate": 0.0002, + "loss": 0.3997, + "step": 5430 + }, + { + "epoch": 5.95511767925561, + "grad_norm": 1.1672080755233765, + "learning_rate": 0.0002, + "loss": 0.4163, + "step": 5440 + }, + { + "epoch": 5.966064586754242, + "grad_norm": 1.0949620008468628, + "learning_rate": 0.0002, + "loss": 0.4425, + "step": 5450 + }, + { + "epoch": 5.977011494252873, + "grad_norm": 1.3183925151824951, + "learning_rate": 0.0002, + "loss": 0.4219, + "step": 5460 + }, + { + "epoch": 5.987958401751505, + "grad_norm": 1.096198320388794, + "learning_rate": 0.0002, + "loss": 0.4171, + "step": 5470 + }, + { + "epoch": 5.998905309250137, + "grad_norm": 1.2601423263549805, + "learning_rate": 0.0002, + "loss": 0.3886, + "step": 5480 + }, + { + "epoch": 6.0, + "eval_loss": 1.611358880996704, + "eval_runtime": 46.0638, + "eval_samples_per_second": 9.465, + "eval_steps_per_second": 1.194, + "step": 5481 + }, + { + "epoch": 6.009852216748769, + "grad_norm": 0.9854364991188049, + "learning_rate": 0.0002, + "loss": 0.2616, + "step": 5490 + }, + { + "epoch": 6.0207991242474, + "grad_norm": 1.8073689937591553, + "learning_rate": 0.0002, + "loss": 0.2412, + "step": 5500 + }, + { + "epoch": 6.031746031746032, + "grad_norm": 1.1852164268493652, + "learning_rate": 0.0002, + "loss": 0.2317, + "step": 5510 + }, + { + "epoch": 6.042692939244663, + "grad_norm": 1.0937914848327637, + "learning_rate": 0.0002, + "loss": 0.224, + "step": 5520 + }, + { + "epoch": 6.053639846743295, + "grad_norm": 0.7411194443702698, + "learning_rate": 0.0002, + "loss": 0.2473, + "step": 5530 + }, + { + "epoch": 6.064586754241927, + "grad_norm": 1.552127480506897, + "learning_rate": 0.0002, + "loss": 0.2846, + "step": 5540 + }, + { + "epoch": 6.075533661740558, + "grad_norm": 1.0465604066848755, + "learning_rate": 0.0002, + "loss": 0.2639, + "step": 5550 + }, + { + "epoch": 6.08648056923919, + "grad_norm": 1.4008121490478516, + "learning_rate": 0.0002, + "loss": 0.2696, + "step": 5560 + }, + { + "epoch": 6.097427476737821, + "grad_norm": 1.7049046754837036, + "learning_rate": 0.0002, + "loss": 0.3049, + "step": 5570 + }, + { + "epoch": 6.108374384236453, + "grad_norm": 1.111151933670044, + "learning_rate": 0.0002, + "loss": 0.263, + "step": 5580 + }, + { + "epoch": 6.119321291735085, + "grad_norm": 1.4271087646484375, + "learning_rate": 0.0002, + "loss": 0.2816, + "step": 5590 + }, + { + "epoch": 6.130268199233717, + "grad_norm": 1.3917373418807983, + "learning_rate": 0.0002, + "loss": 0.2878, + "step": 5600 + }, + { + "epoch": 6.141215106732348, + "grad_norm": 1.013689637184143, + "learning_rate": 0.0002, + "loss": 0.2482, + "step": 5610 + }, + { + "epoch": 6.15216201423098, + "grad_norm": 1.342645525932312, + "learning_rate": 0.0002, + "loss": 0.2841, + "step": 5620 + }, + { + "epoch": 6.163108921729611, + "grad_norm": 1.4480562210083008, + "learning_rate": 0.0002, + "loss": 0.2335, + "step": 5630 + }, + { + "epoch": 6.174055829228243, + "grad_norm": 1.2483175992965698, + "learning_rate": 0.0002, + "loss": 0.2696, + "step": 5640 + }, + { + "epoch": 6.185002736726875, + "grad_norm": 1.2944550514221191, + "learning_rate": 0.0002, + "loss": 0.2656, + "step": 5650 + }, + { + "epoch": 6.195949644225506, + "grad_norm": 1.264142632484436, + "learning_rate": 0.0002, + "loss": 0.2704, + "step": 5660 + }, + { + "epoch": 6.206896551724138, + "grad_norm": 1.2068781852722168, + "learning_rate": 0.0002, + "loss": 0.2971, + "step": 5670 + }, + { + "epoch": 6.217843459222769, + "grad_norm": 1.0401629209518433, + "learning_rate": 0.0002, + "loss": 0.2882, + "step": 5680 + }, + { + "epoch": 6.228790366721401, + "grad_norm": 1.2054402828216553, + "learning_rate": 0.0002, + "loss": 0.3022, + "step": 5690 + }, + { + "epoch": 6.239737274220033, + "grad_norm": 1.1278687715530396, + "learning_rate": 0.0002, + "loss": 0.2949, + "step": 5700 + }, + { + "epoch": 6.250684181718665, + "grad_norm": 1.24592125415802, + "learning_rate": 0.0002, + "loss": 0.2477, + "step": 5710 + }, + { + "epoch": 6.261631089217296, + "grad_norm": 1.2686697244644165, + "learning_rate": 0.0002, + "loss": 0.246, + "step": 5720 + }, + { + "epoch": 6.272577996715928, + "grad_norm": 1.1836518049240112, + "learning_rate": 0.0002, + "loss": 0.2974, + "step": 5730 + }, + { + "epoch": 6.283524904214559, + "grad_norm": 1.387752890586853, + "learning_rate": 0.0002, + "loss": 0.2963, + "step": 5740 + }, + { + "epoch": 6.294471811713191, + "grad_norm": 1.9390363693237305, + "learning_rate": 0.0002, + "loss": 0.2961, + "step": 5750 + }, + { + "epoch": 6.305418719211823, + "grad_norm": 1.2919824123382568, + "learning_rate": 0.0002, + "loss": 0.2765, + "step": 5760 + }, + { + "epoch": 6.316365626710454, + "grad_norm": 1.2793965339660645, + "learning_rate": 0.0002, + "loss": 0.2898, + "step": 5770 + }, + { + "epoch": 6.327312534209086, + "grad_norm": 1.5486980676651, + "learning_rate": 0.0002, + "loss": 0.2786, + "step": 5780 + }, + { + "epoch": 6.338259441707717, + "grad_norm": 1.2757408618927002, + "learning_rate": 0.0002, + "loss": 0.2684, + "step": 5790 + }, + { + "epoch": 6.349206349206349, + "grad_norm": 1.3245713710784912, + "learning_rate": 0.0002, + "loss": 0.2841, + "step": 5800 + }, + { + "epoch": 6.360153256704981, + "grad_norm": 1.6262527704238892, + "learning_rate": 0.0002, + "loss": 0.3096, + "step": 5810 + }, + { + "epoch": 6.371100164203613, + "grad_norm": 1.465224027633667, + "learning_rate": 0.0002, + "loss": 0.3219, + "step": 5820 + }, + { + "epoch": 6.382047071702244, + "grad_norm": 1.437408447265625, + "learning_rate": 0.0002, + "loss": 0.2703, + "step": 5830 + }, + { + "epoch": 6.392993979200876, + "grad_norm": 1.3094626665115356, + "learning_rate": 0.0002, + "loss": 0.3012, + "step": 5840 + }, + { + "epoch": 6.403940886699507, + "grad_norm": 1.6717544794082642, + "learning_rate": 0.0002, + "loss": 0.2991, + "step": 5850 + }, + { + "epoch": 6.414887794198139, + "grad_norm": 1.1023344993591309, + "learning_rate": 0.0002, + "loss": 0.2892, + "step": 5860 + }, + { + "epoch": 6.425834701696771, + "grad_norm": 1.2397106885910034, + "learning_rate": 0.0002, + "loss": 0.3078, + "step": 5870 + }, + { + "epoch": 6.436781609195402, + "grad_norm": 1.6139185428619385, + "learning_rate": 0.0002, + "loss": 0.2984, + "step": 5880 + }, + { + "epoch": 6.447728516694034, + "grad_norm": 1.3164576292037964, + "learning_rate": 0.0002, + "loss": 0.2353, + "step": 5890 + }, + { + "epoch": 6.458675424192665, + "grad_norm": 1.3317217826843262, + "learning_rate": 0.0002, + "loss": 0.2772, + "step": 5900 + }, + { + "epoch": 6.469622331691297, + "grad_norm": 1.215008020401001, + "learning_rate": 0.0002, + "loss": 0.2555, + "step": 5910 + }, + { + "epoch": 6.480569239189929, + "grad_norm": 1.625672698020935, + "learning_rate": 0.0002, + "loss": 0.2715, + "step": 5920 + }, + { + "epoch": 6.491516146688561, + "grad_norm": 1.1262489557266235, + "learning_rate": 0.0002, + "loss": 0.2938, + "step": 5930 + }, + { + "epoch": 6.502463054187192, + "grad_norm": 1.447100281715393, + "learning_rate": 0.0002, + "loss": 0.2921, + "step": 5940 + }, + { + "epoch": 6.513409961685824, + "grad_norm": 1.3306448459625244, + "learning_rate": 0.0002, + "loss": 0.3059, + "step": 5950 + }, + { + "epoch": 6.524356869184455, + "grad_norm": 1.307732105255127, + "learning_rate": 0.0002, + "loss": 0.2922, + "step": 5960 + }, + { + "epoch": 6.535303776683087, + "grad_norm": 1.1851097345352173, + "learning_rate": 0.0002, + "loss": 0.2891, + "step": 5970 + }, + { + "epoch": 6.546250684181719, + "grad_norm": 1.462816596031189, + "learning_rate": 0.0002, + "loss": 0.2859, + "step": 5980 + }, + { + "epoch": 6.55719759168035, + "grad_norm": 1.2324728965759277, + "learning_rate": 0.0002, + "loss": 0.2698, + "step": 5990 + }, + { + "epoch": 6.568144499178982, + "grad_norm": 1.3627429008483887, + "learning_rate": 0.0002, + "loss": 0.2672, + "step": 6000 + }, + { + "epoch": 6.579091406677613, + "grad_norm": 1.94977867603302, + "learning_rate": 0.0002, + "loss": 0.3182, + "step": 6010 + }, + { + "epoch": 6.590038314176245, + "grad_norm": 1.459844946861267, + "learning_rate": 0.0002, + "loss": 0.3183, + "step": 6020 + }, + { + "epoch": 6.600985221674877, + "grad_norm": 1.4454325437545776, + "learning_rate": 0.0002, + "loss": 0.3142, + "step": 6030 + }, + { + "epoch": 6.611932129173509, + "grad_norm": 1.4245165586471558, + "learning_rate": 0.0002, + "loss": 0.269, + "step": 6040 + }, + { + "epoch": 6.62287903667214, + "grad_norm": 1.195803165435791, + "learning_rate": 0.0002, + "loss": 0.3041, + "step": 6050 + }, + { + "epoch": 6.633825944170772, + "grad_norm": 1.3589898347854614, + "learning_rate": 0.0002, + "loss": 0.3075, + "step": 6060 + }, + { + "epoch": 6.644772851669403, + "grad_norm": 1.3488036394119263, + "learning_rate": 0.0002, + "loss": 0.3291, + "step": 6070 + }, + { + "epoch": 6.655719759168035, + "grad_norm": 1.0954102277755737, + "learning_rate": 0.0002, + "loss": 0.2898, + "step": 6080 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.4431062936782837, + "learning_rate": 0.0002, + "loss": 0.3489, + "step": 6090 + }, + { + "epoch": 6.677613574165298, + "grad_norm": 1.4387465715408325, + "learning_rate": 0.0002, + "loss": 0.2816, + "step": 6100 + }, + { + "epoch": 6.68856048166393, + "grad_norm": 1.8398990631103516, + "learning_rate": 0.0002, + "loss": 0.2705, + "step": 6110 + }, + { + "epoch": 6.699507389162561, + "grad_norm": 1.3523273468017578, + "learning_rate": 0.0002, + "loss": 0.3214, + "step": 6120 + }, + { + "epoch": 6.710454296661193, + "grad_norm": 1.6326191425323486, + "learning_rate": 0.0002, + "loss": 0.287, + "step": 6130 + }, + { + "epoch": 6.721401204159825, + "grad_norm": 1.3677960634231567, + "learning_rate": 0.0002, + "loss": 0.2857, + "step": 6140 + }, + { + "epoch": 6.732348111658457, + "grad_norm": 1.1993201971054077, + "learning_rate": 0.0002, + "loss": 0.3264, + "step": 6150 + }, + { + "epoch": 6.743295019157088, + "grad_norm": 1.1864078044891357, + "learning_rate": 0.0002, + "loss": 0.3071, + "step": 6160 + }, + { + "epoch": 6.75424192665572, + "grad_norm": 1.1625522375106812, + "learning_rate": 0.0002, + "loss": 0.3087, + "step": 6170 + }, + { + "epoch": 6.765188834154351, + "grad_norm": 1.5803234577178955, + "learning_rate": 0.0002, + "loss": 0.3551, + "step": 6180 + }, + { + "epoch": 6.776135741652983, + "grad_norm": 1.151746153831482, + "learning_rate": 0.0002, + "loss": 0.3059, + "step": 6190 + }, + { + "epoch": 6.787082649151615, + "grad_norm": 1.0727161169052124, + "learning_rate": 0.0002, + "loss": 0.2697, + "step": 6200 + }, + { + "epoch": 6.798029556650246, + "grad_norm": 1.4148162603378296, + "learning_rate": 0.0002, + "loss": 0.2844, + "step": 6210 + }, + { + "epoch": 6.808976464148878, + "grad_norm": 1.2071447372436523, + "learning_rate": 0.0002, + "loss": 0.3417, + "step": 6220 + }, + { + "epoch": 6.819923371647509, + "grad_norm": 1.3843804597854614, + "learning_rate": 0.0002, + "loss": 0.3066, + "step": 6230 + }, + { + "epoch": 6.830870279146141, + "grad_norm": 1.2490662336349487, + "learning_rate": 0.0002, + "loss": 0.2769, + "step": 6240 + }, + { + "epoch": 6.841817186644773, + "grad_norm": 1.6029689311981201, + "learning_rate": 0.0002, + "loss": 0.3237, + "step": 6250 + }, + { + "epoch": 6.852764094143405, + "grad_norm": 1.0388455390930176, + "learning_rate": 0.0002, + "loss": 0.3152, + "step": 6260 + }, + { + "epoch": 6.863711001642036, + "grad_norm": 1.3883857727050781, + "learning_rate": 0.0002, + "loss": 0.3026, + "step": 6270 + }, + { + "epoch": 6.874657909140668, + "grad_norm": 1.0500187873840332, + "learning_rate": 0.0002, + "loss": 0.3175, + "step": 6280 + }, + { + "epoch": 6.885604816639299, + "grad_norm": 1.4243487119674683, + "learning_rate": 0.0002, + "loss": 0.2952, + "step": 6290 + }, + { + "epoch": 6.896551724137931, + "grad_norm": 1.3169665336608887, + "learning_rate": 0.0002, + "loss": 0.2679, + "step": 6300 + }, + { + "epoch": 6.907498631636563, + "grad_norm": 1.5261493921279907, + "learning_rate": 0.0002, + "loss": 0.3291, + "step": 6310 + }, + { + "epoch": 6.9184455391351944, + "grad_norm": 1.578403115272522, + "learning_rate": 0.0002, + "loss": 0.3344, + "step": 6320 + }, + { + "epoch": 6.929392446633826, + "grad_norm": 1.4093263149261475, + "learning_rate": 0.0002, + "loss": 0.3263, + "step": 6330 + }, + { + "epoch": 6.940339354132457, + "grad_norm": 1.4003552198410034, + "learning_rate": 0.0002, + "loss": 0.3396, + "step": 6340 + }, + { + "epoch": 6.951286261631089, + "grad_norm": 1.650190830230713, + "learning_rate": 0.0002, + "loss": 0.3476, + "step": 6350 + }, + { + "epoch": 6.962233169129721, + "grad_norm": 1.2314515113830566, + "learning_rate": 0.0002, + "loss": 0.3442, + "step": 6360 + }, + { + "epoch": 6.973180076628353, + "grad_norm": 1.270980954170227, + "learning_rate": 0.0002, + "loss": 0.3341, + "step": 6370 + }, + { + "epoch": 6.984126984126984, + "grad_norm": 1.6352545022964478, + "learning_rate": 0.0002, + "loss": 0.3425, + "step": 6380 + }, + { + "epoch": 6.995073891625616, + "grad_norm": 1.3744925260543823, + "learning_rate": 0.0002, + "loss": 0.3647, + "step": 6390 + }, + { + "epoch": 6.999452654625069, + "eval_loss": 1.756764531135559, + "eval_runtime": 46.0542, + "eval_samples_per_second": 9.467, + "eval_steps_per_second": 1.194, + "step": 6394 + } + ], + "logging_steps": 10, + "max_steps": 7304, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.284454714192691e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..895406585c61ed7f5a6760976fc06a40a6fd8b3b --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-6394/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a158bca77bc6d58bfffd2acd2e9e641d481e1857fcbdd68fee875b74797abbc4 +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ec5568997301ef15ec19890e1db44076874b660d --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e75d7b31a1782d17b5e60ef83b5a281f3a63ddf3550ecdd8d670c015ab44a3c3 +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/optimizer.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..36409135a3bc94b0a9f063bc966f43a9cb962cbd --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c634d73066f5266d0914318b725cc5ae40d7bb1c13681976f5672d9895120ca +size 72886650 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/rng_state.pth b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cbed656ccd7cafa4f68547e8cd86492c6d0ef2f4 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c5a84bd578fc3f9ab30db5f881ce38f2483cd31a0966710b640743309d399fd +size 14244 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/scheduler.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f0b65f305d95bfee75fc360c1debb95cf87eb45 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8de44f7d4d23605f9cddf0b9002140a8423325f3c130a3fe1b1f6033c2d9861e +size 1064 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/trainer_state.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..dc82a795067083bef62bc9d23a6705187ed03203 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/trainer_state.json @@ -0,0 +1,5207 @@ +{ + "best_metric": 1.14472496509552, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827", + "epoch": 7.995621237000547, + "eval_steps": 10, + "global_step": 7304, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.010946907498631636, + "grad_norm": 0.7611560821533203, + "learning_rate": 0.0002, + "loss": 2.4039, + "step": 10 + }, + { + "epoch": 0.021893814997263273, + "grad_norm": 0.4633193612098694, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 20 + }, + { + "epoch": 0.03284072249589491, + "grad_norm": 0.49326154589653015, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 30 + }, + { + "epoch": 0.043787629994526546, + "grad_norm": 0.3943138122558594, + "learning_rate": 0.0002, + "loss": 1.3859, + "step": 40 + }, + { + "epoch": 0.05473453749315818, + "grad_norm": 0.43292930722236633, + "learning_rate": 0.0002, + "loss": 1.3036, + "step": 50 + }, + { + "epoch": 0.06568144499178982, + "grad_norm": 0.3431817591190338, + "learning_rate": 0.0002, + "loss": 1.2427, + "step": 60 + }, + { + "epoch": 0.07662835249042145, + "grad_norm": 0.38011446595191956, + "learning_rate": 0.0002, + "loss": 1.3076, + "step": 70 + }, + { + "epoch": 0.08757525998905309, + "grad_norm": 0.7441071271896362, + "learning_rate": 0.0002, + "loss": 1.1641, + "step": 80 + }, + { + "epoch": 0.09852216748768473, + "grad_norm": 0.3359833061695099, + "learning_rate": 0.0002, + "loss": 1.1721, + "step": 90 + }, + { + "epoch": 0.10946907498631636, + "grad_norm": 0.3724392354488373, + "learning_rate": 0.0002, + "loss": 1.2774, + "step": 100 + }, + { + "epoch": 0.120415982484948, + "grad_norm": 0.40673762559890747, + "learning_rate": 0.0002, + "loss": 1.216, + "step": 110 + }, + { + "epoch": 0.13136288998357964, + "grad_norm": 0.40036800503730774, + "learning_rate": 0.0002, + "loss": 1.2611, + "step": 120 + }, + { + "epoch": 0.1423097974822113, + "grad_norm": 2.844191312789917, + "learning_rate": 0.0002, + "loss": 1.2436, + "step": 130 + }, + { + "epoch": 0.1532567049808429, + "grad_norm": 0.3104734420776367, + "learning_rate": 0.0002, + "loss": 1.2254, + "step": 140 + }, + { + "epoch": 0.16420361247947454, + "grad_norm": 0.3266797959804535, + "learning_rate": 0.0002, + "loss": 1.0498, + "step": 150 + }, + { + "epoch": 0.17515051997810618, + "grad_norm": 0.3079199194908142, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 160 + }, + { + "epoch": 0.18609742747673783, + "grad_norm": 0.3872479498386383, + "learning_rate": 0.0002, + "loss": 1.2923, + "step": 170 + }, + { + "epoch": 0.19704433497536947, + "grad_norm": 0.38654500246047974, + "learning_rate": 0.0002, + "loss": 1.1809, + "step": 180 + }, + { + "epoch": 0.20799124247400108, + "grad_norm": 0.2913552522659302, + "learning_rate": 0.0002, + "loss": 1.066, + "step": 190 + }, + { + "epoch": 0.21893814997263272, + "grad_norm": 0.2960572838783264, + "learning_rate": 0.0002, + "loss": 1.0868, + "step": 200 + }, + { + "epoch": 0.22988505747126436, + "grad_norm": 0.5175501108169556, + "learning_rate": 0.0002, + "loss": 1.136, + "step": 210 + }, + { + "epoch": 0.240831964969896, + "grad_norm": 1.2921574115753174, + "learning_rate": 0.0002, + "loss": 1.1556, + "step": 220 + }, + { + "epoch": 0.25177887246852765, + "grad_norm": 0.2675512135028839, + "learning_rate": 0.0002, + "loss": 1.1488, + "step": 230 + }, + { + "epoch": 0.2627257799671593, + "grad_norm": 0.3956190049648285, + "learning_rate": 0.0002, + "loss": 1.2764, + "step": 240 + }, + { + "epoch": 0.27367268746579093, + "grad_norm": 0.6022581458091736, + "learning_rate": 0.0002, + "loss": 1.1889, + "step": 250 + }, + { + "epoch": 0.2846195949644226, + "grad_norm": 1.1949563026428223, + "learning_rate": 0.0002, + "loss": 1.1981, + "step": 260 + }, + { + "epoch": 0.2955665024630542, + "grad_norm": 0.31173548102378845, + "learning_rate": 0.0002, + "loss": 1.1877, + "step": 270 + }, + { + "epoch": 0.3065134099616858, + "grad_norm": 0.2808472812175751, + "learning_rate": 0.0002, + "loss": 1.06, + "step": 280 + }, + { + "epoch": 0.31746031746031744, + "grad_norm": 0.3042023777961731, + "learning_rate": 0.0002, + "loss": 1.1752, + "step": 290 + }, + { + "epoch": 0.3284072249589491, + "grad_norm": 0.39915043115615845, + "learning_rate": 0.0002, + "loss": 1.3147, + "step": 300 + }, + { + "epoch": 0.3393541324575807, + "grad_norm": 0.39118197560310364, + "learning_rate": 0.0002, + "loss": 1.2425, + "step": 310 + }, + { + "epoch": 0.35030103995621237, + "grad_norm": 0.355010986328125, + "learning_rate": 0.0002, + "loss": 1.1363, + "step": 320 + }, + { + "epoch": 0.361247947454844, + "grad_norm": 0.29734086990356445, + "learning_rate": 0.0002, + "loss": 1.1925, + "step": 330 + }, + { + "epoch": 0.37219485495347565, + "grad_norm": 0.346096009016037, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 340 + }, + { + "epoch": 0.3831417624521073, + "grad_norm": 0.4829643666744232, + "learning_rate": 0.0002, + "loss": 1.1641, + "step": 350 + }, + { + "epoch": 0.39408866995073893, + "grad_norm": 0.4726872742176056, + "learning_rate": 0.0002, + "loss": 1.2808, + "step": 360 + }, + { + "epoch": 0.4050355774493706, + "grad_norm": 0.3130153715610504, + "learning_rate": 0.0002, + "loss": 1.1532, + "step": 370 + }, + { + "epoch": 0.41598248494800216, + "grad_norm": 0.5123590230941772, + "learning_rate": 0.0002, + "loss": 1.1842, + "step": 380 + }, + { + "epoch": 0.4269293924466338, + "grad_norm": 0.3444574773311615, + "learning_rate": 0.0002, + "loss": 1.1539, + "step": 390 + }, + { + "epoch": 0.43787629994526545, + "grad_norm": 0.5302175283432007, + "learning_rate": 0.0002, + "loss": 1.1756, + "step": 400 + }, + { + "epoch": 0.4488232074438971, + "grad_norm": 0.2713572680950165, + "learning_rate": 0.0002, + "loss": 1.1138, + "step": 410 + }, + { + "epoch": 0.45977011494252873, + "grad_norm": 0.29530611634254456, + "learning_rate": 0.0002, + "loss": 1.1281, + "step": 420 + }, + { + "epoch": 0.47071702244116037, + "grad_norm": 0.27282455563545227, + "learning_rate": 0.0002, + "loss": 1.1721, + "step": 430 + }, + { + "epoch": 0.481663929939792, + "grad_norm": 0.2647949755191803, + "learning_rate": 0.0002, + "loss": 1.1213, + "step": 440 + }, + { + "epoch": 0.49261083743842365, + "grad_norm": 0.35509347915649414, + "learning_rate": 0.0002, + "loss": 1.1656, + "step": 450 + }, + { + "epoch": 0.5035577449370553, + "grad_norm": 0.1959609091281891, + "learning_rate": 0.0002, + "loss": 1.1251, + "step": 460 + }, + { + "epoch": 0.5145046524356869, + "grad_norm": 0.40090155601501465, + "learning_rate": 0.0002, + "loss": 1.1889, + "step": 470 + }, + { + "epoch": 0.5254515599343186, + "grad_norm": 0.3354604244232178, + "learning_rate": 0.0002, + "loss": 1.156, + "step": 480 + }, + { + "epoch": 0.5363984674329502, + "grad_norm": 0.2758506238460541, + "learning_rate": 0.0002, + "loss": 1.1024, + "step": 490 + }, + { + "epoch": 0.5473453749315819, + "grad_norm": 0.3629051744937897, + "learning_rate": 0.0002, + "loss": 1.1108, + "step": 500 + }, + { + "epoch": 0.5582922824302134, + "grad_norm": 0.30802229046821594, + "learning_rate": 0.0002, + "loss": 1.2236, + "step": 510 + }, + { + "epoch": 0.5692391899288451, + "grad_norm": 0.3099463880062103, + "learning_rate": 0.0002, + "loss": 1.0424, + "step": 520 + }, + { + "epoch": 0.5801860974274767, + "grad_norm": 0.42299067974090576, + "learning_rate": 0.0002, + "loss": 1.255, + "step": 530 + }, + { + "epoch": 0.5911330049261084, + "grad_norm": 0.5392252802848816, + "learning_rate": 0.0002, + "loss": 1.1698, + "step": 540 + }, + { + "epoch": 0.60207991242474, + "grad_norm": 0.34768250584602356, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 550 + }, + { + "epoch": 0.6130268199233716, + "grad_norm": 0.28490015864372253, + "learning_rate": 0.0002, + "loss": 1.102, + "step": 560 + }, + { + "epoch": 0.6239737274220033, + "grad_norm": 0.34787994623184204, + "learning_rate": 0.0002, + "loss": 1.1111, + "step": 570 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 0.29058772325515747, + "learning_rate": 0.0002, + "loss": 1.0759, + "step": 580 + }, + { + "epoch": 0.6458675424192666, + "grad_norm": 0.4063778817653656, + "learning_rate": 0.0002, + "loss": 1.1157, + "step": 590 + }, + { + "epoch": 0.6568144499178982, + "grad_norm": 0.9244267344474792, + "learning_rate": 0.0002, + "loss": 1.1432, + "step": 600 + }, + { + "epoch": 0.6677613574165299, + "grad_norm": 0.27605190873146057, + "learning_rate": 0.0002, + "loss": 1.0591, + "step": 610 + }, + { + "epoch": 0.6787082649151615, + "grad_norm": 0.34346821904182434, + "learning_rate": 0.0002, + "loss": 1.2123, + "step": 620 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 0.3093279302120209, + "learning_rate": 0.0002, + "loss": 1.2195, + "step": 630 + }, + { + "epoch": 0.7006020799124247, + "grad_norm": 1.0069009065628052, + "learning_rate": 0.0002, + "loss": 1.2461, + "step": 640 + }, + { + "epoch": 0.7115489874110563, + "grad_norm": 0.5049130916595459, + "learning_rate": 0.0002, + "loss": 1.0892, + "step": 650 + }, + { + "epoch": 0.722495894909688, + "grad_norm": 0.3748924732208252, + "learning_rate": 0.0002, + "loss": 1.1404, + "step": 660 + }, + { + "epoch": 0.7334428024083196, + "grad_norm": 0.2964959144592285, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 670 + }, + { + "epoch": 0.7443897099069513, + "grad_norm": 0.4599986970424652, + "learning_rate": 0.0002, + "loss": 1.2617, + "step": 680 + }, + { + "epoch": 0.7553366174055829, + "grad_norm": 0.27292951941490173, + "learning_rate": 0.0002, + "loss": 1.088, + "step": 690 + }, + { + "epoch": 0.7662835249042146, + "grad_norm": 0.3123566806316376, + "learning_rate": 0.0002, + "loss": 1.2047, + "step": 700 + }, + { + "epoch": 0.7772304324028462, + "grad_norm": 0.28310710191726685, + "learning_rate": 0.0002, + "loss": 1.0021, + "step": 710 + }, + { + "epoch": 0.7881773399014779, + "grad_norm": 0.3279992341995239, + "learning_rate": 0.0002, + "loss": 1.2281, + "step": 720 + }, + { + "epoch": 0.7991242474001095, + "grad_norm": 0.28179168701171875, + "learning_rate": 0.0002, + "loss": 1.25, + "step": 730 + }, + { + "epoch": 0.8100711548987412, + "grad_norm": 0.31492987275123596, + "learning_rate": 0.0002, + "loss": 1.0602, + "step": 740 + }, + { + "epoch": 0.8210180623973727, + "grad_norm": 0.41821011900901794, + "learning_rate": 0.0002, + "loss": 1.2518, + "step": 750 + }, + { + "epoch": 0.8319649698960043, + "grad_norm": 0.325235515832901, + "learning_rate": 0.0002, + "loss": 1.1612, + "step": 760 + }, + { + "epoch": 0.842911877394636, + "grad_norm": 0.5366070866584778, + "learning_rate": 0.0002, + "loss": 1.27, + "step": 770 + }, + { + "epoch": 0.8538587848932676, + "grad_norm": 0.32570579648017883, + "learning_rate": 0.0002, + "loss": 1.0921, + "step": 780 + }, + { + "epoch": 0.8648056923918993, + "grad_norm": 0.3642968237400055, + "learning_rate": 0.0002, + "loss": 1.1032, + "step": 790 + }, + { + "epoch": 0.8757525998905309, + "grad_norm": 0.29713448882102966, + "learning_rate": 0.0002, + "loss": 1.1234, + "step": 800 + }, + { + "epoch": 0.8866995073891626, + "grad_norm": 0.23599444329738617, + "learning_rate": 0.0002, + "loss": 1.0978, + "step": 810 + }, + { + "epoch": 0.8976464148877942, + "grad_norm": 0.31522464752197266, + "learning_rate": 0.0002, + "loss": 1.1867, + "step": 820 + }, + { + "epoch": 0.9085933223864259, + "grad_norm": 0.32754790782928467, + "learning_rate": 0.0002, + "loss": 1.0208, + "step": 830 + }, + { + "epoch": 0.9195402298850575, + "grad_norm": 0.22741089761257172, + "learning_rate": 0.0002, + "loss": 0.9786, + "step": 840 + }, + { + "epoch": 0.9304871373836892, + "grad_norm": 0.3089679777622223, + "learning_rate": 0.0002, + "loss": 1.0689, + "step": 850 + }, + { + "epoch": 0.9414340448823207, + "grad_norm": 0.27440521121025085, + "learning_rate": 0.0002, + "loss": 1.0354, + "step": 860 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.3498363792896271, + "learning_rate": 0.0002, + "loss": 1.0417, + "step": 870 + }, + { + "epoch": 0.963327859879584, + "grad_norm": 0.47151854634284973, + "learning_rate": 0.0002, + "loss": 1.269, + "step": 880 + }, + { + "epoch": 0.9742747673782156, + "grad_norm": 0.24756591022014618, + "learning_rate": 0.0002, + "loss": 1.1174, + "step": 890 + }, + { + "epoch": 0.9852216748768473, + "grad_norm": 0.2600938677787781, + "learning_rate": 0.0002, + "loss": 1.0622, + "step": 900 + }, + { + "epoch": 0.9961685823754789, + "grad_norm": 0.2934586703777313, + "learning_rate": 0.0002, + "loss": 1.0968, + "step": 910 + }, + { + "epoch": 0.9994526546250684, + "eval_loss": 1.158464789390564, + "eval_runtime": 46.0774, + "eval_samples_per_second": 9.462, + "eval_steps_per_second": 1.194, + "step": 913 + }, + { + "epoch": 1.0071154898741106, + "grad_norm": 0.37776654958724976, + "learning_rate": 0.0002, + "loss": 1.0322, + "step": 920 + }, + { + "epoch": 1.0180623973727423, + "grad_norm": 0.31784629821777344, + "learning_rate": 0.0002, + "loss": 1.1346, + "step": 930 + }, + { + "epoch": 1.0290093048713738, + "grad_norm": 0.24244336783885956, + "learning_rate": 0.0002, + "loss": 0.8976, + "step": 940 + }, + { + "epoch": 1.0399562123700055, + "grad_norm": 0.3185454308986664, + "learning_rate": 0.0002, + "loss": 1.0172, + "step": 950 + }, + { + "epoch": 1.0509031198686372, + "grad_norm": 0.3589441478252411, + "learning_rate": 0.0002, + "loss": 1.1645, + "step": 960 + }, + { + "epoch": 1.0618500273672686, + "grad_norm": 0.38593578338623047, + "learning_rate": 0.0002, + "loss": 1.0317, + "step": 970 + }, + { + "epoch": 1.0727969348659003, + "grad_norm": 0.39694955945014954, + "learning_rate": 0.0002, + "loss": 0.9913, + "step": 980 + }, + { + "epoch": 1.083743842364532, + "grad_norm": 0.469817191362381, + "learning_rate": 0.0002, + "loss": 1.0893, + "step": 990 + }, + { + "epoch": 1.0946907498631637, + "grad_norm": 0.2634755074977875, + "learning_rate": 0.0002, + "loss": 0.983, + "step": 1000 + }, + { + "epoch": 1.1056376573617952, + "grad_norm": 0.43189436197280884, + "learning_rate": 0.0002, + "loss": 1.0144, + "step": 1010 + }, + { + "epoch": 1.116584564860427, + "grad_norm": 0.5559977889060974, + "learning_rate": 0.0002, + "loss": 0.9663, + "step": 1020 + }, + { + "epoch": 1.1275314723590586, + "grad_norm": 0.32100191712379456, + "learning_rate": 0.0002, + "loss": 1.0481, + "step": 1030 + }, + { + "epoch": 1.1384783798576903, + "grad_norm": 0.40179768204689026, + "learning_rate": 0.0002, + "loss": 1.1012, + "step": 1040 + }, + { + "epoch": 1.1494252873563218, + "grad_norm": 0.3659493029117584, + "learning_rate": 0.0002, + "loss": 1.0029, + "step": 1050 + }, + { + "epoch": 1.1603721948549535, + "grad_norm": 0.701704204082489, + "learning_rate": 0.0002, + "loss": 0.9597, + "step": 1060 + }, + { + "epoch": 1.1713191023535852, + "grad_norm": 0.3650563359260559, + "learning_rate": 0.0002, + "loss": 1.0204, + "step": 1070 + }, + { + "epoch": 1.1822660098522166, + "grad_norm": 0.3191976249217987, + "learning_rate": 0.0002, + "loss": 0.907, + "step": 1080 + }, + { + "epoch": 1.1932129173508483, + "grad_norm": 0.3615441918373108, + "learning_rate": 0.0002, + "loss": 1.0648, + "step": 1090 + }, + { + "epoch": 1.20415982484948, + "grad_norm": 0.39474231004714966, + "learning_rate": 0.0002, + "loss": 1.0067, + "step": 1100 + }, + { + "epoch": 1.2151067323481117, + "grad_norm": 0.3752822279930115, + "learning_rate": 0.0002, + "loss": 0.9852, + "step": 1110 + }, + { + "epoch": 1.2260536398467432, + "grad_norm": 0.4165991246700287, + "learning_rate": 0.0002, + "loss": 1.0067, + "step": 1120 + }, + { + "epoch": 1.237000547345375, + "grad_norm": 0.5326506495475769, + "learning_rate": 0.0002, + "loss": 1.0244, + "step": 1130 + }, + { + "epoch": 1.2479474548440066, + "grad_norm": 0.48845794796943665, + "learning_rate": 0.0002, + "loss": 1.0542, + "step": 1140 + }, + { + "epoch": 1.2588943623426383, + "grad_norm": 0.29910150170326233, + "learning_rate": 0.0002, + "loss": 1.0885, + "step": 1150 + }, + { + "epoch": 1.2698412698412698, + "grad_norm": 0.5069725513458252, + "learning_rate": 0.0002, + "loss": 1.2233, + "step": 1160 + }, + { + "epoch": 1.2807881773399015, + "grad_norm": 0.29500406980514526, + "learning_rate": 0.0002, + "loss": 1.0992, + "step": 1170 + }, + { + "epoch": 1.2917350848385332, + "grad_norm": 0.4711538851261139, + "learning_rate": 0.0002, + "loss": 1.0291, + "step": 1180 + }, + { + "epoch": 1.3026819923371646, + "grad_norm": 0.4203340709209442, + "learning_rate": 0.0002, + "loss": 1.0513, + "step": 1190 + }, + { + "epoch": 1.3136288998357963, + "grad_norm": 0.36101874709129333, + "learning_rate": 0.0002, + "loss": 1.1274, + "step": 1200 + }, + { + "epoch": 1.324575807334428, + "grad_norm": 0.4608800411224365, + "learning_rate": 0.0002, + "loss": 1.085, + "step": 1210 + }, + { + "epoch": 1.3355227148330597, + "grad_norm": 0.6570906639099121, + "learning_rate": 0.0002, + "loss": 1.0695, + "step": 1220 + }, + { + "epoch": 1.3464696223316914, + "grad_norm": 0.5352164506912231, + "learning_rate": 0.0002, + "loss": 0.9786, + "step": 1230 + }, + { + "epoch": 1.357416529830323, + "grad_norm": 0.3885001242160797, + "learning_rate": 0.0002, + "loss": 1.0513, + "step": 1240 + }, + { + "epoch": 1.3683634373289546, + "grad_norm": 0.2987913489341736, + "learning_rate": 0.0002, + "loss": 1.0611, + "step": 1250 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 0.42070427536964417, + "learning_rate": 0.0002, + "loss": 1.109, + "step": 1260 + }, + { + "epoch": 1.3902572523262178, + "grad_norm": 0.5957782864570618, + "learning_rate": 0.0002, + "loss": 1.1092, + "step": 1270 + }, + { + "epoch": 1.4012041598248495, + "grad_norm": 0.32898882031440735, + "learning_rate": 0.0002, + "loss": 1.0554, + "step": 1280 + }, + { + "epoch": 1.4121510673234812, + "grad_norm": 0.27624452114105225, + "learning_rate": 0.0002, + "loss": 0.909, + "step": 1290 + }, + { + "epoch": 1.4230979748221126, + "grad_norm": 0.49570828676223755, + "learning_rate": 0.0002, + "loss": 1.024, + "step": 1300 + }, + { + "epoch": 1.4340448823207443, + "grad_norm": 0.26191383600234985, + "learning_rate": 0.0002, + "loss": 1.0471, + "step": 1310 + }, + { + "epoch": 1.444991789819376, + "grad_norm": 0.35664042830467224, + "learning_rate": 0.0002, + "loss": 1.0788, + "step": 1320 + }, + { + "epoch": 1.4559386973180077, + "grad_norm": 0.45126354694366455, + "learning_rate": 0.0002, + "loss": 1.0108, + "step": 1330 + }, + { + "epoch": 1.4668856048166394, + "grad_norm": 0.37318357825279236, + "learning_rate": 0.0002, + "loss": 0.9571, + "step": 1340 + }, + { + "epoch": 1.477832512315271, + "grad_norm": 0.6428970098495483, + "learning_rate": 0.0002, + "loss": 1.0507, + "step": 1350 + }, + { + "epoch": 1.4887794198139026, + "grad_norm": 0.43256187438964844, + "learning_rate": 0.0002, + "loss": 1.0367, + "step": 1360 + }, + { + "epoch": 1.4997263273125343, + "grad_norm": 0.5343793630599976, + "learning_rate": 0.0002, + "loss": 1.1321, + "step": 1370 + }, + { + "epoch": 1.5106732348111658, + "grad_norm": 0.315437376499176, + "learning_rate": 0.0002, + "loss": 1.1054, + "step": 1380 + }, + { + "epoch": 1.5216201423097975, + "grad_norm": 0.41561153531074524, + "learning_rate": 0.0002, + "loss": 0.8916, + "step": 1390 + }, + { + "epoch": 1.5325670498084292, + "grad_norm": 0.3201070725917816, + "learning_rate": 0.0002, + "loss": 1.0391, + "step": 1400 + }, + { + "epoch": 1.5435139573070606, + "grad_norm": 0.505537211894989, + "learning_rate": 0.0002, + "loss": 1.14, + "step": 1410 + }, + { + "epoch": 1.5544608648056923, + "grad_norm": 0.3747410178184509, + "learning_rate": 0.0002, + "loss": 1.0775, + "step": 1420 + }, + { + "epoch": 1.565407772304324, + "grad_norm": 0.49385908246040344, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 1430 + }, + { + "epoch": 1.5763546798029555, + "grad_norm": 0.49831628799438477, + "learning_rate": 0.0002, + "loss": 1.1182, + "step": 1440 + }, + { + "epoch": 1.5873015873015874, + "grad_norm": 0.372127890586853, + "learning_rate": 0.0002, + "loss": 1.0079, + "step": 1450 + }, + { + "epoch": 1.598248494800219, + "grad_norm": 0.40070840716362, + "learning_rate": 0.0002, + "loss": 0.9931, + "step": 1460 + }, + { + "epoch": 1.6091954022988506, + "grad_norm": 0.34907400608062744, + "learning_rate": 0.0002, + "loss": 0.8954, + "step": 1470 + }, + { + "epoch": 1.6201423097974823, + "grad_norm": 0.4632418751716614, + "learning_rate": 0.0002, + "loss": 0.9743, + "step": 1480 + }, + { + "epoch": 1.6310892172961138, + "grad_norm": 0.40164515376091003, + "learning_rate": 0.0002, + "loss": 1.0103, + "step": 1490 + }, + { + "epoch": 1.6420361247947455, + "grad_norm": 0.3214994966983795, + "learning_rate": 0.0002, + "loss": 0.9523, + "step": 1500 + }, + { + "epoch": 1.6529830322933772, + "grad_norm": 0.3727897107601166, + "learning_rate": 0.0002, + "loss": 1.0161, + "step": 1510 + }, + { + "epoch": 1.6639299397920086, + "grad_norm": 0.3817640542984009, + "learning_rate": 0.0002, + "loss": 1.0443, + "step": 1520 + }, + { + "epoch": 1.6748768472906403, + "grad_norm": 0.5592136979103088, + "learning_rate": 0.0002, + "loss": 1.0511, + "step": 1530 + }, + { + "epoch": 1.685823754789272, + "grad_norm": 0.44636598229408264, + "learning_rate": 0.0002, + "loss": 0.9682, + "step": 1540 + }, + { + "epoch": 1.6967706622879035, + "grad_norm": 0.40441709756851196, + "learning_rate": 0.0002, + "loss": 1.033, + "step": 1550 + }, + { + "epoch": 1.7077175697865354, + "grad_norm": 0.3243522644042969, + "learning_rate": 0.0002, + "loss": 1.0857, + "step": 1560 + }, + { + "epoch": 1.718664477285167, + "grad_norm": 0.34277570247650146, + "learning_rate": 0.0002, + "loss": 0.9482, + "step": 1570 + }, + { + "epoch": 1.7296113847837986, + "grad_norm": 0.3279995024204254, + "learning_rate": 0.0002, + "loss": 0.974, + "step": 1580 + }, + { + "epoch": 1.7405582922824303, + "grad_norm": 0.41968777775764465, + "learning_rate": 0.0002, + "loss": 0.9414, + "step": 1590 + }, + { + "epoch": 1.7515051997810618, + "grad_norm": 0.39464613795280457, + "learning_rate": 0.0002, + "loss": 0.9768, + "step": 1600 + }, + { + "epoch": 1.7624521072796935, + "grad_norm": 0.3839009404182434, + "learning_rate": 0.0002, + "loss": 1.0347, + "step": 1610 + }, + { + "epoch": 1.7733990147783252, + "grad_norm": 0.3250715434551239, + "learning_rate": 0.0002, + "loss": 0.9195, + "step": 1620 + }, + { + "epoch": 1.7843459222769567, + "grad_norm": 0.5166561007499695, + "learning_rate": 0.0002, + "loss": 1.0049, + "step": 1630 + }, + { + "epoch": 1.7952928297755884, + "grad_norm": 0.4115183353424072, + "learning_rate": 0.0002, + "loss": 1.0364, + "step": 1640 + }, + { + "epoch": 1.80623973727422, + "grad_norm": 0.373780220746994, + "learning_rate": 0.0002, + "loss": 1.0248, + "step": 1650 + }, + { + "epoch": 1.8171866447728515, + "grad_norm": 0.49697014689445496, + "learning_rate": 0.0002, + "loss": 1.0984, + "step": 1660 + }, + { + "epoch": 1.8281335522714834, + "grad_norm": 1.0308938026428223, + "learning_rate": 0.0002, + "loss": 1.0089, + "step": 1670 + }, + { + "epoch": 1.839080459770115, + "grad_norm": 0.4851366877555847, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 1680 + }, + { + "epoch": 1.8500273672687466, + "grad_norm": 0.3262481391429901, + "learning_rate": 0.0002, + "loss": 0.9533, + "step": 1690 + }, + { + "epoch": 1.8609742747673783, + "grad_norm": 0.6904496550559998, + "learning_rate": 0.0002, + "loss": 1.048, + "step": 1700 + }, + { + "epoch": 1.8719211822660098, + "grad_norm": 0.49789851903915405, + "learning_rate": 0.0002, + "loss": 1.0577, + "step": 1710 + }, + { + "epoch": 1.8828680897646415, + "grad_norm": 0.3035794198513031, + "learning_rate": 0.0002, + "loss": 1.0258, + "step": 1720 + }, + { + "epoch": 1.8938149972632732, + "grad_norm": 0.4588414430618286, + "learning_rate": 0.0002, + "loss": 0.9916, + "step": 1730 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 0.4313034117221832, + "learning_rate": 0.0002, + "loss": 0.9526, + "step": 1740 + }, + { + "epoch": 1.9157088122605364, + "grad_norm": 0.38562044501304626, + "learning_rate": 0.0002, + "loss": 1.0857, + "step": 1750 + }, + { + "epoch": 1.926655719759168, + "grad_norm": 0.46947410702705383, + "learning_rate": 0.0002, + "loss": 0.9763, + "step": 1760 + }, + { + "epoch": 1.9376026272577995, + "grad_norm": 0.3848404884338379, + "learning_rate": 0.0002, + "loss": 0.9967, + "step": 1770 + }, + { + "epoch": 1.9485495347564314, + "grad_norm": 0.30422744154930115, + "learning_rate": 0.0002, + "loss": 1.0474, + "step": 1780 + }, + { + "epoch": 1.959496442255063, + "grad_norm": 0.41100990772247314, + "learning_rate": 0.0002, + "loss": 1.022, + "step": 1790 + }, + { + "epoch": 1.9704433497536946, + "grad_norm": 0.3492335081100464, + "learning_rate": 0.0002, + "loss": 1.003, + "step": 1800 + }, + { + "epoch": 1.9813902572523263, + "grad_norm": 0.364577978849411, + "learning_rate": 0.0002, + "loss": 0.9891, + "step": 1810 + }, + { + "epoch": 1.9923371647509578, + "grad_norm": 0.4312075674533844, + "learning_rate": 0.0002, + "loss": 1.0699, + "step": 1820 + }, + { + "epoch": 2.0, + "eval_loss": 1.14472496509552, + "eval_runtime": 46.0786, + "eval_samples_per_second": 9.462, + "eval_steps_per_second": 1.194, + "step": 1827 + }, + { + "epoch": 2.0032840722495897, + "grad_norm": 0.5989689230918884, + "learning_rate": 0.0002, + "loss": 1.0218, + "step": 1830 + }, + { + "epoch": 2.014230979748221, + "grad_norm": 0.49720922112464905, + "learning_rate": 0.0002, + "loss": 0.9384, + "step": 1840 + }, + { + "epoch": 2.0251778872468527, + "grad_norm": 0.42675456404685974, + "learning_rate": 0.0002, + "loss": 0.8482, + "step": 1850 + }, + { + "epoch": 2.0361247947454846, + "grad_norm": 0.4637208580970764, + "learning_rate": 0.0002, + "loss": 0.8487, + "step": 1860 + }, + { + "epoch": 2.047071702244116, + "grad_norm": 0.8329976797103882, + "learning_rate": 0.0002, + "loss": 0.8212, + "step": 1870 + }, + { + "epoch": 2.0580186097427475, + "grad_norm": 0.7869427800178528, + "learning_rate": 0.0002, + "loss": 0.9547, + "step": 1880 + }, + { + "epoch": 2.0689655172413794, + "grad_norm": 0.4927455186843872, + "learning_rate": 0.0002, + "loss": 0.9351, + "step": 1890 + }, + { + "epoch": 2.079912424740011, + "grad_norm": 0.6264246702194214, + "learning_rate": 0.0002, + "loss": 0.8652, + "step": 1900 + }, + { + "epoch": 2.0908593322386424, + "grad_norm": 1.1164122819900513, + "learning_rate": 0.0002, + "loss": 0.9095, + "step": 1910 + }, + { + "epoch": 2.1018062397372743, + "grad_norm": 0.5283981561660767, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 1920 + }, + { + "epoch": 2.112753147235906, + "grad_norm": 0.45621731877326965, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 1930 + }, + { + "epoch": 2.1237000547345373, + "grad_norm": 1.381791591644287, + "learning_rate": 0.0002, + "loss": 0.9184, + "step": 1940 + }, + { + "epoch": 2.134646962233169, + "grad_norm": 0.5151259899139404, + "learning_rate": 0.0002, + "loss": 0.9006, + "step": 1950 + }, + { + "epoch": 2.1455938697318007, + "grad_norm": 0.9806339740753174, + "learning_rate": 0.0002, + "loss": 0.8436, + "step": 1960 + }, + { + "epoch": 2.1565407772304326, + "grad_norm": 0.4734154939651489, + "learning_rate": 0.0002, + "loss": 0.8749, + "step": 1970 + }, + { + "epoch": 2.167487684729064, + "grad_norm": 0.9553168416023254, + "learning_rate": 0.0002, + "loss": 0.9172, + "step": 1980 + }, + { + "epoch": 2.1784345922276955, + "grad_norm": 0.5895838141441345, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1990 + }, + { + "epoch": 2.1893814997263275, + "grad_norm": 0.4488855302333832, + "learning_rate": 0.0002, + "loss": 0.7841, + "step": 2000 + }, + { + "epoch": 2.200328407224959, + "grad_norm": 1.0760235786437988, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 2010 + }, + { + "epoch": 2.2112753147235904, + "grad_norm": 0.5038785338401794, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 2020 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.59819495677948, + "learning_rate": 0.0002, + "loss": 0.8973, + "step": 2030 + }, + { + "epoch": 2.233169129720854, + "grad_norm": 0.5012075304985046, + "learning_rate": 0.0002, + "loss": 0.8009, + "step": 2040 + }, + { + "epoch": 2.2441160372194853, + "grad_norm": 0.44978439807891846, + "learning_rate": 0.0002, + "loss": 0.9087, + "step": 2050 + }, + { + "epoch": 2.255062944718117, + "grad_norm": 0.5350462198257446, + "learning_rate": 0.0002, + "loss": 0.823, + "step": 2060 + }, + { + "epoch": 2.2660098522167487, + "grad_norm": 0.6020669937133789, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 2070 + }, + { + "epoch": 2.2769567597153806, + "grad_norm": 0.5246821045875549, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 2080 + }, + { + "epoch": 2.287903667214012, + "grad_norm": 0.5711268782615662, + "learning_rate": 0.0002, + "loss": 0.8984, + "step": 2090 + }, + { + "epoch": 2.2988505747126435, + "grad_norm": 0.617317259311676, + "learning_rate": 0.0002, + "loss": 0.9093, + "step": 2100 + }, + { + "epoch": 2.3097974822112755, + "grad_norm": 0.8608947396278381, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 2110 + }, + { + "epoch": 2.320744389709907, + "grad_norm": 0.4739076793193817, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 2120 + }, + { + "epoch": 2.3316912972085384, + "grad_norm": 0.5538856983184814, + "learning_rate": 0.0002, + "loss": 0.84, + "step": 2130 + }, + { + "epoch": 2.3426382047071703, + "grad_norm": 0.6064935326576233, + "learning_rate": 0.0002, + "loss": 0.8994, + "step": 2140 + }, + { + "epoch": 2.353585112205802, + "grad_norm": 0.5019068121910095, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 2150 + }, + { + "epoch": 2.3645320197044333, + "grad_norm": 0.45340514183044434, + "learning_rate": 0.0002, + "loss": 0.9576, + "step": 2160 + }, + { + "epoch": 2.375478927203065, + "grad_norm": 0.7347203493118286, + "learning_rate": 0.0002, + "loss": 0.9254, + "step": 2170 + }, + { + "epoch": 2.3864258347016967, + "grad_norm": 0.46922534704208374, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 2180 + }, + { + "epoch": 2.3973727422003286, + "grad_norm": 0.5507845878601074, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 2190 + }, + { + "epoch": 2.40831964969896, + "grad_norm": 0.5621911883354187, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 2200 + }, + { + "epoch": 2.4192665571975915, + "grad_norm": 0.5023514032363892, + "learning_rate": 0.0002, + "loss": 0.8569, + "step": 2210 + }, + { + "epoch": 2.4302134646962235, + "grad_norm": 0.6124861240386963, + "learning_rate": 0.0002, + "loss": 0.8355, + "step": 2220 + }, + { + "epoch": 2.441160372194855, + "grad_norm": 0.49614205956459045, + "learning_rate": 0.0002, + "loss": 0.885, + "step": 2230 + }, + { + "epoch": 2.4521072796934864, + "grad_norm": 0.6477900743484497, + "learning_rate": 0.0002, + "loss": 0.8008, + "step": 2240 + }, + { + "epoch": 2.4630541871921183, + "grad_norm": 0.5868843793869019, + "learning_rate": 0.0002, + "loss": 0.8622, + "step": 2250 + }, + { + "epoch": 2.47400109469075, + "grad_norm": 0.4364610016345978, + "learning_rate": 0.0002, + "loss": 0.8498, + "step": 2260 + }, + { + "epoch": 2.4849480021893813, + "grad_norm": 0.5792964696884155, + "learning_rate": 0.0002, + "loss": 0.8378, + "step": 2270 + }, + { + "epoch": 2.495894909688013, + "grad_norm": 0.5421269536018372, + "learning_rate": 0.0002, + "loss": 0.8743, + "step": 2280 + }, + { + "epoch": 2.5068418171866447, + "grad_norm": 0.5525493025779724, + "learning_rate": 0.0002, + "loss": 0.9637, + "step": 2290 + }, + { + "epoch": 2.5177887246852766, + "grad_norm": 0.6463850140571594, + "learning_rate": 0.0002, + "loss": 0.8075, + "step": 2300 + }, + { + "epoch": 2.528735632183908, + "grad_norm": 0.6861311793327332, + "learning_rate": 0.0002, + "loss": 0.8591, + "step": 2310 + }, + { + "epoch": 2.5396825396825395, + "grad_norm": 0.5563555359840393, + "learning_rate": 0.0002, + "loss": 0.9287, + "step": 2320 + }, + { + "epoch": 2.5506294471811715, + "grad_norm": 0.5721169114112854, + "learning_rate": 0.0002, + "loss": 0.945, + "step": 2330 + }, + { + "epoch": 2.561576354679803, + "grad_norm": 0.5258274674415588, + "learning_rate": 0.0002, + "loss": 0.8271, + "step": 2340 + }, + { + "epoch": 2.572523262178435, + "grad_norm": 0.7057380676269531, + "learning_rate": 0.0002, + "loss": 0.8515, + "step": 2350 + }, + { + "epoch": 2.5834701696770663, + "grad_norm": 0.6869027614593506, + "learning_rate": 0.0002, + "loss": 0.8615, + "step": 2360 + }, + { + "epoch": 2.594417077175698, + "grad_norm": 0.4960809648036957, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 2370 + }, + { + "epoch": 2.6053639846743293, + "grad_norm": 0.9288380146026611, + "learning_rate": 0.0002, + "loss": 0.8476, + "step": 2380 + }, + { + "epoch": 2.616310892172961, + "grad_norm": 0.3765334188938141, + "learning_rate": 0.0002, + "loss": 0.873, + "step": 2390 + }, + { + "epoch": 2.6272577996715927, + "grad_norm": 0.7487865686416626, + "learning_rate": 0.0002, + "loss": 0.8764, + "step": 2400 + }, + { + "epoch": 2.6382047071702246, + "grad_norm": 0.6141156554222107, + "learning_rate": 0.0002, + "loss": 0.7577, + "step": 2410 + }, + { + "epoch": 2.649151614668856, + "grad_norm": 0.8420507907867432, + "learning_rate": 0.0002, + "loss": 0.8534, + "step": 2420 + }, + { + "epoch": 2.6600985221674875, + "grad_norm": 0.53386390209198, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 2430 + }, + { + "epoch": 2.6710454296661195, + "grad_norm": 0.5520607233047485, + "learning_rate": 0.0002, + "loss": 0.8486, + "step": 2440 + }, + { + "epoch": 2.681992337164751, + "grad_norm": 0.5337599515914917, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 2450 + }, + { + "epoch": 2.692939244663383, + "grad_norm": 0.48790836334228516, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 2460 + }, + { + "epoch": 2.7038861521620143, + "grad_norm": 0.8287786245346069, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 2470 + }, + { + "epoch": 2.714833059660646, + "grad_norm": 0.5876168608665466, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 2480 + }, + { + "epoch": 2.7257799671592773, + "grad_norm": 0.5206760764122009, + "learning_rate": 0.0002, + "loss": 0.8773, + "step": 2490 + }, + { + "epoch": 2.736726874657909, + "grad_norm": 0.5619136691093445, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 2500 + }, + { + "epoch": 2.7476737821565407, + "grad_norm": 0.5614883899688721, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 2510 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 0.6157700419425964, + "learning_rate": 0.0002, + "loss": 0.8817, + "step": 2520 + }, + { + "epoch": 2.769567597153804, + "grad_norm": 0.5529953837394714, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 2530 + }, + { + "epoch": 2.7805145046524355, + "grad_norm": 0.6731224060058594, + "learning_rate": 0.0002, + "loss": 0.8881, + "step": 2540 + }, + { + "epoch": 2.7914614121510675, + "grad_norm": 0.6960386633872986, + "learning_rate": 0.0002, + "loss": 0.9125, + "step": 2550 + }, + { + "epoch": 2.802408319649699, + "grad_norm": 0.5203493237495422, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 2560 + }, + { + "epoch": 2.813355227148331, + "grad_norm": 1.036837100982666, + "learning_rate": 0.0002, + "loss": 0.7951, + "step": 2570 + }, + { + "epoch": 2.8243021346469623, + "grad_norm": 0.6125805377960205, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 2580 + }, + { + "epoch": 2.835249042145594, + "grad_norm": 0.6298092603683472, + "learning_rate": 0.0002, + "loss": 0.8939, + "step": 2590 + }, + { + "epoch": 2.8461959496442253, + "grad_norm": 0.5882203578948975, + "learning_rate": 0.0002, + "loss": 0.8845, + "step": 2600 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.8619399666786194, + "learning_rate": 0.0002, + "loss": 0.8921, + "step": 2610 + }, + { + "epoch": 2.8680897646414887, + "grad_norm": 0.4722687304019928, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 2620 + }, + { + "epoch": 2.8790366721401206, + "grad_norm": 0.47399574518203735, + "learning_rate": 0.0002, + "loss": 0.9147, + "step": 2630 + }, + { + "epoch": 2.889983579638752, + "grad_norm": 0.5639172792434692, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 2640 + }, + { + "epoch": 2.9009304871373836, + "grad_norm": 0.4676816761493683, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 2650 + }, + { + "epoch": 2.9118773946360155, + "grad_norm": 0.6906291246414185, + "learning_rate": 0.0002, + "loss": 0.8966, + "step": 2660 + }, + { + "epoch": 2.922824302134647, + "grad_norm": 0.4369746148586273, + "learning_rate": 0.0002, + "loss": 0.8966, + "step": 2670 + }, + { + "epoch": 2.933771209633279, + "grad_norm": 0.46423083543777466, + "learning_rate": 0.0002, + "loss": 0.9173, + "step": 2680 + }, + { + "epoch": 2.9447181171319103, + "grad_norm": 0.5700525045394897, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 2690 + }, + { + "epoch": 2.955665024630542, + "grad_norm": 0.6221476793289185, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 2700 + }, + { + "epoch": 2.9666119321291733, + "grad_norm": 0.6102682948112488, + "learning_rate": 0.0002, + "loss": 0.8538, + "step": 2710 + }, + { + "epoch": 2.977558839627805, + "grad_norm": 0.5317878723144531, + "learning_rate": 0.0002, + "loss": 0.9779, + "step": 2720 + }, + { + "epoch": 2.9885057471264367, + "grad_norm": 0.4438510835170746, + "learning_rate": 0.0002, + "loss": 0.9314, + "step": 2730 + }, + { + "epoch": 2.9994526546250686, + "grad_norm": 0.5022130012512207, + "learning_rate": 0.0002, + "loss": 0.8553, + "step": 2740 + }, + { + "epoch": 2.9994526546250686, + "eval_loss": 1.1722838878631592, + "eval_runtime": 46.0829, + "eval_samples_per_second": 9.461, + "eval_steps_per_second": 1.194, + "step": 2740 + }, + { + "epoch": 3.0103995621237, + "grad_norm": 0.6384502053260803, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 2750 + }, + { + "epoch": 3.0213464696223316, + "grad_norm": 0.9928722381591797, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 2760 + }, + { + "epoch": 3.0322933771209635, + "grad_norm": 0.7813051342964172, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 2770 + }, + { + "epoch": 3.043240284619595, + "grad_norm": 1.0202556848526, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 2780 + }, + { + "epoch": 3.0541871921182264, + "grad_norm": 0.7581062316894531, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 2790 + }, + { + "epoch": 3.0651340996168583, + "grad_norm": 0.6252710223197937, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 2800 + }, + { + "epoch": 3.07608100711549, + "grad_norm": 0.7738662958145142, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 2810 + }, + { + "epoch": 3.0870279146141213, + "grad_norm": 0.7381885051727295, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 2820 + }, + { + "epoch": 3.097974822112753, + "grad_norm": 0.9197564721107483, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 2830 + }, + { + "epoch": 3.1089217296113847, + "grad_norm": 1.000976800918579, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 2840 + }, + { + "epoch": 3.1198686371100166, + "grad_norm": 0.7559131383895874, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 2850 + }, + { + "epoch": 3.130815544608648, + "grad_norm": 0.7213780879974365, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 2860 + }, + { + "epoch": 3.1417624521072796, + "grad_norm": 0.945939838886261, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 2870 + }, + { + "epoch": 3.1527093596059115, + "grad_norm": 0.7277454137802124, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 2880 + }, + { + "epoch": 3.163656267104543, + "grad_norm": 0.762026846408844, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 2890 + }, + { + "epoch": 3.1746031746031744, + "grad_norm": 0.6471221446990967, + "learning_rate": 0.0002, + "loss": 0.5332, + "step": 2900 + }, + { + "epoch": 3.1855500821018063, + "grad_norm": 0.6018978357315063, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 2910 + }, + { + "epoch": 3.196496989600438, + "grad_norm": 0.8607320785522461, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 2920 + }, + { + "epoch": 3.2074438970990693, + "grad_norm": 0.8854126334190369, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 2930 + }, + { + "epoch": 3.218390804597701, + "grad_norm": 0.6620870232582092, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 2940 + }, + { + "epoch": 3.2293377120963327, + "grad_norm": 0.7377511858940125, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 2950 + }, + { + "epoch": 3.2402846195949646, + "grad_norm": 0.7803301811218262, + "learning_rate": 0.0002, + "loss": 0.6798, + "step": 2960 + }, + { + "epoch": 3.251231527093596, + "grad_norm": 0.834061861038208, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 2970 + }, + { + "epoch": 3.2621784345922276, + "grad_norm": 0.8496041893959045, + "learning_rate": 0.0002, + "loss": 0.6894, + "step": 2980 + }, + { + "epoch": 3.2731253420908595, + "grad_norm": 0.7967984676361084, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 2990 + }, + { + "epoch": 3.284072249589491, + "grad_norm": 1.0207016468048096, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 3000 + }, + { + "epoch": 3.2950191570881224, + "grad_norm": 0.7855771780014038, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 3010 + }, + { + "epoch": 3.3059660645867543, + "grad_norm": 0.9427546858787537, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 3020 + }, + { + "epoch": 3.316912972085386, + "grad_norm": 0.823542594909668, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 3030 + }, + { + "epoch": 3.3278598795840173, + "grad_norm": 0.9826635122299194, + "learning_rate": 0.0002, + "loss": 0.7109, + "step": 3040 + }, + { + "epoch": 3.338806787082649, + "grad_norm": 0.7259827852249146, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 3050 + }, + { + "epoch": 3.3497536945812807, + "grad_norm": 0.7774739861488342, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 3060 + }, + { + "epoch": 3.3607006020799126, + "grad_norm": 0.7394293546676636, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 3070 + }, + { + "epoch": 3.371647509578544, + "grad_norm": 0.9017578959465027, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 3080 + }, + { + "epoch": 3.3825944170771756, + "grad_norm": 0.7451054453849792, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 3090 + }, + { + "epoch": 3.3935413245758075, + "grad_norm": 0.7321506142616272, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 3100 + }, + { + "epoch": 3.404488232074439, + "grad_norm": 0.6721828579902649, + "learning_rate": 0.0002, + "loss": 0.6555, + "step": 3110 + }, + { + "epoch": 3.4154351395730704, + "grad_norm": 0.774022102355957, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 3120 + }, + { + "epoch": 3.4263820470717024, + "grad_norm": 0.9143537282943726, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 3130 + }, + { + "epoch": 3.437328954570334, + "grad_norm": 1.226087212562561, + "learning_rate": 0.0002, + "loss": 0.6899, + "step": 3140 + }, + { + "epoch": 3.4482758620689653, + "grad_norm": 0.7545496225357056, + "learning_rate": 0.0002, + "loss": 0.6719, + "step": 3150 + }, + { + "epoch": 3.4592227695675972, + "grad_norm": 0.6515635848045349, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 3160 + }, + { + "epoch": 3.4701696770662287, + "grad_norm": 0.9297090172767639, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 3170 + }, + { + "epoch": 3.4811165845648606, + "grad_norm": 1.0130730867385864, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 3180 + }, + { + "epoch": 3.492063492063492, + "grad_norm": 0.7654589414596558, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 3190 + }, + { + "epoch": 3.5030103995621236, + "grad_norm": 0.9954977631568909, + "learning_rate": 0.0002, + "loss": 0.7401, + "step": 3200 + }, + { + "epoch": 3.5139573070607555, + "grad_norm": 0.6027487516403198, + "learning_rate": 0.0002, + "loss": 0.6661, + "step": 3210 + }, + { + "epoch": 3.524904214559387, + "grad_norm": 0.741770327091217, + "learning_rate": 0.0002, + "loss": 0.6963, + "step": 3220 + }, + { + "epoch": 3.535851122058019, + "grad_norm": 1.0534909963607788, + "learning_rate": 0.0002, + "loss": 0.8112, + "step": 3230 + }, + { + "epoch": 3.5467980295566504, + "grad_norm": 0.937772274017334, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 3240 + }, + { + "epoch": 3.557744937055282, + "grad_norm": 0.8504213690757751, + "learning_rate": 0.0002, + "loss": 0.6681, + "step": 3250 + }, + { + "epoch": 3.5686918445539133, + "grad_norm": 0.7755007147789001, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 3260 + }, + { + "epoch": 3.5796387520525452, + "grad_norm": 1.0193358659744263, + "learning_rate": 0.0002, + "loss": 0.6213, + "step": 3270 + }, + { + "epoch": 3.5905856595511767, + "grad_norm": 0.8440536856651306, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 3280 + }, + { + "epoch": 3.6015325670498086, + "grad_norm": 0.6195939183235168, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 3290 + }, + { + "epoch": 3.61247947454844, + "grad_norm": 0.8608590960502625, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 3300 + }, + { + "epoch": 3.6234263820470716, + "grad_norm": 0.6772327423095703, + "learning_rate": 0.0002, + "loss": 0.7301, + "step": 3310 + }, + { + "epoch": 3.6343732895457035, + "grad_norm": 0.8031839728355408, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 3320 + }, + { + "epoch": 3.645320197044335, + "grad_norm": 0.6080502271652222, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 3330 + }, + { + "epoch": 3.656267104542967, + "grad_norm": 0.8007240891456604, + "learning_rate": 0.0002, + "loss": 0.7431, + "step": 3340 + }, + { + "epoch": 3.6672140120415984, + "grad_norm": 0.8060704469680786, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 3350 + }, + { + "epoch": 3.67816091954023, + "grad_norm": 0.7547586560249329, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 3360 + }, + { + "epoch": 3.6891078270388613, + "grad_norm": 0.686851978302002, + "learning_rate": 0.0002, + "loss": 0.7066, + "step": 3370 + }, + { + "epoch": 3.7000547345374932, + "grad_norm": 0.9429075717926025, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 3380 + }, + { + "epoch": 3.7110016420361247, + "grad_norm": 0.7283591032028198, + "learning_rate": 0.0002, + "loss": 0.6673, + "step": 3390 + }, + { + "epoch": 3.7219485495347566, + "grad_norm": 0.8323085904121399, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 3400 + }, + { + "epoch": 3.732895457033388, + "grad_norm": 0.8529590964317322, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 3410 + }, + { + "epoch": 3.7438423645320196, + "grad_norm": 0.731752872467041, + "learning_rate": 0.0002, + "loss": 0.6555, + "step": 3420 + }, + { + "epoch": 3.7547892720306515, + "grad_norm": 0.8572278618812561, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 3430 + }, + { + "epoch": 3.765736179529283, + "grad_norm": 0.7408691048622131, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 3440 + }, + { + "epoch": 3.776683087027915, + "grad_norm": 0.7470445036888123, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 3450 + }, + { + "epoch": 3.7876299945265464, + "grad_norm": 0.6806244254112244, + "learning_rate": 0.0002, + "loss": 0.7241, + "step": 3460 + }, + { + "epoch": 3.798576902025178, + "grad_norm": 0.9129069447517395, + "learning_rate": 0.0002, + "loss": 0.7739, + "step": 3470 + }, + { + "epoch": 3.8095238095238093, + "grad_norm": 0.8717501759529114, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 3480 + }, + { + "epoch": 3.8204707170224412, + "grad_norm": 0.6761979460716248, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 3490 + }, + { + "epoch": 3.8314176245210727, + "grad_norm": 1.0054380893707275, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 3500 + }, + { + "epoch": 3.8423645320197046, + "grad_norm": 1.1224009990692139, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 3510 + }, + { + "epoch": 3.853311439518336, + "grad_norm": 0.8997692465782166, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 3520 + }, + { + "epoch": 3.8642583470169676, + "grad_norm": 1.0086902379989624, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 3530 + }, + { + "epoch": 3.8752052545155995, + "grad_norm": 0.772739589214325, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 3540 + }, + { + "epoch": 3.886152162014231, + "grad_norm": 1.211774230003357, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 3550 + }, + { + "epoch": 3.897099069512863, + "grad_norm": 0.9572356939315796, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 3560 + }, + { + "epoch": 3.9080459770114944, + "grad_norm": 0.7887842655181885, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 3570 + }, + { + "epoch": 3.918992884510126, + "grad_norm": 0.7308389544487, + "learning_rate": 0.0002, + "loss": 0.7576, + "step": 3580 + }, + { + "epoch": 3.9299397920087573, + "grad_norm": 1.0182650089263916, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 3590 + }, + { + "epoch": 3.9408866995073892, + "grad_norm": 0.8000147342681885, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 3600 + }, + { + "epoch": 3.9518336070060207, + "grad_norm": 0.7385728359222412, + "learning_rate": 0.0002, + "loss": 0.6244, + "step": 3610 + }, + { + "epoch": 3.9627805145046526, + "grad_norm": 0.9233261942863464, + "learning_rate": 0.0002, + "loss": 0.6718, + "step": 3620 + }, + { + "epoch": 3.973727422003284, + "grad_norm": 0.8486751914024353, + "learning_rate": 0.0002, + "loss": 0.6508, + "step": 3630 + }, + { + "epoch": 3.9846743295019156, + "grad_norm": 0.7593663334846497, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 3640 + }, + { + "epoch": 3.9956212370005475, + "grad_norm": 0.7885415554046631, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 3650 + }, + { + "epoch": 4.0, + "eval_loss": 1.250312328338623, + "eval_runtime": 46.0842, + "eval_samples_per_second": 9.461, + "eval_steps_per_second": 1.193, + "step": 3654 + }, + { + "epoch": 4.006568144499179, + "grad_norm": 0.6591703295707703, + "learning_rate": 0.0002, + "loss": 0.5547, + "step": 3660 + }, + { + "epoch": 4.017515051997811, + "grad_norm": 1.36927330493927, + "learning_rate": 0.0002, + "loss": 0.5301, + "step": 3670 + }, + { + "epoch": 4.028461959496442, + "grad_norm": 0.8106328845024109, + "learning_rate": 0.0002, + "loss": 0.4466, + "step": 3680 + }, + { + "epoch": 4.039408866995074, + "grad_norm": 0.7592712044715881, + "learning_rate": 0.0002, + "loss": 0.4861, + "step": 3690 + }, + { + "epoch": 4.050355774493705, + "grad_norm": 0.9518909454345703, + "learning_rate": 0.0002, + "loss": 0.5103, + "step": 3700 + }, + { + "epoch": 4.061302681992337, + "grad_norm": 0.7805967330932617, + "learning_rate": 0.0002, + "loss": 0.4638, + "step": 3710 + }, + { + "epoch": 4.072249589490969, + "grad_norm": 1.3146334886550903, + "learning_rate": 0.0002, + "loss": 0.4556, + "step": 3720 + }, + { + "epoch": 4.083196496989601, + "grad_norm": 1.1611138582229614, + "learning_rate": 0.0002, + "loss": 0.5635, + "step": 3730 + }, + { + "epoch": 4.094143404488232, + "grad_norm": 0.8173232078552246, + "learning_rate": 0.0002, + "loss": 0.3845, + "step": 3740 + }, + { + "epoch": 4.105090311986864, + "grad_norm": 0.7848323583602905, + "learning_rate": 0.0002, + "loss": 0.4911, + "step": 3750 + }, + { + "epoch": 4.116037219485495, + "grad_norm": 1.3183201551437378, + "learning_rate": 0.0002, + "loss": 0.4519, + "step": 3760 + }, + { + "epoch": 4.1269841269841265, + "grad_norm": 1.1936529874801636, + "learning_rate": 0.0002, + "loss": 0.5083, + "step": 3770 + }, + { + "epoch": 4.137931034482759, + "grad_norm": 1.1078993082046509, + "learning_rate": 0.0002, + "loss": 0.5208, + "step": 3780 + }, + { + "epoch": 4.14887794198139, + "grad_norm": 1.107743263244629, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 3790 + }, + { + "epoch": 4.159824849480022, + "grad_norm": 0.7801875472068787, + "learning_rate": 0.0002, + "loss": 0.5112, + "step": 3800 + }, + { + "epoch": 4.170771756978653, + "grad_norm": 1.1328117847442627, + "learning_rate": 0.0002, + "loss": 0.4896, + "step": 3810 + }, + { + "epoch": 4.181718664477285, + "grad_norm": 1.4232193231582642, + "learning_rate": 0.0002, + "loss": 0.5645, + "step": 3820 + }, + { + "epoch": 4.192665571975917, + "grad_norm": 1.557416558265686, + "learning_rate": 0.0002, + "loss": 0.5049, + "step": 3830 + }, + { + "epoch": 4.203612479474549, + "grad_norm": 1.042923092842102, + "learning_rate": 0.0002, + "loss": 0.4863, + "step": 3840 + }, + { + "epoch": 4.21455938697318, + "grad_norm": 1.1801949739456177, + "learning_rate": 0.0002, + "loss": 0.3751, + "step": 3850 + }, + { + "epoch": 4.225506294471812, + "grad_norm": 0.9273753762245178, + "learning_rate": 0.0002, + "loss": 0.5063, + "step": 3860 + }, + { + "epoch": 4.236453201970443, + "grad_norm": 0.7681763768196106, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 3870 + }, + { + "epoch": 4.2474001094690745, + "grad_norm": 0.9840841293334961, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 3880 + }, + { + "epoch": 4.258347016967707, + "grad_norm": 1.0290725231170654, + "learning_rate": 0.0002, + "loss": 0.4648, + "step": 3890 + }, + { + "epoch": 4.269293924466338, + "grad_norm": 0.8059597611427307, + "learning_rate": 0.0002, + "loss": 0.4288, + "step": 3900 + }, + { + "epoch": 4.28024083196497, + "grad_norm": 0.9847467541694641, + "learning_rate": 0.0002, + "loss": 0.5103, + "step": 3910 + }, + { + "epoch": 4.291187739463601, + "grad_norm": 1.344044804573059, + "learning_rate": 0.0002, + "loss": 0.4952, + "step": 3920 + }, + { + "epoch": 4.302134646962233, + "grad_norm": 0.9174224138259888, + "learning_rate": 0.0002, + "loss": 0.4966, + "step": 3930 + }, + { + "epoch": 4.313081554460865, + "grad_norm": 1.1199711561203003, + "learning_rate": 0.0002, + "loss": 0.4944, + "step": 3940 + }, + { + "epoch": 4.324028461959497, + "grad_norm": 1.0120296478271484, + "learning_rate": 0.0002, + "loss": 0.4641, + "step": 3950 + }, + { + "epoch": 4.334975369458128, + "grad_norm": 1.091811180114746, + "learning_rate": 0.0002, + "loss": 0.4723, + "step": 3960 + }, + { + "epoch": 4.34592227695676, + "grad_norm": 1.0332133769989014, + "learning_rate": 0.0002, + "loss": 0.4627, + "step": 3970 + }, + { + "epoch": 4.356869184455391, + "grad_norm": 1.0785295963287354, + "learning_rate": 0.0002, + "loss": 0.4646, + "step": 3980 + }, + { + "epoch": 4.3678160919540225, + "grad_norm": 1.0506969690322876, + "learning_rate": 0.0002, + "loss": 0.4909, + "step": 3990 + }, + { + "epoch": 4.378762999452655, + "grad_norm": 1.047560691833496, + "learning_rate": 0.0002, + "loss": 0.4776, + "step": 4000 + }, + { + "epoch": 4.389709906951286, + "grad_norm": 0.9348800778388977, + "learning_rate": 0.0002, + "loss": 0.4549, + "step": 4010 + }, + { + "epoch": 4.400656814449918, + "grad_norm": 1.1563059091567993, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 4020 + }, + { + "epoch": 4.411603721948549, + "grad_norm": 1.001470923423767, + "learning_rate": 0.0002, + "loss": 0.4952, + "step": 4030 + }, + { + "epoch": 4.422550629447181, + "grad_norm": 1.309012532234192, + "learning_rate": 0.0002, + "loss": 0.4972, + "step": 4040 + }, + { + "epoch": 4.433497536945813, + "grad_norm": 0.7338925004005432, + "learning_rate": 0.0002, + "loss": 0.5078, + "step": 4050 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 1.0398834943771362, + "learning_rate": 0.0002, + "loss": 0.4632, + "step": 4060 + }, + { + "epoch": 4.455391351943076, + "grad_norm": 0.9728689193725586, + "learning_rate": 0.0002, + "loss": 0.6285, + "step": 4070 + }, + { + "epoch": 4.466338259441708, + "grad_norm": 1.247475028038025, + "learning_rate": 0.0002, + "loss": 0.4778, + "step": 4080 + }, + { + "epoch": 4.477285166940339, + "grad_norm": 1.1084578037261963, + "learning_rate": 0.0002, + "loss": 0.4813, + "step": 4090 + }, + { + "epoch": 4.4882320744389705, + "grad_norm": 1.1619318723678589, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 4100 + }, + { + "epoch": 4.499178981937603, + "grad_norm": 1.3456498384475708, + "learning_rate": 0.0002, + "loss": 0.5207, + "step": 4110 + }, + { + "epoch": 4.510125889436234, + "grad_norm": 0.9372991323471069, + "learning_rate": 0.0002, + "loss": 0.4876, + "step": 4120 + }, + { + "epoch": 4.521072796934866, + "grad_norm": 1.0071815252304077, + "learning_rate": 0.0002, + "loss": 0.5456, + "step": 4130 + }, + { + "epoch": 4.532019704433497, + "grad_norm": 1.190344214439392, + "learning_rate": 0.0002, + "loss": 0.5589, + "step": 4140 + }, + { + "epoch": 4.542966611932129, + "grad_norm": 0.9480887055397034, + "learning_rate": 0.0002, + "loss": 0.4852, + "step": 4150 + }, + { + "epoch": 4.553913519430761, + "grad_norm": 1.0252189636230469, + "learning_rate": 0.0002, + "loss": 0.5229, + "step": 4160 + }, + { + "epoch": 4.564860426929393, + "grad_norm": 0.7142013311386108, + "learning_rate": 0.0002, + "loss": 0.5253, + "step": 4170 + }, + { + "epoch": 4.575807334428024, + "grad_norm": 0.8937426805496216, + "learning_rate": 0.0002, + "loss": 0.4861, + "step": 4180 + }, + { + "epoch": 4.586754241926656, + "grad_norm": 0.8885005116462708, + "learning_rate": 0.0002, + "loss": 0.4773, + "step": 4190 + }, + { + "epoch": 4.597701149425287, + "grad_norm": 1.337663173675537, + "learning_rate": 0.0002, + "loss": 0.4858, + "step": 4200 + }, + { + "epoch": 4.6086480569239185, + "grad_norm": 1.0475375652313232, + "learning_rate": 0.0002, + "loss": 0.5247, + "step": 4210 + }, + { + "epoch": 4.619594964422551, + "grad_norm": 1.0081088542938232, + "learning_rate": 0.0002, + "loss": 0.5298, + "step": 4220 + }, + { + "epoch": 4.630541871921182, + "grad_norm": 0.7527595162391663, + "learning_rate": 0.0002, + "loss": 0.5042, + "step": 4230 + }, + { + "epoch": 4.641488779419814, + "grad_norm": 1.55559241771698, + "learning_rate": 0.0002, + "loss": 0.5207, + "step": 4240 + }, + { + "epoch": 4.652435686918445, + "grad_norm": 0.7967379689216614, + "learning_rate": 0.0002, + "loss": 0.5468, + "step": 4250 + }, + { + "epoch": 4.663382594417077, + "grad_norm": 0.898368775844574, + "learning_rate": 0.0002, + "loss": 0.5328, + "step": 4260 + }, + { + "epoch": 4.674329501915709, + "grad_norm": 1.1940776109695435, + "learning_rate": 0.0002, + "loss": 0.4706, + "step": 4270 + }, + { + "epoch": 4.685276409414341, + "grad_norm": 1.1817092895507812, + "learning_rate": 0.0002, + "loss": 0.5121, + "step": 4280 + }, + { + "epoch": 4.696223316912972, + "grad_norm": 0.9041520357131958, + "learning_rate": 0.0002, + "loss": 0.5758, + "step": 4290 + }, + { + "epoch": 4.707170224411604, + "grad_norm": 1.1280102729797363, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 4300 + }, + { + "epoch": 4.718117131910235, + "grad_norm": 1.357689619064331, + "learning_rate": 0.0002, + "loss": 0.4891, + "step": 4310 + }, + { + "epoch": 4.7290640394088665, + "grad_norm": 1.056633472442627, + "learning_rate": 0.0002, + "loss": 0.4704, + "step": 4320 + }, + { + "epoch": 4.740010946907499, + "grad_norm": 1.6520427465438843, + "learning_rate": 0.0002, + "loss": 0.5488, + "step": 4330 + }, + { + "epoch": 4.75095785440613, + "grad_norm": 1.153200626373291, + "learning_rate": 0.0002, + "loss": 0.5131, + "step": 4340 + }, + { + "epoch": 4.761904761904762, + "grad_norm": 0.9346241354942322, + "learning_rate": 0.0002, + "loss": 0.539, + "step": 4350 + }, + { + "epoch": 4.772851669403393, + "grad_norm": 0.8628455996513367, + "learning_rate": 0.0002, + "loss": 0.4941, + "step": 4360 + }, + { + "epoch": 4.783798576902025, + "grad_norm": 1.3843916654586792, + "learning_rate": 0.0002, + "loss": 0.5167, + "step": 4370 + }, + { + "epoch": 4.794745484400657, + "grad_norm": 1.035574197769165, + "learning_rate": 0.0002, + "loss": 0.4683, + "step": 4380 + }, + { + "epoch": 4.805692391899289, + "grad_norm": 1.1868361234664917, + "learning_rate": 0.0002, + "loss": 0.5162, + "step": 4390 + }, + { + "epoch": 4.81663929939792, + "grad_norm": 1.1307647228240967, + "learning_rate": 0.0002, + "loss": 0.534, + "step": 4400 + }, + { + "epoch": 4.827586206896552, + "grad_norm": 0.9787724614143372, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 4410 + }, + { + "epoch": 4.838533114395183, + "grad_norm": 1.0473824739456177, + "learning_rate": 0.0002, + "loss": 0.5185, + "step": 4420 + }, + { + "epoch": 4.8494800218938146, + "grad_norm": 1.069069504737854, + "learning_rate": 0.0002, + "loss": 0.6285, + "step": 4430 + }, + { + "epoch": 4.860426929392447, + "grad_norm": 1.4305680990219116, + "learning_rate": 0.0002, + "loss": 0.5267, + "step": 4440 + }, + { + "epoch": 4.871373836891078, + "grad_norm": 1.3679203987121582, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 4450 + }, + { + "epoch": 4.88232074438971, + "grad_norm": 0.8997844457626343, + "learning_rate": 0.0002, + "loss": 0.5135, + "step": 4460 + }, + { + "epoch": 4.893267651888341, + "grad_norm": 1.2758110761642456, + "learning_rate": 0.0002, + "loss": 0.5312, + "step": 4470 + }, + { + "epoch": 4.904214559386973, + "grad_norm": 0.8819465637207031, + "learning_rate": 0.0002, + "loss": 0.4914, + "step": 4480 + }, + { + "epoch": 4.915161466885605, + "grad_norm": 1.08329439163208, + "learning_rate": 0.0002, + "loss": 0.5147, + "step": 4490 + }, + { + "epoch": 4.926108374384237, + "grad_norm": 1.083461046218872, + "learning_rate": 0.0002, + "loss": 0.5404, + "step": 4500 + }, + { + "epoch": 4.937055281882868, + "grad_norm": 1.2387723922729492, + "learning_rate": 0.0002, + "loss": 0.5433, + "step": 4510 + }, + { + "epoch": 4.9480021893815, + "grad_norm": 0.8262293934822083, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 4520 + }, + { + "epoch": 4.958949096880131, + "grad_norm": 1.2325191497802734, + "learning_rate": 0.0002, + "loss": 0.504, + "step": 4530 + }, + { + "epoch": 4.9698960043787626, + "grad_norm": 1.024614930152893, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 4540 + }, + { + "epoch": 4.980842911877395, + "grad_norm": 1.3007521629333496, + "learning_rate": 0.0002, + "loss": 0.4752, + "step": 4550 + }, + { + "epoch": 4.991789819376026, + "grad_norm": 0.9823828339576721, + "learning_rate": 0.0002, + "loss": 0.4943, + "step": 4560 + }, + { + "epoch": 4.999452654625069, + "eval_loss": 1.3920727968215942, + "eval_runtime": 46.0764, + "eval_samples_per_second": 9.463, + "eval_steps_per_second": 1.194, + "step": 4567 + }, + { + "epoch": 5.002736726874658, + "grad_norm": 1.1478906869888306, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 4570 + }, + { + "epoch": 5.013683634373289, + "grad_norm": 1.0533705949783325, + "learning_rate": 0.0002, + "loss": 0.372, + "step": 4580 + }, + { + "epoch": 5.024630541871921, + "grad_norm": 1.268900752067566, + "learning_rate": 0.0002, + "loss": 0.3313, + "step": 4590 + }, + { + "epoch": 5.035577449370553, + "grad_norm": 1.222652554512024, + "learning_rate": 0.0002, + "loss": 0.3482, + "step": 4600 + }, + { + "epoch": 5.046524356869185, + "grad_norm": 1.5093127489089966, + "learning_rate": 0.0002, + "loss": 0.3195, + "step": 4610 + }, + { + "epoch": 5.057471264367816, + "grad_norm": 1.2372499704360962, + "learning_rate": 0.0002, + "loss": 0.3569, + "step": 4620 + }, + { + "epoch": 5.068418171866448, + "grad_norm": 0.8422666192054749, + "learning_rate": 0.0002, + "loss": 0.3206, + "step": 4630 + }, + { + "epoch": 5.079365079365079, + "grad_norm": 1.1451770067214966, + "learning_rate": 0.0002, + "loss": 0.3115, + "step": 4640 + }, + { + "epoch": 5.090311986863711, + "grad_norm": 1.2074557542800903, + "learning_rate": 0.0002, + "loss": 0.3305, + "step": 4650 + }, + { + "epoch": 5.101258894362343, + "grad_norm": 1.429150104522705, + "learning_rate": 0.0002, + "loss": 0.3012, + "step": 4660 + }, + { + "epoch": 5.112205801860974, + "grad_norm": 1.0353610515594482, + "learning_rate": 0.0002, + "loss": 0.3229, + "step": 4670 + }, + { + "epoch": 5.123152709359606, + "grad_norm": 1.2845979928970337, + "learning_rate": 0.0002, + "loss": 0.402, + "step": 4680 + }, + { + "epoch": 5.134099616858237, + "grad_norm": 1.3790186643600464, + "learning_rate": 0.0002, + "loss": 0.383, + "step": 4690 + }, + { + "epoch": 5.145046524356869, + "grad_norm": 1.3182239532470703, + "learning_rate": 0.0002, + "loss": 0.2951, + "step": 4700 + }, + { + "epoch": 5.155993431855501, + "grad_norm": 1.5249626636505127, + "learning_rate": 0.0002, + "loss": 0.4074, + "step": 4710 + }, + { + "epoch": 5.166940339354133, + "grad_norm": 1.2492733001708984, + "learning_rate": 0.0002, + "loss": 0.3703, + "step": 4720 + }, + { + "epoch": 5.177887246852764, + "grad_norm": 1.4455480575561523, + "learning_rate": 0.0002, + "loss": 0.3411, + "step": 4730 + }, + { + "epoch": 5.188834154351396, + "grad_norm": 1.2191482782363892, + "learning_rate": 0.0002, + "loss": 0.3996, + "step": 4740 + }, + { + "epoch": 5.199781061850027, + "grad_norm": 1.4707951545715332, + "learning_rate": 0.0002, + "loss": 0.3785, + "step": 4750 + }, + { + "epoch": 5.210727969348659, + "grad_norm": 1.3473678827285767, + "learning_rate": 0.0002, + "loss": 0.3516, + "step": 4760 + }, + { + "epoch": 5.221674876847291, + "grad_norm": 1.0479670763015747, + "learning_rate": 0.0002, + "loss": 0.3266, + "step": 4770 + }, + { + "epoch": 5.232621784345922, + "grad_norm": 1.299096703529358, + "learning_rate": 0.0002, + "loss": 0.3976, + "step": 4780 + }, + { + "epoch": 5.243568691844554, + "grad_norm": 1.2820168733596802, + "learning_rate": 0.0002, + "loss": 0.3266, + "step": 4790 + }, + { + "epoch": 5.254515599343185, + "grad_norm": 1.3818004131317139, + "learning_rate": 0.0002, + "loss": 0.3347, + "step": 4800 + }, + { + "epoch": 5.265462506841817, + "grad_norm": 1.2898736000061035, + "learning_rate": 0.0002, + "loss": 0.3761, + "step": 4810 + }, + { + "epoch": 5.276409414340449, + "grad_norm": 1.1761468648910522, + "learning_rate": 0.0002, + "loss": 0.3694, + "step": 4820 + }, + { + "epoch": 5.287356321839081, + "grad_norm": 1.7155952453613281, + "learning_rate": 0.0002, + "loss": 0.3806, + "step": 4830 + }, + { + "epoch": 5.298303229337712, + "grad_norm": 0.9103642106056213, + "learning_rate": 0.0002, + "loss": 0.322, + "step": 4840 + }, + { + "epoch": 5.309250136836344, + "grad_norm": 1.013015627861023, + "learning_rate": 0.0002, + "loss": 0.3516, + "step": 4850 + }, + { + "epoch": 5.320197044334975, + "grad_norm": 1.390471339225769, + "learning_rate": 0.0002, + "loss": 0.4297, + "step": 4860 + }, + { + "epoch": 5.331143951833607, + "grad_norm": 1.129770278930664, + "learning_rate": 0.0002, + "loss": 0.4098, + "step": 4870 + }, + { + "epoch": 5.342090859332239, + "grad_norm": 1.1461067199707031, + "learning_rate": 0.0002, + "loss": 0.4227, + "step": 4880 + }, + { + "epoch": 5.35303776683087, + "grad_norm": 1.3587424755096436, + "learning_rate": 0.0002, + "loss": 0.288, + "step": 4890 + }, + { + "epoch": 5.363984674329502, + "grad_norm": 1.6897879838943481, + "learning_rate": 0.0002, + "loss": 0.3604, + "step": 4900 + }, + { + "epoch": 5.374931581828133, + "grad_norm": 0.9298055768013, + "learning_rate": 0.0002, + "loss": 0.3887, + "step": 4910 + }, + { + "epoch": 5.385878489326765, + "grad_norm": 1.0006917715072632, + "learning_rate": 0.0002, + "loss": 0.3371, + "step": 4920 + }, + { + "epoch": 5.396825396825397, + "grad_norm": 1.232581377029419, + "learning_rate": 0.0002, + "loss": 0.3992, + "step": 4930 + }, + { + "epoch": 5.407772304324029, + "grad_norm": 1.0822620391845703, + "learning_rate": 0.0002, + "loss": 0.3456, + "step": 4940 + }, + { + "epoch": 5.41871921182266, + "grad_norm": 1.3648720979690552, + "learning_rate": 0.0002, + "loss": 0.3806, + "step": 4950 + }, + { + "epoch": 5.429666119321292, + "grad_norm": 1.3220354318618774, + "learning_rate": 0.0002, + "loss": 0.3959, + "step": 4960 + }, + { + "epoch": 5.440613026819923, + "grad_norm": 1.1106271743774414, + "learning_rate": 0.0002, + "loss": 0.3278, + "step": 4970 + }, + { + "epoch": 5.451559934318555, + "grad_norm": 1.6058908700942993, + "learning_rate": 0.0002, + "loss": 0.3812, + "step": 4980 + }, + { + "epoch": 5.462506841817187, + "grad_norm": 1.1065930128097534, + "learning_rate": 0.0002, + "loss": 0.3905, + "step": 4990 + }, + { + "epoch": 5.473453749315818, + "grad_norm": 1.3896466493606567, + "learning_rate": 0.0002, + "loss": 0.4058, + "step": 5000 + }, + { + "epoch": 5.48440065681445, + "grad_norm": 1.0437148809432983, + "learning_rate": 0.0002, + "loss": 0.4122, + "step": 5010 + }, + { + "epoch": 5.495347564313081, + "grad_norm": 1.2347718477249146, + "learning_rate": 0.0002, + "loss": 0.4065, + "step": 5020 + }, + { + "epoch": 5.506294471811713, + "grad_norm": 1.1174284219741821, + "learning_rate": 0.0002, + "loss": 0.3586, + "step": 5030 + }, + { + "epoch": 5.517241379310345, + "grad_norm": 1.2580941915512085, + "learning_rate": 0.0002, + "loss": 0.3576, + "step": 5040 + }, + { + "epoch": 5.528188286808977, + "grad_norm": 1.451090931892395, + "learning_rate": 0.0002, + "loss": 0.3809, + "step": 5050 + }, + { + "epoch": 5.539135194307608, + "grad_norm": 1.4688365459442139, + "learning_rate": 0.0002, + "loss": 0.3645, + "step": 5060 + }, + { + "epoch": 5.55008210180624, + "grad_norm": 1.1625734567642212, + "learning_rate": 0.0002, + "loss": 0.4431, + "step": 5070 + }, + { + "epoch": 5.561029009304871, + "grad_norm": 0.9332265257835388, + "learning_rate": 0.0002, + "loss": 0.3972, + "step": 5080 + }, + { + "epoch": 5.571975916803503, + "grad_norm": 1.5635273456573486, + "learning_rate": 0.0002, + "loss": 0.4, + "step": 5090 + }, + { + "epoch": 5.582922824302135, + "grad_norm": 1.3420509099960327, + "learning_rate": 0.0002, + "loss": 0.3651, + "step": 5100 + }, + { + "epoch": 5.593869731800766, + "grad_norm": 1.5826557874679565, + "learning_rate": 0.0002, + "loss": 0.3717, + "step": 5110 + }, + { + "epoch": 5.604816639299398, + "grad_norm": 1.5737065076828003, + "learning_rate": 0.0002, + "loss": 0.4256, + "step": 5120 + }, + { + "epoch": 5.615763546798029, + "grad_norm": 1.3812499046325684, + "learning_rate": 0.0002, + "loss": 0.39, + "step": 5130 + }, + { + "epoch": 5.626710454296661, + "grad_norm": 1.362833023071289, + "learning_rate": 0.0002, + "loss": 0.3891, + "step": 5140 + }, + { + "epoch": 5.637657361795293, + "grad_norm": 1.7667874097824097, + "learning_rate": 0.0002, + "loss": 0.455, + "step": 5150 + }, + { + "epoch": 5.648604269293925, + "grad_norm": 1.2661789655685425, + "learning_rate": 0.0002, + "loss": 0.4264, + "step": 5160 + }, + { + "epoch": 5.659551176792556, + "grad_norm": 1.2076870203018188, + "learning_rate": 0.0002, + "loss": 0.3261, + "step": 5170 + }, + { + "epoch": 5.670498084291188, + "grad_norm": 1.2431524991989136, + "learning_rate": 0.0002, + "loss": 0.372, + "step": 5180 + }, + { + "epoch": 5.681444991789819, + "grad_norm": 1.2216639518737793, + "learning_rate": 0.0002, + "loss": 0.4092, + "step": 5190 + }, + { + "epoch": 5.692391899288451, + "grad_norm": 0.9259352684020996, + "learning_rate": 0.0002, + "loss": 0.4171, + "step": 5200 + }, + { + "epoch": 5.703338806787083, + "grad_norm": 1.7929338216781616, + "learning_rate": 0.0002, + "loss": 0.3875, + "step": 5210 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 1.4048460721969604, + "learning_rate": 0.0002, + "loss": 0.4424, + "step": 5220 + }, + { + "epoch": 5.725232621784346, + "grad_norm": 1.306874394416809, + "learning_rate": 0.0002, + "loss": 0.3758, + "step": 5230 + }, + { + "epoch": 5.736179529282977, + "grad_norm": 1.3137940168380737, + "learning_rate": 0.0002, + "loss": 0.3889, + "step": 5240 + }, + { + "epoch": 5.747126436781609, + "grad_norm": 1.1376476287841797, + "learning_rate": 0.0002, + "loss": 0.4804, + "step": 5250 + }, + { + "epoch": 5.758073344280241, + "grad_norm": 1.450939416885376, + "learning_rate": 0.0002, + "loss": 0.377, + "step": 5260 + }, + { + "epoch": 5.769020251778873, + "grad_norm": 0.983195960521698, + "learning_rate": 0.0002, + "loss": 0.4732, + "step": 5270 + }, + { + "epoch": 5.779967159277504, + "grad_norm": 1.66558837890625, + "learning_rate": 0.0002, + "loss": 0.4041, + "step": 5280 + }, + { + "epoch": 5.790914066776136, + "grad_norm": 0.9789204597473145, + "learning_rate": 0.0002, + "loss": 0.3643, + "step": 5290 + }, + { + "epoch": 5.801860974274767, + "grad_norm": 1.2110556364059448, + "learning_rate": 0.0002, + "loss": 0.3776, + "step": 5300 + }, + { + "epoch": 5.812807881773399, + "grad_norm": 1.3799304962158203, + "learning_rate": 0.0002, + "loss": 0.4049, + "step": 5310 + }, + { + "epoch": 5.823754789272031, + "grad_norm": 1.0570626258850098, + "learning_rate": 0.0002, + "loss": 0.4362, + "step": 5320 + }, + { + "epoch": 5.834701696770662, + "grad_norm": 1.4654436111450195, + "learning_rate": 0.0002, + "loss": 0.4716, + "step": 5330 + }, + { + "epoch": 5.845648604269294, + "grad_norm": 1.5216940641403198, + "learning_rate": 0.0002, + "loss": 0.4048, + "step": 5340 + }, + { + "epoch": 5.856595511767925, + "grad_norm": 1.018646001815796, + "learning_rate": 0.0002, + "loss": 0.3848, + "step": 5350 + }, + { + "epoch": 5.867542419266557, + "grad_norm": 1.028951644897461, + "learning_rate": 0.0002, + "loss": 0.3705, + "step": 5360 + }, + { + "epoch": 5.878489326765189, + "grad_norm": 2.571263313293457, + "learning_rate": 0.0002, + "loss": 0.4213, + "step": 5370 + }, + { + "epoch": 5.889436234263821, + "grad_norm": 1.3323984146118164, + "learning_rate": 0.0002, + "loss": 0.3647, + "step": 5380 + }, + { + "epoch": 5.900383141762452, + "grad_norm": 1.4317777156829834, + "learning_rate": 0.0002, + "loss": 0.4085, + "step": 5390 + }, + { + "epoch": 5.911330049261084, + "grad_norm": 1.4289140701293945, + "learning_rate": 0.0002, + "loss": 0.4254, + "step": 5400 + }, + { + "epoch": 5.922276956759715, + "grad_norm": 1.3130780458450317, + "learning_rate": 0.0002, + "loss": 0.3993, + "step": 5410 + }, + { + "epoch": 5.933223864258347, + "grad_norm": 1.3979902267456055, + "learning_rate": 0.0002, + "loss": 0.4025, + "step": 5420 + }, + { + "epoch": 5.944170771756979, + "grad_norm": 1.1827352046966553, + "learning_rate": 0.0002, + "loss": 0.3997, + "step": 5430 + }, + { + "epoch": 5.95511767925561, + "grad_norm": 1.1672080755233765, + "learning_rate": 0.0002, + "loss": 0.4163, + "step": 5440 + }, + { + "epoch": 5.966064586754242, + "grad_norm": 1.0949620008468628, + "learning_rate": 0.0002, + "loss": 0.4425, + "step": 5450 + }, + { + "epoch": 5.977011494252873, + "grad_norm": 1.3183925151824951, + "learning_rate": 0.0002, + "loss": 0.4219, + "step": 5460 + }, + { + "epoch": 5.987958401751505, + "grad_norm": 1.096198320388794, + "learning_rate": 0.0002, + "loss": 0.4171, + "step": 5470 + }, + { + "epoch": 5.998905309250137, + "grad_norm": 1.2601423263549805, + "learning_rate": 0.0002, + "loss": 0.3886, + "step": 5480 + }, + { + "epoch": 6.0, + "eval_loss": 1.611358880996704, + "eval_runtime": 46.0638, + "eval_samples_per_second": 9.465, + "eval_steps_per_second": 1.194, + "step": 5481 + }, + { + "epoch": 6.009852216748769, + "grad_norm": 0.9854364991188049, + "learning_rate": 0.0002, + "loss": 0.2616, + "step": 5490 + }, + { + "epoch": 6.0207991242474, + "grad_norm": 1.8073689937591553, + "learning_rate": 0.0002, + "loss": 0.2412, + "step": 5500 + }, + { + "epoch": 6.031746031746032, + "grad_norm": 1.1852164268493652, + "learning_rate": 0.0002, + "loss": 0.2317, + "step": 5510 + }, + { + "epoch": 6.042692939244663, + "grad_norm": 1.0937914848327637, + "learning_rate": 0.0002, + "loss": 0.224, + "step": 5520 + }, + { + "epoch": 6.053639846743295, + "grad_norm": 0.7411194443702698, + "learning_rate": 0.0002, + "loss": 0.2473, + "step": 5530 + }, + { + "epoch": 6.064586754241927, + "grad_norm": 1.552127480506897, + "learning_rate": 0.0002, + "loss": 0.2846, + "step": 5540 + }, + { + "epoch": 6.075533661740558, + "grad_norm": 1.0465604066848755, + "learning_rate": 0.0002, + "loss": 0.2639, + "step": 5550 + }, + { + "epoch": 6.08648056923919, + "grad_norm": 1.4008121490478516, + "learning_rate": 0.0002, + "loss": 0.2696, + "step": 5560 + }, + { + "epoch": 6.097427476737821, + "grad_norm": 1.7049046754837036, + "learning_rate": 0.0002, + "loss": 0.3049, + "step": 5570 + }, + { + "epoch": 6.108374384236453, + "grad_norm": 1.111151933670044, + "learning_rate": 0.0002, + "loss": 0.263, + "step": 5580 + }, + { + "epoch": 6.119321291735085, + "grad_norm": 1.4271087646484375, + "learning_rate": 0.0002, + "loss": 0.2816, + "step": 5590 + }, + { + "epoch": 6.130268199233717, + "grad_norm": 1.3917373418807983, + "learning_rate": 0.0002, + "loss": 0.2878, + "step": 5600 + }, + { + "epoch": 6.141215106732348, + "grad_norm": 1.013689637184143, + "learning_rate": 0.0002, + "loss": 0.2482, + "step": 5610 + }, + { + "epoch": 6.15216201423098, + "grad_norm": 1.342645525932312, + "learning_rate": 0.0002, + "loss": 0.2841, + "step": 5620 + }, + { + "epoch": 6.163108921729611, + "grad_norm": 1.4480562210083008, + "learning_rate": 0.0002, + "loss": 0.2335, + "step": 5630 + }, + { + "epoch": 6.174055829228243, + "grad_norm": 1.2483175992965698, + "learning_rate": 0.0002, + "loss": 0.2696, + "step": 5640 + }, + { + "epoch": 6.185002736726875, + "grad_norm": 1.2944550514221191, + "learning_rate": 0.0002, + "loss": 0.2656, + "step": 5650 + }, + { + "epoch": 6.195949644225506, + "grad_norm": 1.264142632484436, + "learning_rate": 0.0002, + "loss": 0.2704, + "step": 5660 + }, + { + "epoch": 6.206896551724138, + "grad_norm": 1.2068781852722168, + "learning_rate": 0.0002, + "loss": 0.2971, + "step": 5670 + }, + { + "epoch": 6.217843459222769, + "grad_norm": 1.0401629209518433, + "learning_rate": 0.0002, + "loss": 0.2882, + "step": 5680 + }, + { + "epoch": 6.228790366721401, + "grad_norm": 1.2054402828216553, + "learning_rate": 0.0002, + "loss": 0.3022, + "step": 5690 + }, + { + "epoch": 6.239737274220033, + "grad_norm": 1.1278687715530396, + "learning_rate": 0.0002, + "loss": 0.2949, + "step": 5700 + }, + { + "epoch": 6.250684181718665, + "grad_norm": 1.24592125415802, + "learning_rate": 0.0002, + "loss": 0.2477, + "step": 5710 + }, + { + "epoch": 6.261631089217296, + "grad_norm": 1.2686697244644165, + "learning_rate": 0.0002, + "loss": 0.246, + "step": 5720 + }, + { + "epoch": 6.272577996715928, + "grad_norm": 1.1836518049240112, + "learning_rate": 0.0002, + "loss": 0.2974, + "step": 5730 + }, + { + "epoch": 6.283524904214559, + "grad_norm": 1.387752890586853, + "learning_rate": 0.0002, + "loss": 0.2963, + "step": 5740 + }, + { + "epoch": 6.294471811713191, + "grad_norm": 1.9390363693237305, + "learning_rate": 0.0002, + "loss": 0.2961, + "step": 5750 + }, + { + "epoch": 6.305418719211823, + "grad_norm": 1.2919824123382568, + "learning_rate": 0.0002, + "loss": 0.2765, + "step": 5760 + }, + { + "epoch": 6.316365626710454, + "grad_norm": 1.2793965339660645, + "learning_rate": 0.0002, + "loss": 0.2898, + "step": 5770 + }, + { + "epoch": 6.327312534209086, + "grad_norm": 1.5486980676651, + "learning_rate": 0.0002, + "loss": 0.2786, + "step": 5780 + }, + { + "epoch": 6.338259441707717, + "grad_norm": 1.2757408618927002, + "learning_rate": 0.0002, + "loss": 0.2684, + "step": 5790 + }, + { + "epoch": 6.349206349206349, + "grad_norm": 1.3245713710784912, + "learning_rate": 0.0002, + "loss": 0.2841, + "step": 5800 + }, + { + "epoch": 6.360153256704981, + "grad_norm": 1.6262527704238892, + "learning_rate": 0.0002, + "loss": 0.3096, + "step": 5810 + }, + { + "epoch": 6.371100164203613, + "grad_norm": 1.465224027633667, + "learning_rate": 0.0002, + "loss": 0.3219, + "step": 5820 + }, + { + "epoch": 6.382047071702244, + "grad_norm": 1.437408447265625, + "learning_rate": 0.0002, + "loss": 0.2703, + "step": 5830 + }, + { + "epoch": 6.392993979200876, + "grad_norm": 1.3094626665115356, + "learning_rate": 0.0002, + "loss": 0.3012, + "step": 5840 + }, + { + "epoch": 6.403940886699507, + "grad_norm": 1.6717544794082642, + "learning_rate": 0.0002, + "loss": 0.2991, + "step": 5850 + }, + { + "epoch": 6.414887794198139, + "grad_norm": 1.1023344993591309, + "learning_rate": 0.0002, + "loss": 0.2892, + "step": 5860 + }, + { + "epoch": 6.425834701696771, + "grad_norm": 1.2397106885910034, + "learning_rate": 0.0002, + "loss": 0.3078, + "step": 5870 + }, + { + "epoch": 6.436781609195402, + "grad_norm": 1.6139185428619385, + "learning_rate": 0.0002, + "loss": 0.2984, + "step": 5880 + }, + { + "epoch": 6.447728516694034, + "grad_norm": 1.3164576292037964, + "learning_rate": 0.0002, + "loss": 0.2353, + "step": 5890 + }, + { + "epoch": 6.458675424192665, + "grad_norm": 1.3317217826843262, + "learning_rate": 0.0002, + "loss": 0.2772, + "step": 5900 + }, + { + "epoch": 6.469622331691297, + "grad_norm": 1.215008020401001, + "learning_rate": 0.0002, + "loss": 0.2555, + "step": 5910 + }, + { + "epoch": 6.480569239189929, + "grad_norm": 1.625672698020935, + "learning_rate": 0.0002, + "loss": 0.2715, + "step": 5920 + }, + { + "epoch": 6.491516146688561, + "grad_norm": 1.1262489557266235, + "learning_rate": 0.0002, + "loss": 0.2938, + "step": 5930 + }, + { + "epoch": 6.502463054187192, + "grad_norm": 1.447100281715393, + "learning_rate": 0.0002, + "loss": 0.2921, + "step": 5940 + }, + { + "epoch": 6.513409961685824, + "grad_norm": 1.3306448459625244, + "learning_rate": 0.0002, + "loss": 0.3059, + "step": 5950 + }, + { + "epoch": 6.524356869184455, + "grad_norm": 1.307732105255127, + "learning_rate": 0.0002, + "loss": 0.2922, + "step": 5960 + }, + { + "epoch": 6.535303776683087, + "grad_norm": 1.1851097345352173, + "learning_rate": 0.0002, + "loss": 0.2891, + "step": 5970 + }, + { + "epoch": 6.546250684181719, + "grad_norm": 1.462816596031189, + "learning_rate": 0.0002, + "loss": 0.2859, + "step": 5980 + }, + { + "epoch": 6.55719759168035, + "grad_norm": 1.2324728965759277, + "learning_rate": 0.0002, + "loss": 0.2698, + "step": 5990 + }, + { + "epoch": 6.568144499178982, + "grad_norm": 1.3627429008483887, + "learning_rate": 0.0002, + "loss": 0.2672, + "step": 6000 + }, + { + "epoch": 6.579091406677613, + "grad_norm": 1.94977867603302, + "learning_rate": 0.0002, + "loss": 0.3182, + "step": 6010 + }, + { + "epoch": 6.590038314176245, + "grad_norm": 1.459844946861267, + "learning_rate": 0.0002, + "loss": 0.3183, + "step": 6020 + }, + { + "epoch": 6.600985221674877, + "grad_norm": 1.4454325437545776, + "learning_rate": 0.0002, + "loss": 0.3142, + "step": 6030 + }, + { + "epoch": 6.611932129173509, + "grad_norm": 1.4245165586471558, + "learning_rate": 0.0002, + "loss": 0.269, + "step": 6040 + }, + { + "epoch": 6.62287903667214, + "grad_norm": 1.195803165435791, + "learning_rate": 0.0002, + "loss": 0.3041, + "step": 6050 + }, + { + "epoch": 6.633825944170772, + "grad_norm": 1.3589898347854614, + "learning_rate": 0.0002, + "loss": 0.3075, + "step": 6060 + }, + { + "epoch": 6.644772851669403, + "grad_norm": 1.3488036394119263, + "learning_rate": 0.0002, + "loss": 0.3291, + "step": 6070 + }, + { + "epoch": 6.655719759168035, + "grad_norm": 1.0954102277755737, + "learning_rate": 0.0002, + "loss": 0.2898, + "step": 6080 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.4431062936782837, + "learning_rate": 0.0002, + "loss": 0.3489, + "step": 6090 + }, + { + "epoch": 6.677613574165298, + "grad_norm": 1.4387465715408325, + "learning_rate": 0.0002, + "loss": 0.2816, + "step": 6100 + }, + { + "epoch": 6.68856048166393, + "grad_norm": 1.8398990631103516, + "learning_rate": 0.0002, + "loss": 0.2705, + "step": 6110 + }, + { + "epoch": 6.699507389162561, + "grad_norm": 1.3523273468017578, + "learning_rate": 0.0002, + "loss": 0.3214, + "step": 6120 + }, + { + "epoch": 6.710454296661193, + "grad_norm": 1.6326191425323486, + "learning_rate": 0.0002, + "loss": 0.287, + "step": 6130 + }, + { + "epoch": 6.721401204159825, + "grad_norm": 1.3677960634231567, + "learning_rate": 0.0002, + "loss": 0.2857, + "step": 6140 + }, + { + "epoch": 6.732348111658457, + "grad_norm": 1.1993201971054077, + "learning_rate": 0.0002, + "loss": 0.3264, + "step": 6150 + }, + { + "epoch": 6.743295019157088, + "grad_norm": 1.1864078044891357, + "learning_rate": 0.0002, + "loss": 0.3071, + "step": 6160 + }, + { + "epoch": 6.75424192665572, + "grad_norm": 1.1625522375106812, + "learning_rate": 0.0002, + "loss": 0.3087, + "step": 6170 + }, + { + "epoch": 6.765188834154351, + "grad_norm": 1.5803234577178955, + "learning_rate": 0.0002, + "loss": 0.3551, + "step": 6180 + }, + { + "epoch": 6.776135741652983, + "grad_norm": 1.151746153831482, + "learning_rate": 0.0002, + "loss": 0.3059, + "step": 6190 + }, + { + "epoch": 6.787082649151615, + "grad_norm": 1.0727161169052124, + "learning_rate": 0.0002, + "loss": 0.2697, + "step": 6200 + }, + { + "epoch": 6.798029556650246, + "grad_norm": 1.4148162603378296, + "learning_rate": 0.0002, + "loss": 0.2844, + "step": 6210 + }, + { + "epoch": 6.808976464148878, + "grad_norm": 1.2071447372436523, + "learning_rate": 0.0002, + "loss": 0.3417, + "step": 6220 + }, + { + "epoch": 6.819923371647509, + "grad_norm": 1.3843804597854614, + "learning_rate": 0.0002, + "loss": 0.3066, + "step": 6230 + }, + { + "epoch": 6.830870279146141, + "grad_norm": 1.2490662336349487, + "learning_rate": 0.0002, + "loss": 0.2769, + "step": 6240 + }, + { + "epoch": 6.841817186644773, + "grad_norm": 1.6029689311981201, + "learning_rate": 0.0002, + "loss": 0.3237, + "step": 6250 + }, + { + "epoch": 6.852764094143405, + "grad_norm": 1.0388455390930176, + "learning_rate": 0.0002, + "loss": 0.3152, + "step": 6260 + }, + { + "epoch": 6.863711001642036, + "grad_norm": 1.3883857727050781, + "learning_rate": 0.0002, + "loss": 0.3026, + "step": 6270 + }, + { + "epoch": 6.874657909140668, + "grad_norm": 1.0500187873840332, + "learning_rate": 0.0002, + "loss": 0.3175, + "step": 6280 + }, + { + "epoch": 6.885604816639299, + "grad_norm": 1.4243487119674683, + "learning_rate": 0.0002, + "loss": 0.2952, + "step": 6290 + }, + { + "epoch": 6.896551724137931, + "grad_norm": 1.3169665336608887, + "learning_rate": 0.0002, + "loss": 0.2679, + "step": 6300 + }, + { + "epoch": 6.907498631636563, + "grad_norm": 1.5261493921279907, + "learning_rate": 0.0002, + "loss": 0.3291, + "step": 6310 + }, + { + "epoch": 6.9184455391351944, + "grad_norm": 1.578403115272522, + "learning_rate": 0.0002, + "loss": 0.3344, + "step": 6320 + }, + { + "epoch": 6.929392446633826, + "grad_norm": 1.4093263149261475, + "learning_rate": 0.0002, + "loss": 0.3263, + "step": 6330 + }, + { + "epoch": 6.940339354132457, + "grad_norm": 1.4003552198410034, + "learning_rate": 0.0002, + "loss": 0.3396, + "step": 6340 + }, + { + "epoch": 6.951286261631089, + "grad_norm": 1.650190830230713, + "learning_rate": 0.0002, + "loss": 0.3476, + "step": 6350 + }, + { + "epoch": 6.962233169129721, + "grad_norm": 1.2314515113830566, + "learning_rate": 0.0002, + "loss": 0.3442, + "step": 6360 + }, + { + "epoch": 6.973180076628353, + "grad_norm": 1.270980954170227, + "learning_rate": 0.0002, + "loss": 0.3341, + "step": 6370 + }, + { + "epoch": 6.984126984126984, + "grad_norm": 1.6352545022964478, + "learning_rate": 0.0002, + "loss": 0.3425, + "step": 6380 + }, + { + "epoch": 6.995073891625616, + "grad_norm": 1.3744925260543823, + "learning_rate": 0.0002, + "loss": 0.3647, + "step": 6390 + }, + { + "epoch": 6.999452654625069, + "eval_loss": 1.756764531135559, + "eval_runtime": 46.0542, + "eval_samples_per_second": 9.467, + "eval_steps_per_second": 1.194, + "step": 6394 + }, + { + "epoch": 7.006020799124247, + "grad_norm": 0.856991171836853, + "learning_rate": 0.0002, + "loss": 0.2356, + "step": 6400 + }, + { + "epoch": 7.016967706622879, + "grad_norm": 0.9483422040939331, + "learning_rate": 0.0002, + "loss": 0.2138, + "step": 6410 + }, + { + "epoch": 7.027914614121511, + "grad_norm": 1.0703433752059937, + "learning_rate": 0.0002, + "loss": 0.1884, + "step": 6420 + }, + { + "epoch": 7.0388615216201424, + "grad_norm": 1.761413812637329, + "learning_rate": 0.0002, + "loss": 0.2088, + "step": 6430 + }, + { + "epoch": 7.049808429118774, + "grad_norm": 0.9484238028526306, + "learning_rate": 0.0002, + "loss": 0.2156, + "step": 6440 + }, + { + "epoch": 7.060755336617405, + "grad_norm": 1.5663186311721802, + "learning_rate": 0.0002, + "loss": 0.2323, + "step": 6450 + }, + { + "epoch": 7.071702244116037, + "grad_norm": 0.7692174315452576, + "learning_rate": 0.0002, + "loss": 0.2089, + "step": 6460 + }, + { + "epoch": 7.082649151614669, + "grad_norm": 1.3554800748825073, + "learning_rate": 0.0002, + "loss": 0.1856, + "step": 6470 + }, + { + "epoch": 7.093596059113301, + "grad_norm": 0.9705919027328491, + "learning_rate": 0.0002, + "loss": 0.2057, + "step": 6480 + }, + { + "epoch": 7.104542966611932, + "grad_norm": 1.355778694152832, + "learning_rate": 0.0002, + "loss": 0.2068, + "step": 6490 + }, + { + "epoch": 7.115489874110564, + "grad_norm": 1.5389477014541626, + "learning_rate": 0.0002, + "loss": 0.2021, + "step": 6500 + }, + { + "epoch": 7.126436781609195, + "grad_norm": 0.9565434455871582, + "learning_rate": 0.0002, + "loss": 0.1963, + "step": 6510 + }, + { + "epoch": 7.137383689107827, + "grad_norm": 1.101539134979248, + "learning_rate": 0.0002, + "loss": 0.2323, + "step": 6520 + }, + { + "epoch": 7.148330596606459, + "grad_norm": 0.925153374671936, + "learning_rate": 0.0002, + "loss": 0.2024, + "step": 6530 + }, + { + "epoch": 7.1592775041050905, + "grad_norm": 1.1609078645706177, + "learning_rate": 0.0002, + "loss": 0.1955, + "step": 6540 + }, + { + "epoch": 7.170224411603722, + "grad_norm": 0.8908484578132629, + "learning_rate": 0.0002, + "loss": 0.2022, + "step": 6550 + }, + { + "epoch": 7.181171319102353, + "grad_norm": 0.9066158533096313, + "learning_rate": 0.0002, + "loss": 0.2378, + "step": 6560 + }, + { + "epoch": 7.192118226600985, + "grad_norm": 1.3601553440093994, + "learning_rate": 0.0002, + "loss": 0.1955, + "step": 6570 + }, + { + "epoch": 7.203065134099617, + "grad_norm": 1.0034444332122803, + "learning_rate": 0.0002, + "loss": 0.241, + "step": 6580 + }, + { + "epoch": 7.214012041598249, + "grad_norm": 1.608299970626831, + "learning_rate": 0.0002, + "loss": 0.2134, + "step": 6590 + }, + { + "epoch": 7.22495894909688, + "grad_norm": 1.2889668941497803, + "learning_rate": 0.0002, + "loss": 0.2089, + "step": 6600 + }, + { + "epoch": 7.235905856595512, + "grad_norm": 0.9896159768104553, + "learning_rate": 0.0002, + "loss": 0.2405, + "step": 6610 + }, + { + "epoch": 7.246852764094143, + "grad_norm": 1.408511996269226, + "learning_rate": 0.0002, + "loss": 0.2091, + "step": 6620 + }, + { + "epoch": 7.257799671592775, + "grad_norm": 1.0823664665222168, + "learning_rate": 0.0002, + "loss": 0.1958, + "step": 6630 + }, + { + "epoch": 7.268746579091407, + "grad_norm": 1.027026891708374, + "learning_rate": 0.0002, + "loss": 0.2117, + "step": 6640 + }, + { + "epoch": 7.2796934865900385, + "grad_norm": 1.0922648906707764, + "learning_rate": 0.0002, + "loss": 0.2319, + "step": 6650 + }, + { + "epoch": 7.29064039408867, + "grad_norm": 1.3361082077026367, + "learning_rate": 0.0002, + "loss": 0.2367, + "step": 6660 + }, + { + "epoch": 7.301587301587301, + "grad_norm": 1.9565683603286743, + "learning_rate": 0.0002, + "loss": 0.2299, + "step": 6670 + }, + { + "epoch": 7.312534209085933, + "grad_norm": 1.413672685623169, + "learning_rate": 0.0002, + "loss": 0.2248, + "step": 6680 + }, + { + "epoch": 7.323481116584565, + "grad_norm": 1.121842384338379, + "learning_rate": 0.0002, + "loss": 0.2306, + "step": 6690 + }, + { + "epoch": 7.334428024083197, + "grad_norm": 1.0622057914733887, + "learning_rate": 0.0002, + "loss": 0.2222, + "step": 6700 + }, + { + "epoch": 7.345374931581828, + "grad_norm": 1.280921459197998, + "learning_rate": 0.0002, + "loss": 0.2387, + "step": 6710 + }, + { + "epoch": 7.35632183908046, + "grad_norm": 1.5295953750610352, + "learning_rate": 0.0002, + "loss": 0.2428, + "step": 6720 + }, + { + "epoch": 7.367268746579091, + "grad_norm": 1.4289230108261108, + "learning_rate": 0.0002, + "loss": 0.2149, + "step": 6730 + }, + { + "epoch": 7.378215654077723, + "grad_norm": 1.535111665725708, + "learning_rate": 0.0002, + "loss": 0.2172, + "step": 6740 + }, + { + "epoch": 7.389162561576355, + "grad_norm": 1.777826189994812, + "learning_rate": 0.0002, + "loss": 0.2262, + "step": 6750 + }, + { + "epoch": 7.4001094690749865, + "grad_norm": 1.5058139562606812, + "learning_rate": 0.0002, + "loss": 0.2246, + "step": 6760 + }, + { + "epoch": 7.411056376573618, + "grad_norm": 0.9381663203239441, + "learning_rate": 0.0002, + "loss": 0.2323, + "step": 6770 + }, + { + "epoch": 7.422003284072249, + "grad_norm": 1.4739434719085693, + "learning_rate": 0.0002, + "loss": 0.2078, + "step": 6780 + }, + { + "epoch": 7.432950191570881, + "grad_norm": 1.8703559637069702, + "learning_rate": 0.0002, + "loss": 0.2493, + "step": 6790 + }, + { + "epoch": 7.443897099069513, + "grad_norm": 1.2242027521133423, + "learning_rate": 0.0002, + "loss": 0.2554, + "step": 6800 + }, + { + "epoch": 7.454844006568145, + "grad_norm": 1.3950374126434326, + "learning_rate": 0.0002, + "loss": 0.2258, + "step": 6810 + }, + { + "epoch": 7.465790914066776, + "grad_norm": 1.461701512336731, + "learning_rate": 0.0002, + "loss": 0.2365, + "step": 6820 + }, + { + "epoch": 7.476737821565408, + "grad_norm": 1.4460340738296509, + "learning_rate": 0.0002, + "loss": 0.2302, + "step": 6830 + }, + { + "epoch": 7.487684729064039, + "grad_norm": 1.0341510772705078, + "learning_rate": 0.0002, + "loss": 0.2294, + "step": 6840 + }, + { + "epoch": 7.498631636562671, + "grad_norm": 0.8885145783424377, + "learning_rate": 0.0002, + "loss": 0.2338, + "step": 6850 + }, + { + "epoch": 7.509578544061303, + "grad_norm": 2.4326062202453613, + "learning_rate": 0.0002, + "loss": 0.2424, + "step": 6860 + }, + { + "epoch": 7.5205254515599345, + "grad_norm": 1.1390372514724731, + "learning_rate": 0.0002, + "loss": 0.2352, + "step": 6870 + }, + { + "epoch": 7.531472359058566, + "grad_norm": 1.2346464395523071, + "learning_rate": 0.0002, + "loss": 0.2184, + "step": 6880 + }, + { + "epoch": 7.542419266557197, + "grad_norm": 1.6705836057662964, + "learning_rate": 0.0002, + "loss": 0.2389, + "step": 6890 + }, + { + "epoch": 7.553366174055829, + "grad_norm": 0.8130379319190979, + "learning_rate": 0.0002, + "loss": 0.2346, + "step": 6900 + }, + { + "epoch": 7.564313081554461, + "grad_norm": 1.2974088191986084, + "learning_rate": 0.0002, + "loss": 0.2165, + "step": 6910 + }, + { + "epoch": 7.575259989053093, + "grad_norm": 1.3465348482131958, + "learning_rate": 0.0002, + "loss": 0.2328, + "step": 6920 + }, + { + "epoch": 7.586206896551724, + "grad_norm": 1.245126724243164, + "learning_rate": 0.0002, + "loss": 0.2789, + "step": 6930 + }, + { + "epoch": 7.597153804050356, + "grad_norm": 1.3736917972564697, + "learning_rate": 0.0002, + "loss": 0.2446, + "step": 6940 + }, + { + "epoch": 7.608100711548987, + "grad_norm": 1.340989351272583, + "learning_rate": 0.0002, + "loss": 0.2483, + "step": 6950 + }, + { + "epoch": 7.619047619047619, + "grad_norm": 1.1082850694656372, + "learning_rate": 0.0002, + "loss": 0.3014, + "step": 6960 + }, + { + "epoch": 7.629994526546251, + "grad_norm": 1.3829188346862793, + "learning_rate": 0.0002, + "loss": 0.2319, + "step": 6970 + }, + { + "epoch": 7.6409414340448825, + "grad_norm": 1.5384989976882935, + "learning_rate": 0.0002, + "loss": 0.2299, + "step": 6980 + }, + { + "epoch": 7.651888341543514, + "grad_norm": 1.1061540842056274, + "learning_rate": 0.0002, + "loss": 0.2457, + "step": 6990 + }, + { + "epoch": 7.662835249042145, + "grad_norm": 1.2673815488815308, + "learning_rate": 0.0002, + "loss": 0.2216, + "step": 7000 + }, + { + "epoch": 7.673782156540777, + "grad_norm": 1.2290737628936768, + "learning_rate": 0.0002, + "loss": 0.2397, + "step": 7010 + }, + { + "epoch": 7.684729064039409, + "grad_norm": 1.4055291414260864, + "learning_rate": 0.0002, + "loss": 0.2112, + "step": 7020 + }, + { + "epoch": 7.695675971538041, + "grad_norm": 1.7786750793457031, + "learning_rate": 0.0002, + "loss": 0.2548, + "step": 7030 + }, + { + "epoch": 7.706622879036672, + "grad_norm": 1.454209566116333, + "learning_rate": 0.0002, + "loss": 0.2241, + "step": 7040 + }, + { + "epoch": 7.717569786535304, + "grad_norm": 1.3995633125305176, + "learning_rate": 0.0002, + "loss": 0.2461, + "step": 7050 + }, + { + "epoch": 7.728516694033935, + "grad_norm": 1.7514715194702148, + "learning_rate": 0.0002, + "loss": 0.2785, + "step": 7060 + }, + { + "epoch": 7.739463601532567, + "grad_norm": 1.5538004636764526, + "learning_rate": 0.0002, + "loss": 0.2335, + "step": 7070 + }, + { + "epoch": 7.750410509031199, + "grad_norm": 1.122506856918335, + "learning_rate": 0.0002, + "loss": 0.2245, + "step": 7080 + }, + { + "epoch": 7.7613574165298305, + "grad_norm": 1.2445831298828125, + "learning_rate": 0.0002, + "loss": 0.2473, + "step": 7090 + }, + { + "epoch": 7.772304324028462, + "grad_norm": 1.1478949785232544, + "learning_rate": 0.0002, + "loss": 0.2421, + "step": 7100 + }, + { + "epoch": 7.783251231527093, + "grad_norm": 1.4352518320083618, + "learning_rate": 0.0002, + "loss": 0.2346, + "step": 7110 + }, + { + "epoch": 7.794198139025725, + "grad_norm": 1.511096715927124, + "learning_rate": 0.0002, + "loss": 0.2351, + "step": 7120 + }, + { + "epoch": 7.805145046524357, + "grad_norm": 1.2296271324157715, + "learning_rate": 0.0002, + "loss": 0.2586, + "step": 7130 + }, + { + "epoch": 7.816091954022989, + "grad_norm": 1.7886443138122559, + "learning_rate": 0.0002, + "loss": 0.2453, + "step": 7140 + }, + { + "epoch": 7.82703886152162, + "grad_norm": 1.8886322975158691, + "learning_rate": 0.0002, + "loss": 0.2645, + "step": 7150 + }, + { + "epoch": 7.837985769020252, + "grad_norm": 1.3493725061416626, + "learning_rate": 0.0002, + "loss": 0.2658, + "step": 7160 + }, + { + "epoch": 7.848932676518883, + "grad_norm": 1.379209041595459, + "learning_rate": 0.0002, + "loss": 0.2787, + "step": 7170 + }, + { + "epoch": 7.859879584017515, + "grad_norm": 0.9374330043792725, + "learning_rate": 0.0002, + "loss": 0.248, + "step": 7180 + }, + { + "epoch": 7.870826491516147, + "grad_norm": 1.0391291379928589, + "learning_rate": 0.0002, + "loss": 0.2412, + "step": 7190 + }, + { + "epoch": 7.8817733990147785, + "grad_norm": 1.2710281610488892, + "learning_rate": 0.0002, + "loss": 0.2648, + "step": 7200 + }, + { + "epoch": 7.89272030651341, + "grad_norm": 1.6858662366867065, + "learning_rate": 0.0002, + "loss": 0.2511, + "step": 7210 + }, + { + "epoch": 7.903667214012041, + "grad_norm": 1.0925853252410889, + "learning_rate": 0.0002, + "loss": 0.2667, + "step": 7220 + }, + { + "epoch": 7.914614121510673, + "grad_norm": 1.7404073476791382, + "learning_rate": 0.0002, + "loss": 0.2756, + "step": 7230 + }, + { + "epoch": 7.925561029009305, + "grad_norm": 1.284067153930664, + "learning_rate": 0.0002, + "loss": 0.2486, + "step": 7240 + }, + { + "epoch": 7.936507936507937, + "grad_norm": 1.3801543712615967, + "learning_rate": 0.0002, + "loss": 0.2394, + "step": 7250 + }, + { + "epoch": 7.947454844006568, + "grad_norm": 1.4068974256515503, + "learning_rate": 0.0002, + "loss": 0.2761, + "step": 7260 + }, + { + "epoch": 7.9584017515052, + "grad_norm": 1.770037055015564, + "learning_rate": 0.0002, + "loss": 0.2455, + "step": 7270 + }, + { + "epoch": 7.969348659003831, + "grad_norm": 1.473775029182434, + "learning_rate": 0.0002, + "loss": 0.2863, + "step": 7280 + }, + { + "epoch": 7.980295566502463, + "grad_norm": 1.4878343343734741, + "learning_rate": 0.0002, + "loss": 0.2722, + "step": 7290 + }, + { + "epoch": 7.991242474001095, + "grad_norm": 1.2178987264633179, + "learning_rate": 0.0002, + "loss": 0.2921, + "step": 7300 + }, + { + "epoch": 7.995621237000547, + "eval_loss": 1.9267498254776, + "eval_runtime": 46.049, + "eval_samples_per_second": 9.468, + "eval_steps_per_second": 1.194, + "step": 7304 + } + ], + "logging_steps": 10, + "max_steps": 7304, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.7516079806808064e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..895406585c61ed7f5a6760976fc06a40a6fd8b3b --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-7304/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a158bca77bc6d58bfffd2acd2e9e641d481e1857fcbdd68fee875b74797abbc4 +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..34b84c2d4cfdf217d0df26e8708c3bd34577ba32 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c63de276026ae040ebafb983d9df61b6c14a219ecb096cfd19bb9cbbdf5202b +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/optimizer.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ebbf61940b0cd58679a1f0a656e7f180625f247b --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50d27f47005f9bfcb7cc81d95f6db11766464cf433ebe0ec3b9bfd4cd2fa81de +size 72886650 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/rng_state.pth b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2e77fe7c65ab745e9d0c59f7bb780c34a5e85c99 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7262fed64284ee400ff89a52ad0dc0c5a3cc8afbab1a554324105e268b7058a8 +size 14244 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/scheduler.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c1aded4707a46db5a73a88fa0f1af5b5ac2eaa8a --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d68854151aa7f3535bf90a34c0de0f805fa3a448a25d2ac0eae91da60149361 +size 1064 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/trainer_state.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..410338bf7cb217e54fd7fba275e5771720427762 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/trainer_state.json @@ -0,0 +1,678 @@ +{ + "best_metric": 1.158464789390564, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913", + "epoch": 0.9994526546250684, + "eval_steps": 10, + "global_step": 913, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.010946907498631636, + "grad_norm": 0.7611560821533203, + "learning_rate": 0.0002, + "loss": 2.4039, + "step": 10 + }, + { + "epoch": 0.021893814997263273, + "grad_norm": 0.4633193612098694, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 20 + }, + { + "epoch": 0.03284072249589491, + "grad_norm": 0.49326154589653015, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 30 + }, + { + "epoch": 0.043787629994526546, + "grad_norm": 0.3943138122558594, + "learning_rate": 0.0002, + "loss": 1.3859, + "step": 40 + }, + { + "epoch": 0.05473453749315818, + "grad_norm": 0.43292930722236633, + "learning_rate": 0.0002, + "loss": 1.3036, + "step": 50 + }, + { + "epoch": 0.06568144499178982, + "grad_norm": 0.3431817591190338, + "learning_rate": 0.0002, + "loss": 1.2427, + "step": 60 + }, + { + "epoch": 0.07662835249042145, + "grad_norm": 0.38011446595191956, + "learning_rate": 0.0002, + "loss": 1.3076, + "step": 70 + }, + { + "epoch": 0.08757525998905309, + "grad_norm": 0.7441071271896362, + "learning_rate": 0.0002, + "loss": 1.1641, + "step": 80 + }, + { + "epoch": 0.09852216748768473, + "grad_norm": 0.3359833061695099, + "learning_rate": 0.0002, + "loss": 1.1721, + "step": 90 + }, + { + "epoch": 0.10946907498631636, + "grad_norm": 0.3724392354488373, + "learning_rate": 0.0002, + "loss": 1.2774, + "step": 100 + }, + { + "epoch": 0.120415982484948, + "grad_norm": 0.40673762559890747, + "learning_rate": 0.0002, + "loss": 1.216, + "step": 110 + }, + { + "epoch": 0.13136288998357964, + "grad_norm": 0.40036800503730774, + "learning_rate": 0.0002, + "loss": 1.2611, + "step": 120 + }, + { + "epoch": 0.1423097974822113, + "grad_norm": 2.844191312789917, + "learning_rate": 0.0002, + "loss": 1.2436, + "step": 130 + }, + { + "epoch": 0.1532567049808429, + "grad_norm": 0.3104734420776367, + "learning_rate": 0.0002, + "loss": 1.2254, + "step": 140 + }, + { + "epoch": 0.16420361247947454, + "grad_norm": 0.3266797959804535, + "learning_rate": 0.0002, + "loss": 1.0498, + "step": 150 + }, + { + "epoch": 0.17515051997810618, + "grad_norm": 0.3079199194908142, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 160 + }, + { + "epoch": 0.18609742747673783, + "grad_norm": 0.3872479498386383, + "learning_rate": 0.0002, + "loss": 1.2923, + "step": 170 + }, + { + "epoch": 0.19704433497536947, + "grad_norm": 0.38654500246047974, + "learning_rate": 0.0002, + "loss": 1.1809, + "step": 180 + }, + { + "epoch": 0.20799124247400108, + "grad_norm": 0.2913552522659302, + "learning_rate": 0.0002, + "loss": 1.066, + "step": 190 + }, + { + "epoch": 0.21893814997263272, + "grad_norm": 0.2960572838783264, + "learning_rate": 0.0002, + "loss": 1.0868, + "step": 200 + }, + { + "epoch": 0.22988505747126436, + "grad_norm": 0.5175501108169556, + "learning_rate": 0.0002, + "loss": 1.136, + "step": 210 + }, + { + "epoch": 0.240831964969896, + "grad_norm": 1.2921574115753174, + "learning_rate": 0.0002, + "loss": 1.1556, + "step": 220 + }, + { + "epoch": 0.25177887246852765, + "grad_norm": 0.2675512135028839, + "learning_rate": 0.0002, + "loss": 1.1488, + "step": 230 + }, + { + "epoch": 0.2627257799671593, + "grad_norm": 0.3956190049648285, + "learning_rate": 0.0002, + "loss": 1.2764, + "step": 240 + }, + { + "epoch": 0.27367268746579093, + "grad_norm": 0.6022581458091736, + "learning_rate": 0.0002, + "loss": 1.1889, + "step": 250 + }, + { + "epoch": 0.2846195949644226, + "grad_norm": 1.1949563026428223, + "learning_rate": 0.0002, + "loss": 1.1981, + "step": 260 + }, + { + "epoch": 0.2955665024630542, + "grad_norm": 0.31173548102378845, + "learning_rate": 0.0002, + "loss": 1.1877, + "step": 270 + }, + { + "epoch": 0.3065134099616858, + "grad_norm": 0.2808472812175751, + "learning_rate": 0.0002, + "loss": 1.06, + "step": 280 + }, + { + "epoch": 0.31746031746031744, + "grad_norm": 0.3042023777961731, + "learning_rate": 0.0002, + "loss": 1.1752, + "step": 290 + }, + { + "epoch": 0.3284072249589491, + "grad_norm": 0.39915043115615845, + "learning_rate": 0.0002, + "loss": 1.3147, + "step": 300 + }, + { + "epoch": 0.3393541324575807, + "grad_norm": 0.39118197560310364, + "learning_rate": 0.0002, + "loss": 1.2425, + "step": 310 + }, + { + "epoch": 0.35030103995621237, + "grad_norm": 0.355010986328125, + "learning_rate": 0.0002, + "loss": 1.1363, + "step": 320 + }, + { + "epoch": 0.361247947454844, + "grad_norm": 0.29734086990356445, + "learning_rate": 0.0002, + "loss": 1.1925, + "step": 330 + }, + { + "epoch": 0.37219485495347565, + "grad_norm": 0.346096009016037, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 340 + }, + { + "epoch": 0.3831417624521073, + "grad_norm": 0.4829643666744232, + "learning_rate": 0.0002, + "loss": 1.1641, + "step": 350 + }, + { + "epoch": 0.39408866995073893, + "grad_norm": 0.4726872742176056, + "learning_rate": 0.0002, + "loss": 1.2808, + "step": 360 + }, + { + "epoch": 0.4050355774493706, + "grad_norm": 0.3130153715610504, + "learning_rate": 0.0002, + "loss": 1.1532, + "step": 370 + }, + { + "epoch": 0.41598248494800216, + "grad_norm": 0.5123590230941772, + "learning_rate": 0.0002, + "loss": 1.1842, + "step": 380 + }, + { + "epoch": 0.4269293924466338, + "grad_norm": 0.3444574773311615, + "learning_rate": 0.0002, + "loss": 1.1539, + "step": 390 + }, + { + "epoch": 0.43787629994526545, + "grad_norm": 0.5302175283432007, + "learning_rate": 0.0002, + "loss": 1.1756, + "step": 400 + }, + { + "epoch": 0.4488232074438971, + "grad_norm": 0.2713572680950165, + "learning_rate": 0.0002, + "loss": 1.1138, + "step": 410 + }, + { + "epoch": 0.45977011494252873, + "grad_norm": 0.29530611634254456, + "learning_rate": 0.0002, + "loss": 1.1281, + "step": 420 + }, + { + "epoch": 0.47071702244116037, + "grad_norm": 0.27282455563545227, + "learning_rate": 0.0002, + "loss": 1.1721, + "step": 430 + }, + { + "epoch": 0.481663929939792, + "grad_norm": 0.2647949755191803, + "learning_rate": 0.0002, + "loss": 1.1213, + "step": 440 + }, + { + "epoch": 0.49261083743842365, + "grad_norm": 0.35509347915649414, + "learning_rate": 0.0002, + "loss": 1.1656, + "step": 450 + }, + { + "epoch": 0.5035577449370553, + "grad_norm": 0.1959609091281891, + "learning_rate": 0.0002, + "loss": 1.1251, + "step": 460 + }, + { + "epoch": 0.5145046524356869, + "grad_norm": 0.40090155601501465, + "learning_rate": 0.0002, + "loss": 1.1889, + "step": 470 + }, + { + "epoch": 0.5254515599343186, + "grad_norm": 0.3354604244232178, + "learning_rate": 0.0002, + "loss": 1.156, + "step": 480 + }, + { + "epoch": 0.5363984674329502, + "grad_norm": 0.2758506238460541, + "learning_rate": 0.0002, + "loss": 1.1024, + "step": 490 + }, + { + "epoch": 0.5473453749315819, + "grad_norm": 0.3629051744937897, + "learning_rate": 0.0002, + "loss": 1.1108, + "step": 500 + }, + { + "epoch": 0.5582922824302134, + "grad_norm": 0.30802229046821594, + "learning_rate": 0.0002, + "loss": 1.2236, + "step": 510 + }, + { + "epoch": 0.5692391899288451, + "grad_norm": 0.3099463880062103, + "learning_rate": 0.0002, + "loss": 1.0424, + "step": 520 + }, + { + "epoch": 0.5801860974274767, + "grad_norm": 0.42299067974090576, + "learning_rate": 0.0002, + "loss": 1.255, + "step": 530 + }, + { + "epoch": 0.5911330049261084, + "grad_norm": 0.5392252802848816, + "learning_rate": 0.0002, + "loss": 1.1698, + "step": 540 + }, + { + "epoch": 0.60207991242474, + "grad_norm": 0.34768250584602356, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 550 + }, + { + "epoch": 0.6130268199233716, + "grad_norm": 0.28490015864372253, + "learning_rate": 0.0002, + "loss": 1.102, + "step": 560 + }, + { + "epoch": 0.6239737274220033, + "grad_norm": 0.34787994623184204, + "learning_rate": 0.0002, + "loss": 1.1111, + "step": 570 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 0.29058772325515747, + "learning_rate": 0.0002, + "loss": 1.0759, + "step": 580 + }, + { + "epoch": 0.6458675424192666, + "grad_norm": 0.4063778817653656, + "learning_rate": 0.0002, + "loss": 1.1157, + "step": 590 + }, + { + "epoch": 0.6568144499178982, + "grad_norm": 0.9244267344474792, + "learning_rate": 0.0002, + "loss": 1.1432, + "step": 600 + }, + { + "epoch": 0.6677613574165299, + "grad_norm": 0.27605190873146057, + "learning_rate": 0.0002, + "loss": 1.0591, + "step": 610 + }, + { + "epoch": 0.6787082649151615, + "grad_norm": 0.34346821904182434, + "learning_rate": 0.0002, + "loss": 1.2123, + "step": 620 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 0.3093279302120209, + "learning_rate": 0.0002, + "loss": 1.2195, + "step": 630 + }, + { + "epoch": 0.7006020799124247, + "grad_norm": 1.0069009065628052, + "learning_rate": 0.0002, + "loss": 1.2461, + "step": 640 + }, + { + "epoch": 0.7115489874110563, + "grad_norm": 0.5049130916595459, + "learning_rate": 0.0002, + "loss": 1.0892, + "step": 650 + }, + { + "epoch": 0.722495894909688, + "grad_norm": 0.3748924732208252, + "learning_rate": 0.0002, + "loss": 1.1404, + "step": 660 + }, + { + "epoch": 0.7334428024083196, + "grad_norm": 0.2964959144592285, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 670 + }, + { + "epoch": 0.7443897099069513, + "grad_norm": 0.4599986970424652, + "learning_rate": 0.0002, + "loss": 1.2617, + "step": 680 + }, + { + "epoch": 0.7553366174055829, + "grad_norm": 0.27292951941490173, + "learning_rate": 0.0002, + "loss": 1.088, + "step": 690 + }, + { + "epoch": 0.7662835249042146, + "grad_norm": 0.3123566806316376, + "learning_rate": 0.0002, + "loss": 1.2047, + "step": 700 + }, + { + "epoch": 0.7772304324028462, + "grad_norm": 0.28310710191726685, + "learning_rate": 0.0002, + "loss": 1.0021, + "step": 710 + }, + { + "epoch": 0.7881773399014779, + "grad_norm": 0.3279992341995239, + "learning_rate": 0.0002, + "loss": 1.2281, + "step": 720 + }, + { + "epoch": 0.7991242474001095, + "grad_norm": 0.28179168701171875, + "learning_rate": 0.0002, + "loss": 1.25, + "step": 730 + }, + { + "epoch": 0.8100711548987412, + "grad_norm": 0.31492987275123596, + "learning_rate": 0.0002, + "loss": 1.0602, + "step": 740 + }, + { + "epoch": 0.8210180623973727, + "grad_norm": 0.41821011900901794, + "learning_rate": 0.0002, + "loss": 1.2518, + "step": 750 + }, + { + "epoch": 0.8319649698960043, + "grad_norm": 0.325235515832901, + "learning_rate": 0.0002, + "loss": 1.1612, + "step": 760 + }, + { + "epoch": 0.842911877394636, + "grad_norm": 0.5366070866584778, + "learning_rate": 0.0002, + "loss": 1.27, + "step": 770 + }, + { + "epoch": 0.8538587848932676, + "grad_norm": 0.32570579648017883, + "learning_rate": 0.0002, + "loss": 1.0921, + "step": 780 + }, + { + "epoch": 0.8648056923918993, + "grad_norm": 0.3642968237400055, + "learning_rate": 0.0002, + "loss": 1.1032, + "step": 790 + }, + { + "epoch": 0.8757525998905309, + "grad_norm": 0.29713448882102966, + "learning_rate": 0.0002, + "loss": 1.1234, + "step": 800 + }, + { + "epoch": 0.8866995073891626, + "grad_norm": 0.23599444329738617, + "learning_rate": 0.0002, + "loss": 1.0978, + "step": 810 + }, + { + "epoch": 0.8976464148877942, + "grad_norm": 0.31522464752197266, + "learning_rate": 0.0002, + "loss": 1.1867, + "step": 820 + }, + { + "epoch": 0.9085933223864259, + "grad_norm": 0.32754790782928467, + "learning_rate": 0.0002, + "loss": 1.0208, + "step": 830 + }, + { + "epoch": 0.9195402298850575, + "grad_norm": 0.22741089761257172, + "learning_rate": 0.0002, + "loss": 0.9786, + "step": 840 + }, + { + "epoch": 0.9304871373836892, + "grad_norm": 0.3089679777622223, + "learning_rate": 0.0002, + "loss": 1.0689, + "step": 850 + }, + { + "epoch": 0.9414340448823207, + "grad_norm": 0.27440521121025085, + "learning_rate": 0.0002, + "loss": 1.0354, + "step": 860 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.3498363792896271, + "learning_rate": 0.0002, + "loss": 1.0417, + "step": 870 + }, + { + "epoch": 0.963327859879584, + "grad_norm": 0.47151854634284973, + "learning_rate": 0.0002, + "loss": 1.269, + "step": 880 + }, + { + "epoch": 0.9742747673782156, + "grad_norm": 0.24756591022014618, + "learning_rate": 0.0002, + "loss": 1.1174, + "step": 890 + }, + { + "epoch": 0.9852216748768473, + "grad_norm": 0.2600938677787781, + "learning_rate": 0.0002, + "loss": 1.0622, + "step": 900 + }, + { + "epoch": 0.9961685823754789, + "grad_norm": 0.2934586703777313, + "learning_rate": 0.0002, + "loss": 1.0968, + "step": 910 + }, + { + "epoch": 0.9994526546250684, + "eval_loss": 1.158464789390564, + "eval_runtime": 46.0774, + "eval_samples_per_second": 9.462, + "eval_steps_per_second": 1.194, + "step": 913 + } + ], + "logging_steps": 10, + "max_steps": 7304, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.692078163132416e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..895406585c61ed7f5a6760976fc06a40a6fd8b3b --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a158bca77bc6d58bfffd2acd2e9e641d481e1857fcbdd68fee875b74797abbc4 +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..895406585c61ed7f5a6760976fc06a40a6fd8b3b --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a158bca77bc6d58bfffd2acd2e9e641d481e1857fcbdd68fee875b74797abbc4 +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/training_log.jsonl b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7214f869187d01d42b242622ce668714cbe28764 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/training_log.jsonl @@ -0,0 +1,13 @@ +{"epoch": 0.9994526546250684, "step": 913, "epoch_duration": 3908.758598089218, "total_accumulated_duration": 3908.758598089218, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4021, "grad_norm": 0.8313556909561157, "learning_rate": 0.0002, "epoch": 0.010946907498631636, "step": 10}, {"loss": 1.8802, "grad_norm": 0.47623705863952637, "learning_rate": 0.0002, "epoch": 0.021893814997263273, "step": 20}, {"loss": 1.5893, "grad_norm": 0.5663179755210876, "learning_rate": 0.0002, "epoch": 0.03284072249589491, "step": 30}, {"loss": 1.3836, "grad_norm": 0.4509344696998596, "learning_rate": 0.0002, "epoch": 0.043787629994526546, "step": 40}, {"loss": 1.3075, "grad_norm": 0.4606502056121826, "learning_rate": 0.0002, "epoch": 0.05473453749315818, "step": 50}, {"loss": 1.2417, "grad_norm": 0.3602466881275177, "learning_rate": 0.0002, "epoch": 0.06568144499178982, "step": 60}, {"loss": 1.3164, "grad_norm": 0.4144728481769562, "learning_rate": 0.0002, "epoch": 0.07662835249042145, "step": 70}, {"loss": 1.1654, "grad_norm": 0.655055046081543, "learning_rate": 0.0002, "epoch": 0.08757525998905309, "step": 80}, {"loss": 1.1713, "grad_norm": 0.3738102614879608, "learning_rate": 0.0002, "epoch": 0.09852216748768473, "step": 90}, {"loss": 1.2749, "grad_norm": 0.37755176424980164, "learning_rate": 0.0002, "epoch": 0.10946907498631636, "step": 100}, {"loss": 1.2094, "grad_norm": 0.4289834797382355, "learning_rate": 0.0002, "epoch": 0.120415982484948, "step": 110}, {"loss": 1.2596, "grad_norm": 0.41391080617904663, "learning_rate": 0.0002, "epoch": 0.13136288998357964, "step": 120}, {"loss": 1.2378, "grad_norm": 0.3661900758743286, "learning_rate": 0.0002, "epoch": 0.1423097974822113, "step": 130}, {"loss": 1.22, "grad_norm": 0.3107413053512573, "learning_rate": 0.0002, "epoch": 0.1532567049808429, "step": 140}, {"loss": 1.0469, "grad_norm": 0.35515156388282776, "learning_rate": 0.0002, "epoch": 0.16420361247947454, "step": 150}, {"loss": 1.1644, "grad_norm": 0.3406173288822174, "learning_rate": 0.0002, "epoch": 0.17515051997810618, "step": 160}, {"loss": 1.2864, "grad_norm": 0.3639129102230072, "learning_rate": 0.0002, "epoch": 0.18609742747673783, "step": 170}, {"loss": 1.1742, "grad_norm": 0.36974409222602844, "learning_rate": 0.0002, "epoch": 0.19704433497536947, "step": 180}, {"loss": 1.0633, "grad_norm": 0.29038330912590027, "learning_rate": 0.0002, "epoch": 0.20799124247400108, "step": 190}, {"loss": 1.0815, "grad_norm": 0.39991503953933716, "learning_rate": 0.0002, "epoch": 0.21893814997263272, "step": 200}, {"loss": 1.1327, "grad_norm": 0.4746928811073303, "learning_rate": 0.0002, "epoch": 0.22988505747126436, "step": 210}, {"loss": 1.1591, "grad_norm": 0.3260287344455719, "learning_rate": 0.0002, "epoch": 0.240831964969896, "step": 220}, {"loss": 1.1432, "grad_norm": 0.2829096019268036, "learning_rate": 0.0002, "epoch": 0.25177887246852765, "step": 230}, {"loss": 1.2742, "grad_norm": 0.3664100170135498, "learning_rate": 0.0002, "epoch": 0.2627257799671593, "step": 240}, {"loss": 1.1872, "grad_norm": 0.3916007876396179, "learning_rate": 0.0002, "epoch": 0.27367268746579093, "step": 250}, {"loss": 1.1883, "grad_norm": 0.2758415937423706, "learning_rate": 0.0002, "epoch": 0.2846195949644226, "step": 260}, {"loss": 1.1854, "grad_norm": 0.3045436143875122, "learning_rate": 0.0002, "epoch": 0.2955665024630542, "step": 270}, {"loss": 1.0588, "grad_norm": 0.28762319684028625, "learning_rate": 0.0002, "epoch": 0.3065134099616858, "step": 280}, {"loss": 1.172, "grad_norm": 0.30335670709609985, "learning_rate": 0.0002, "epoch": 0.31746031746031744, "step": 290}, {"loss": 1.3107, "grad_norm": 1.0805344581604004, "learning_rate": 0.0002, "epoch": 0.3284072249589491, "step": 300}, {"loss": 1.2439, "grad_norm": 0.932421088218689, "learning_rate": 0.0002, "epoch": 0.3393541324575807, "step": 310}, {"loss": 1.1394, "grad_norm": 0.3658805191516876, "learning_rate": 0.0002, "epoch": 0.35030103995621237, "step": 320}, {"loss": 1.1959, "grad_norm": 0.27649226784706116, "learning_rate": 0.0002, "epoch": 0.361247947454844, "step": 330}, {"loss": 1.1926, "grad_norm": 0.34725308418273926, "learning_rate": 0.0002, "epoch": 0.37219485495347565, "step": 340}, {"loss": 1.1601, "grad_norm": 0.427664190530777, "learning_rate": 0.0002, "epoch": 0.3831417624521073, "step": 350}, {"loss": 1.2862, "grad_norm": 0.7244759202003479, "learning_rate": 0.0002, "epoch": 0.39408866995073893, "step": 360}, {"loss": 1.1521, "grad_norm": 0.30189067125320435, "learning_rate": 0.0002, "epoch": 0.4050355774493706, "step": 370}, {"loss": 1.1841, "grad_norm": 0.3003649413585663, "learning_rate": 0.0002, "epoch": 0.41598248494800216, "step": 380}, {"loss": 1.1567, "grad_norm": 0.33777716755867004, "learning_rate": 0.0002, "epoch": 0.4269293924466338, "step": 390}, {"loss": 1.1752, "grad_norm": 0.49972546100616455, "learning_rate": 0.0002, "epoch": 0.43787629994526545, "step": 400}, {"loss": 1.1113, "grad_norm": 0.2706405520439148, "learning_rate": 0.0002, "epoch": 0.4488232074438971, "step": 410}, {"loss": 1.1263, "grad_norm": 0.29881909489631653, "learning_rate": 0.0002, "epoch": 0.45977011494252873, "step": 420}, {"loss": 1.1701, "grad_norm": 0.27096378803253174, "learning_rate": 0.0002, "epoch": 0.47071702244116037, "step": 430}, {"loss": 1.1208, "grad_norm": 0.27717822790145874, "learning_rate": 0.0002, "epoch": 0.481663929939792, "step": 440}, {"loss": 1.1653, "grad_norm": 0.37846508622169495, "learning_rate": 0.0002, "epoch": 0.49261083743842365, "step": 450}, {"loss": 1.1256, "grad_norm": 0.19916996359825134, "learning_rate": 0.0002, "epoch": 0.5035577449370553, "step": 460}, {"loss": 1.1836, "grad_norm": 0.385631799697876, "learning_rate": 0.0002, "epoch": 0.5145046524356869, "step": 470}, {"loss": 1.1549, "grad_norm": 0.3161792457103729, "learning_rate": 0.0002, "epoch": 0.5254515599343186, "step": 480}, {"loss": 1.1032, "grad_norm": 0.3254718780517578, "learning_rate": 0.0002, "epoch": 0.5363984674329502, "step": 490}, {"loss": 1.1116, "grad_norm": 0.42534834146499634, "learning_rate": 0.0002, "epoch": 0.5473453749315819, "step": 500}, {"loss": 1.2216, "grad_norm": 0.29915255308151245, "learning_rate": 0.0002, "epoch": 0.5582922824302134, "step": 510}, {"loss": 1.0444, "grad_norm": 0.31264087557792664, "learning_rate": 0.0002, "epoch": 0.5692391899288451, "step": 520}, {"loss": 1.2547, "grad_norm": 0.43361231684684753, "learning_rate": 0.0002, "epoch": 0.5801860974274767, "step": 530}, {"loss": 1.1655, "grad_norm": 0.3988197147846222, "learning_rate": 0.0002, "epoch": 0.5911330049261084, "step": 540}, {"loss": 1.1138, "grad_norm": 0.35821226239204407, "learning_rate": 0.0002, "epoch": 0.60207991242474, "step": 550}, {"loss": 1.1018, "grad_norm": 0.2939063310623169, "learning_rate": 0.0002, "epoch": 0.6130268199233716, "step": 560}, {"loss": 1.1155, "grad_norm": 0.33339911699295044, "learning_rate": 0.0002, "epoch": 0.6239737274220033, "step": 570}, {"loss": 1.0756, "grad_norm": 0.271671324968338, "learning_rate": 0.0002, "epoch": 0.6349206349206349, "step": 580}, {"loss": 1.117, "grad_norm": 0.3313842713832855, "learning_rate": 0.0002, "epoch": 0.6458675424192666, "step": 590}, {"loss": 1.1381, "grad_norm": 1.0742870569229126, "learning_rate": 0.0002, "epoch": 0.6568144499178982, "step": 600}, {"loss": 1.0613, "grad_norm": 0.27820339798927307, "learning_rate": 0.0002, "epoch": 0.6677613574165299, "step": 610}, {"loss": 1.2129, "grad_norm": 0.3206658959388733, "learning_rate": 0.0002, "epoch": 0.6787082649151615, "step": 620}, {"loss": 1.2172, "grad_norm": 0.32589420676231384, "learning_rate": 0.0002, "epoch": 0.6896551724137931, "step": 630}, {"loss": 1.2451, "grad_norm": 0.28985416889190674, "learning_rate": 0.0002, "epoch": 0.7006020799124247, "step": 640}, {"loss": 1.0927, "grad_norm": 0.348069965839386, "learning_rate": 0.0002, "epoch": 0.7115489874110563, "step": 650}, {"loss": 1.1437, "grad_norm": 0.3746453523635864, "learning_rate": 0.0002, "epoch": 0.722495894909688, "step": 660}, {"loss": 1.1051, "grad_norm": 0.30947357416152954, "learning_rate": 0.0002, "epoch": 0.7334428024083196, "step": 670}, {"loss": 1.2616, "grad_norm": 0.43517717719078064, "learning_rate": 0.0002, "epoch": 0.7443897099069513, "step": 680}, {"loss": 1.0856, "grad_norm": 0.2904577851295471, "learning_rate": 0.0002, "epoch": 0.7553366174055829, "step": 690}, {"loss": 1.2039, "grad_norm": 0.31168660521507263, "learning_rate": 0.0002, "epoch": 0.7662835249042146, "step": 700}, {"loss": 1.0014, "grad_norm": 0.29549655318260193, "learning_rate": 0.0002, "epoch": 0.7772304324028462, "step": 710}, {"loss": 1.2264, "grad_norm": 0.3187733292579651, "learning_rate": 0.0002, "epoch": 0.7881773399014779, "step": 720}, {"loss": 1.246, "grad_norm": 0.26287132501602173, "learning_rate": 0.0002, "epoch": 0.7991242474001095, "step": 730}, {"loss": 1.0605, "grad_norm": 0.32368725538253784, "learning_rate": 0.0002, "epoch": 0.8100711548987412, "step": 740}, {"loss": 1.2512, "grad_norm": 0.37818416953086853, "learning_rate": 0.0002, "epoch": 0.8210180623973727, "step": 750}, {"loss": 1.1609, "grad_norm": 0.3156316578388214, "learning_rate": 0.0002, "epoch": 0.8319649698960043, "step": 760}, {"loss": 1.2619, "grad_norm": 0.4735032916069031, "learning_rate": 0.0002, "epoch": 0.842911877394636, "step": 770}, {"loss": 1.0928, "grad_norm": 0.35200339555740356, "learning_rate": 0.0002, "epoch": 0.8538587848932676, "step": 780}, {"loss": 1.1014, "grad_norm": 0.2805202007293701, "learning_rate": 0.0002, "epoch": 0.8648056923918993, "step": 790}, {"loss": 1.1171, "grad_norm": 0.30746331810951233, "learning_rate": 0.0002, "epoch": 0.8757525998905309, "step": 800}, {"loss": 1.0937, "grad_norm": 0.23246702551841736, "learning_rate": 0.0002, "epoch": 0.8866995073891626, "step": 810}, {"loss": 1.1871, "grad_norm": 0.31561368703842163, "learning_rate": 0.0002, "epoch": 0.8976464148877942, "step": 820}, {"loss": 1.0224, "grad_norm": 0.4917743504047394, "learning_rate": 0.0002, "epoch": 0.9085933223864259, "step": 830}, {"loss": 0.9732, "grad_norm": 0.2250479906797409, "learning_rate": 0.0002, "epoch": 0.9195402298850575, "step": 840}, {"loss": 1.0664, "grad_norm": 0.3228110671043396, "learning_rate": 0.0002, "epoch": 0.9304871373836892, "step": 850}, {"loss": 1.0333, "grad_norm": 0.27888670563697815, "learning_rate": 0.0002, "epoch": 0.9414340448823207, "step": 860}, {"loss": 1.0386, "grad_norm": 0.32544803619384766, "learning_rate": 0.0002, "epoch": 0.9523809523809523, "step": 870}, {"loss": 1.2681, "grad_norm": 0.4826430380344391, "learning_rate": 0.0002, "epoch": 0.963327859879584, "step": 880}, {"loss": 1.1169, "grad_norm": 0.25451406836509705, "learning_rate": 0.0002, "epoch": 0.9742747673782156, "step": 890}, {"loss": 1.0649, "grad_norm": 0.3584475517272949, "learning_rate": 0.0002, "epoch": 0.9852216748768473, "step": 900}, {"loss": 1.1037, "grad_norm": 0.31104281544685364, "learning_rate": 0.0002, "epoch": 0.9961685823754789, "step": 910}]} +{"epoch": 0.9994526546250684, "step": 913, "epoch_duration": 4858.700972080231, "total_accumulated_duration": 4858.700972080231, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4047, "grad_norm": 0.789733350276947, "learning_rate": 0.0002, "epoch": 0.010946907498631636, "step": 10}, {"loss": 1.8779, "grad_norm": 0.47955045104026794, "learning_rate": 0.0002, "epoch": 0.021893814997263273, "step": 20}, {"loss": 1.5739, "grad_norm": 0.588397741317749, "learning_rate": 0.0002, "epoch": 0.03284072249589491, "step": 30}, {"loss": 1.385, "grad_norm": 0.4773741662502289, "learning_rate": 0.0002, "epoch": 0.043787629994526546, "step": 40}, {"loss": 1.3023, "grad_norm": 0.4253212511539459, "learning_rate": 0.0002, "epoch": 0.05473453749315818, "step": 50}, {"loss": 1.24, "grad_norm": 0.3454934060573578, "learning_rate": 0.0002, "epoch": 0.06568144499178982, "step": 60}, {"loss": 1.3092, "grad_norm": 0.40852540731430054, "learning_rate": 0.0002, "epoch": 0.07662835249042145, "step": 70}, {"loss": 1.1656, "grad_norm": 0.6172734498977661, "learning_rate": 0.0002, "epoch": 0.08757525998905309, "step": 80}, {"loss": 1.1724, "grad_norm": 0.3402147889137268, "learning_rate": 0.0002, "epoch": 0.09852216748768473, "step": 90}, {"loss": 1.2754, "grad_norm": 0.3680196702480316, "learning_rate": 0.0002, "epoch": 0.10946907498631636, "step": 100}, {"loss": 1.2169, "grad_norm": 0.39625784754753113, "learning_rate": 0.0002, "epoch": 0.120415982484948, "step": 110}, {"loss": 1.2626, "grad_norm": 0.3857010006904602, "learning_rate": 0.0002, "epoch": 0.13136288998357964, "step": 120}, {"loss": 1.2429, "grad_norm": 0.35471343994140625, "learning_rate": 0.0002, "epoch": 0.1423097974822113, "step": 130}, {"loss": 1.2243, "grad_norm": 0.3129616677761078, "learning_rate": 0.0002, "epoch": 0.1532567049808429, "step": 140}, {"loss": 1.0507, "grad_norm": 0.3068332076072693, "learning_rate": 0.0002, "epoch": 0.16420361247947454, "step": 150}, {"loss": 1.1645, "grad_norm": 0.31592103838920593, "learning_rate": 0.0002, "epoch": 0.17515051997810618, "step": 160}, {"loss": 1.2925, "grad_norm": 0.4139435887336731, "learning_rate": 0.0002, "epoch": 0.18609742747673783, "step": 170}, {"loss": 1.178, "grad_norm": 0.4458570182323456, "learning_rate": 0.0002, "epoch": 0.19704433497536947, "step": 180}, {"loss": 1.0656, "grad_norm": 0.29550260305404663, "learning_rate": 0.0002, "epoch": 0.20799124247400108, "step": 190}, {"loss": 1.0839, "grad_norm": 0.2915268540382385, "learning_rate": 0.0002, "epoch": 0.21893814997263272, "step": 200}, {"loss": 1.1373, "grad_norm": 0.46415194869041443, "learning_rate": 0.0002, "epoch": 0.22988505747126436, "step": 210}, {"loss": 1.1627, "grad_norm": 0.3066752851009369, "learning_rate": 0.0002, "epoch": 0.240831964969896, "step": 220}, {"loss": 1.146, "grad_norm": 0.27449464797973633, "learning_rate": 0.0002, "epoch": 0.25177887246852765, "step": 230}, {"loss": 1.2801, "grad_norm": 0.35353365540504456, "learning_rate": 0.0002, "epoch": 0.2627257799671593, "step": 240}, {"loss": 1.1906, "grad_norm": 0.504181444644928, "learning_rate": 0.0002, "epoch": 0.27367268746579093, "step": 250}, {"loss": 1.1918, "grad_norm": 0.27208924293518066, "learning_rate": 0.0002, "epoch": 0.2846195949644226, "step": 260}, {"loss": 1.1866, "grad_norm": 0.294197678565979, "learning_rate": 0.0002, "epoch": 0.2955665024630542, "step": 270}, {"loss": 1.0601, "grad_norm": 0.28713667392730713, "learning_rate": 0.0002, "epoch": 0.3065134099616858, "step": 280}, {"loss": 1.1749, "grad_norm": 0.3039015829563141, "learning_rate": 0.0002, "epoch": 0.31746031746031744, "step": 290}, {"loss": 1.319, "grad_norm": 0.4140840470790863, "learning_rate": 0.0002, "epoch": 0.3284072249589491, "step": 300}, {"loss": 1.2412, "grad_norm": 0.4447316825389862, "learning_rate": 0.0002, "epoch": 0.3393541324575807, "step": 310}, {"loss": 1.1376, "grad_norm": 0.3732347786426544, "learning_rate": 0.0002, "epoch": 0.35030103995621237, "step": 320}, {"loss": 1.1932, "grad_norm": 0.3674798607826233, "learning_rate": 0.0002, "epoch": 0.361247947454844, "step": 330}, {"loss": 1.193, "grad_norm": 0.39562341570854187, "learning_rate": 0.0002, "epoch": 0.37219485495347565, "step": 340}, {"loss": 1.1608, "grad_norm": 0.3756610155105591, "learning_rate": 0.0002, "epoch": 0.3831417624521073, "step": 350}, {"loss": 1.28, "grad_norm": 0.39729052782058716, "learning_rate": 0.0002, "epoch": 0.39408866995073893, "step": 360}, {"loss": 1.15, "grad_norm": 0.30219244956970215, "learning_rate": 0.0002, "epoch": 0.4050355774493706, "step": 370}, {"loss": 1.1785, "grad_norm": 0.29158663749694824, "learning_rate": 0.0002, "epoch": 0.41598248494800216, "step": 380}, {"loss": 1.1554, "grad_norm": 0.33580854535102844, "learning_rate": 0.0002, "epoch": 0.4269293924466338, "step": 390}, {"loss": 1.1751, "grad_norm": 0.6002793312072754, "learning_rate": 0.0002, "epoch": 0.43787629994526545, "step": 400}, {"loss": 1.1116, "grad_norm": 0.27264222502708435, "learning_rate": 0.0002, "epoch": 0.4488232074438971, "step": 410}, {"loss": 1.127, "grad_norm": 0.29089173674583435, "learning_rate": 0.0002, "epoch": 0.45977011494252873, "step": 420}, {"loss": 1.1732, "grad_norm": 0.26952189207077026, "learning_rate": 0.0002, "epoch": 0.47071702244116037, "step": 430}, {"loss": 1.1211, "grad_norm": 0.27022993564605713, "learning_rate": 0.0002, "epoch": 0.481663929939792, "step": 440}, {"loss": 1.1639, "grad_norm": 0.3734654486179352, "learning_rate": 0.0002, "epoch": 0.49261083743842365, "step": 450}, {"loss": 1.1243, "grad_norm": 0.20133642852306366, "learning_rate": 0.0002, "epoch": 0.5035577449370553, "step": 460}, {"loss": 1.1842, "grad_norm": 0.39844706654548645, "learning_rate": 0.0002, "epoch": 0.5145046524356869, "step": 470}, {"loss": 1.1548, "grad_norm": 0.302224338054657, "learning_rate": 0.0002, "epoch": 0.5254515599343186, "step": 480}, {"loss": 1.1029, "grad_norm": 0.28309789299964905, "learning_rate": 0.0002, "epoch": 0.5363984674329502, "step": 490}, {"loss": 1.1108, "grad_norm": 0.4141816794872284, "learning_rate": 0.0002, "epoch": 0.5473453749315819, "step": 500}, {"loss": 1.2211, "grad_norm": 0.2904048562049866, "learning_rate": 0.0002, "epoch": 0.5582922824302134, "step": 510}, {"loss": 1.0401, "grad_norm": 0.317196249961853, "learning_rate": 0.0002, "epoch": 0.5692391899288451, "step": 520}, {"loss": 1.2524, "grad_norm": 0.42272260785102844, "learning_rate": 0.0002, "epoch": 0.5801860974274767, "step": 530}, {"loss": 1.1669, "grad_norm": 0.4033694863319397, "learning_rate": 0.0002, "epoch": 0.5911330049261084, "step": 540}, {"loss": 1.1153, "grad_norm": 0.3533627688884735, "learning_rate": 0.0002, "epoch": 0.60207991242474, "step": 550}, {"loss": 1.103, "grad_norm": 0.29309213161468506, "learning_rate": 0.0002, "epoch": 0.6130268199233716, "step": 560}, {"loss": 1.1116, "grad_norm": 0.3232715129852295, "learning_rate": 0.0002, "epoch": 0.6239737274220033, "step": 570}, {"loss": 1.0783, "grad_norm": 0.2727753221988678, "learning_rate": 0.0002, "epoch": 0.6349206349206349, "step": 580}, {"loss": 1.1156, "grad_norm": 0.3401601314544678, "learning_rate": 0.0002, "epoch": 0.6458675424192666, "step": 590}, {"loss": 1.1339, "grad_norm": 0.8788238167762756, "learning_rate": 0.0002, "epoch": 0.6568144499178982, "step": 600}, {"loss": 1.0589, "grad_norm": 0.27555420994758606, "learning_rate": 0.0002, "epoch": 0.6677613574165299, "step": 610}, {"loss": 1.2142, "grad_norm": 0.3252176344394684, "learning_rate": 0.0002, "epoch": 0.6787082649151615, "step": 620}, {"loss": 1.224, "grad_norm": 0.3039418160915375, "learning_rate": 0.0002, "epoch": 0.6896551724137931, "step": 630}, {"loss": 1.2451, "grad_norm": 0.2943780720233917, "learning_rate": 0.0002, "epoch": 0.7006020799124247, "step": 640}, {"loss": 1.0915, "grad_norm": 0.33507248759269714, "learning_rate": 0.0002, "epoch": 0.7115489874110563, "step": 650}, {"loss": 1.1423, "grad_norm": 0.3620617091655731, "learning_rate": 0.0002, "epoch": 0.722495894909688, "step": 660}, {"loss": 1.1067, "grad_norm": 0.3180168569087982, "learning_rate": 0.0002, "epoch": 0.7334428024083196, "step": 670}, {"loss": 1.2561, "grad_norm": 0.3375033438205719, "learning_rate": 0.0002, "epoch": 0.7443897099069513, "step": 680}, {"loss": 1.0856, "grad_norm": 0.29707568883895874, "learning_rate": 0.0002, "epoch": 0.7553366174055829, "step": 690}, {"loss": 1.2042, "grad_norm": 0.3066735565662384, "learning_rate": 0.0002, "epoch": 0.7662835249042146, "step": 700}, {"loss": 0.9978, "grad_norm": 0.2804313898086548, "learning_rate": 0.0002, "epoch": 0.7772304324028462, "step": 710}, {"loss": 1.2349, "grad_norm": 0.3300948143005371, "learning_rate": 0.0002, "epoch": 0.7881773399014779, "step": 720}, {"loss": 1.2455, "grad_norm": 0.29191678762435913, "learning_rate": 0.0002, "epoch": 0.7991242474001095, "step": 730}, {"loss": 1.0589, "grad_norm": 0.30186912417411804, "learning_rate": 0.0002, "epoch": 0.8100711548987412, "step": 740}, {"loss": 1.2503, "grad_norm": 0.40812981128692627, "learning_rate": 0.0002, "epoch": 0.8210180623973727, "step": 750}, {"loss": 1.1592, "grad_norm": 0.3179655969142914, "learning_rate": 0.0002, "epoch": 0.8319649698960043, "step": 760}, {"loss": 1.2694, "grad_norm": 0.5074726939201355, "learning_rate": 0.0002, "epoch": 0.842911877394636, "step": 770}, {"loss": 1.0917, "grad_norm": 0.3446512520313263, "learning_rate": 0.0002, "epoch": 0.8538587848932676, "step": 780}, {"loss": 1.1031, "grad_norm": 0.2681335508823395, "learning_rate": 0.0002, "epoch": 0.8648056923918993, "step": 790}, {"loss": 1.1202, "grad_norm": 0.3224271237850189, "learning_rate": 0.0002, "epoch": 0.8757525998905309, "step": 800}, {"loss": 1.0978, "grad_norm": 0.24366702139377594, "learning_rate": 0.0002, "epoch": 0.8866995073891626, "step": 810}, {"loss": 1.1914, "grad_norm": 0.3292049467563629, "learning_rate": 0.0002, "epoch": 0.8976464148877942, "step": 820}, {"loss": 1.0203, "grad_norm": 0.3458293676376343, "learning_rate": 0.0002, "epoch": 0.9085933223864259, "step": 830}, {"loss": 0.9739, "grad_norm": 0.22741839289665222, "learning_rate": 0.0002, "epoch": 0.9195402298850575, "step": 840}, {"loss": 1.0687, "grad_norm": 0.6688203811645508, "learning_rate": 0.0002, "epoch": 0.9304871373836892, "step": 850}, {"loss": 1.0351, "grad_norm": 0.274652361869812, "learning_rate": 0.0002, "epoch": 0.9414340448823207, "step": 860}, {"loss": 1.0416, "grad_norm": 0.3694289028644562, "learning_rate": 0.0002, "epoch": 0.9523809523809523, "step": 870}, {"loss": 1.2681, "grad_norm": 0.4757626950740814, "learning_rate": 0.0002, "epoch": 0.963327859879584, "step": 880}, {"loss": 1.1171, "grad_norm": 0.23742415010929108, "learning_rate": 0.0002, "epoch": 0.9742747673782156, "step": 890}, {"loss": 1.0625, "grad_norm": 0.349394291639328, "learning_rate": 0.0002, "epoch": 0.9852216748768473, "step": 900}, {"loss": 1.0922, "grad_norm": 0.2974051833152771, "learning_rate": 0.0002, "epoch": 0.9961685823754789, "step": 910}]} +{"epoch": 0.9994526546250684, "step": 913, "epoch_duration": 4803.790252447128, "total_accumulated_duration": 4803.790252447128, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4028, "grad_norm": 0.795521080493927, "learning_rate": 0.0002, "epoch": 0.010946907498631636, "step": 10}, {"loss": 1.8791, "grad_norm": 0.4686722159385681, "learning_rate": 0.0002, "epoch": 0.021893814997263273, "step": 20}, {"loss": 1.5831, "grad_norm": 0.5660933256149292, "learning_rate": 0.0002, "epoch": 0.03284072249589491, "step": 30}, {"loss": 1.3807, "grad_norm": 1.227329969406128, "learning_rate": 0.0002, "epoch": 0.043787629994526546, "step": 40}, {"loss": 1.3097, "grad_norm": 0.5239948630332947, "learning_rate": 0.0002, "epoch": 0.05473453749315818, "step": 50}, {"loss": 1.2397, "grad_norm": 0.3550681173801422, "learning_rate": 0.0002, "epoch": 0.06568144499178982, "step": 60}, {"loss": 1.3199, "grad_norm": 0.5845893621444702, "learning_rate": 0.0002, "epoch": 0.07662835249042145, "step": 70}, {"loss": 1.1702, "grad_norm": 0.6099892854690552, "learning_rate": 0.0002, "epoch": 0.08757525998905309, "step": 80}, {"loss": 1.1724, "grad_norm": 0.33250030875205994, "learning_rate": 0.0002, "epoch": 0.09852216748768473, "step": 90}, {"loss": 1.2806, "grad_norm": 0.5137067437171936, "learning_rate": 0.0002, "epoch": 0.10946907498631636, "step": 100}, {"loss": 1.2112, "grad_norm": 0.40557053685188293, "learning_rate": 0.0002, "epoch": 0.120415982484948, "step": 110}, {"loss": 1.2597, "grad_norm": 0.3856102526187897, "learning_rate": 0.0002, "epoch": 0.13136288998357964, "step": 120}, {"loss": 1.2382, "grad_norm": 0.3643055856227875, "learning_rate": 0.0002, "epoch": 0.1423097974822113, "step": 130}, {"loss": 1.2199, "grad_norm": 0.3158377408981323, "learning_rate": 0.0002, "epoch": 0.1532567049808429, "step": 140}, {"loss": 1.0512, "grad_norm": 0.31467798352241516, "learning_rate": 0.0002, "epoch": 0.16420361247947454, "step": 150}, {"loss": 1.1622, "grad_norm": 0.33398908376693726, "learning_rate": 0.0002, "epoch": 0.17515051997810618, "step": 160}, {"loss": 1.2859, "grad_norm": 0.38519906997680664, "learning_rate": 0.0002, "epoch": 0.18609742747673783, "step": 170}, {"loss": 1.1762, "grad_norm": 0.36663818359375, "learning_rate": 0.0002, "epoch": 0.19704433497536947, "step": 180}, {"loss": 1.067, "grad_norm": 0.282659113407135, "learning_rate": 0.0002, "epoch": 0.20799124247400108, "step": 190}, {"loss": 1.079, "grad_norm": 0.2818695604801178, "learning_rate": 0.0002, "epoch": 0.21893814997263272, "step": 200}, {"loss": 1.1359, "grad_norm": 0.39851438999176025, "learning_rate": 0.0002, "epoch": 0.22988505747126436, "step": 210}, {"loss": 1.1538, "grad_norm": 0.31774890422821045, "learning_rate": 0.0002, "epoch": 0.240831964969896, "step": 220}, {"loss": 1.1428, "grad_norm": 0.2777709364891052, "learning_rate": 0.0002, "epoch": 0.25177887246852765, "step": 230}, {"loss": 1.2741, "grad_norm": 0.3904445469379425, "learning_rate": 0.0002, "epoch": 0.2627257799671593, "step": 240}, {"loss": 1.1889, "grad_norm": 0.4349442720413208, "learning_rate": 0.0002, "epoch": 0.27367268746579093, "step": 250}, {"loss": 1.1855, "grad_norm": 0.2749781310558319, "learning_rate": 0.0002, "epoch": 0.2846195949644226, "step": 260}, {"loss": 1.1879, "grad_norm": 0.3258635699748993, "learning_rate": 0.0002, "epoch": 0.2955665024630542, "step": 270}, {"loss": 1.0624, "grad_norm": 0.30227264761924744, "learning_rate": 0.0002, "epoch": 0.3065134099616858, "step": 280}, {"loss": 1.1725, "grad_norm": 0.3817561864852905, "learning_rate": 0.0002, "epoch": 0.31746031746031744, "step": 290}, {"loss": 1.3129, "grad_norm": 0.39917275309562683, "learning_rate": 0.0002, "epoch": 0.3284072249589491, "step": 300}, {"loss": 1.2445, "grad_norm": 0.42792218923568726, "learning_rate": 0.0002, "epoch": 0.3393541324575807, "step": 310}, {"loss": 1.1396, "grad_norm": 0.4719575345516205, "learning_rate": 0.0002, "epoch": 0.35030103995621237, "step": 320}, {"loss": 1.1918, "grad_norm": 0.2811720371246338, "learning_rate": 0.0002, "epoch": 0.361247947454844, "step": 330}, {"loss": 1.195, "grad_norm": 0.3931334912776947, "learning_rate": 0.0002, "epoch": 0.37219485495347565, "step": 340}, {"loss": 1.1617, "grad_norm": 0.4623820185661316, "learning_rate": 0.0002, "epoch": 0.3831417624521073, "step": 350}, {"loss": 1.2825, "grad_norm": 0.45300203561782837, "learning_rate": 0.0002, "epoch": 0.39408866995073893, "step": 360}, {"loss": 1.151, "grad_norm": 0.3399786353111267, "learning_rate": 0.0002, "epoch": 0.4050355774493706, "step": 370}, {"loss": 1.18, "grad_norm": 0.3179274797439575, "learning_rate": 0.0002, "epoch": 0.41598248494800216, "step": 380}, {"loss": 1.1562, "grad_norm": 0.37702685594558716, "learning_rate": 0.0002, "epoch": 0.4269293924466338, "step": 390}, {"loss": 1.174, "grad_norm": 0.5325851440429688, "learning_rate": 0.0002, "epoch": 0.43787629994526545, "step": 400}, {"loss": 1.1093, "grad_norm": 0.266667902469635, "learning_rate": 0.0002, "epoch": 0.4488232074438971, "step": 410}, {"loss": 1.1297, "grad_norm": 0.3148106634616852, "learning_rate": 0.0002, "epoch": 0.45977011494252873, "step": 420}, {"loss": 1.1729, "grad_norm": 0.26062336564064026, "learning_rate": 0.0002, "epoch": 0.47071702244116037, "step": 430}, {"loss": 1.1211, "grad_norm": 0.2696126103401184, "learning_rate": 0.0002, "epoch": 0.481663929939792, "step": 440}, {"loss": 1.1657, "grad_norm": 0.3823620676994324, "learning_rate": 0.0002, "epoch": 0.49261083743842365, "step": 450}, {"loss": 1.1242, "grad_norm": 0.3216341435909271, "learning_rate": 0.0002, "epoch": 0.5035577449370553, "step": 460}, {"loss": 1.1854, "grad_norm": 0.39548951387405396, "learning_rate": 0.0002, "epoch": 0.5145046524356869, "step": 470}, {"loss": 1.1529, "grad_norm": 0.3132643699645996, "learning_rate": 0.0002, "epoch": 0.5254515599343186, "step": 480}, {"loss": 1.0981, "grad_norm": 0.29527464509010315, "learning_rate": 0.0002, "epoch": 0.5363984674329502, "step": 490}, {"loss": 1.1112, "grad_norm": 0.386337012052536, "learning_rate": 0.0002, "epoch": 0.5473453749315819, "step": 500}, {"loss": 1.2175, "grad_norm": 0.3230147957801819, "learning_rate": 0.0002, "epoch": 0.5582922824302134, "step": 510}, {"loss": 1.0429, "grad_norm": 0.3255913257598877, "learning_rate": 0.0002, "epoch": 0.5692391899288451, "step": 520}, {"loss": 1.2527, "grad_norm": 0.42526349425315857, "learning_rate": 0.0002, "epoch": 0.5801860974274767, "step": 530}, {"loss": 1.1682, "grad_norm": 0.3711594045162201, "learning_rate": 0.0002, "epoch": 0.5911330049261084, "step": 540}, {"loss": 1.114, "grad_norm": 0.3699112832546234, "learning_rate": 0.0002, "epoch": 0.60207991242474, "step": 550}, {"loss": 1.1013, "grad_norm": 0.29559260606765747, "learning_rate": 0.0002, "epoch": 0.6130268199233716, "step": 560}, {"loss": 1.1161, "grad_norm": 0.35342633724212646, "learning_rate": 0.0002, "epoch": 0.6239737274220033, "step": 570}, {"loss": 1.0747, "grad_norm": 0.27879607677459717, "learning_rate": 0.0002, "epoch": 0.6349206349206349, "step": 580}, {"loss": 1.1178, "grad_norm": 0.34217947721481323, "learning_rate": 0.0002, "epoch": 0.6458675424192666, "step": 590}, {"loss": 1.1317, "grad_norm": 0.9936075806617737, "learning_rate": 0.0002, "epoch": 0.6568144499178982, "step": 600}, {"loss": 1.0571, "grad_norm": 0.27797362208366394, "learning_rate": 0.0002, "epoch": 0.6677613574165299, "step": 610}, {"loss": 1.2133, "grad_norm": 0.3185802400112152, "learning_rate": 0.0002, "epoch": 0.6787082649151615, "step": 620}, {"loss": 1.2187, "grad_norm": 0.3263399302959442, "learning_rate": 0.0002, "epoch": 0.6896551724137931, "step": 630}, {"loss": 1.2469, "grad_norm": 0.3052294850349426, "learning_rate": 0.0002, "epoch": 0.7006020799124247, "step": 640}, {"loss": 1.0882, "grad_norm": 0.3395131826400757, "learning_rate": 0.0002, "epoch": 0.7115489874110563, "step": 650}, {"loss": 1.1404, "grad_norm": 0.35718032717704773, "learning_rate": 0.0002, "epoch": 0.722495894909688, "step": 660}, {"loss": 1.1039, "grad_norm": 0.37328389286994934, "learning_rate": 0.0002, "epoch": 0.7334428024083196, "step": 670}, {"loss": 1.2621, "grad_norm": 0.3185192048549652, "learning_rate": 0.0002, "epoch": 0.7443897099069513, "step": 680}, {"loss": 1.0847, "grad_norm": 0.2795262932777405, "learning_rate": 0.0002, "epoch": 0.7553366174055829, "step": 690}, {"loss": 1.2064, "grad_norm": 0.29796719551086426, "learning_rate": 0.0002, "epoch": 0.7662835249042146, "step": 700}, {"loss": 0.9993, "grad_norm": 0.3695313632488251, "learning_rate": 0.0002, "epoch": 0.7772304324028462, "step": 710}, {"loss": 1.2245, "grad_norm": 0.31174781918525696, "learning_rate": 0.0002, "epoch": 0.7881773399014779, "step": 720}, {"loss": 1.2443, "grad_norm": 0.2640160620212555, "learning_rate": 0.0002, "epoch": 0.7991242474001095, "step": 730}, {"loss": 1.0605, "grad_norm": 0.31823453307151794, "learning_rate": 0.0002, "epoch": 0.8100711548987412, "step": 740}, {"loss": 1.2524, "grad_norm": 0.46503135561943054, "learning_rate": 0.0002, "epoch": 0.8210180623973727, "step": 750}, {"loss": 1.1609, "grad_norm": 0.3179810345172882, "learning_rate": 0.0002, "epoch": 0.8319649698960043, "step": 760}, {"loss": 1.2593, "grad_norm": 0.4537338614463806, "learning_rate": 0.0002, "epoch": 0.842911877394636, "step": 770}, {"loss": 1.0941, "grad_norm": 0.4113718867301941, "learning_rate": 0.0002, "epoch": 0.8538587848932676, "step": 780}, {"loss": 1.1023, "grad_norm": 0.2963421046733856, "learning_rate": 0.0002, "epoch": 0.8648056923918993, "step": 790}, {"loss": 1.1194, "grad_norm": 0.33587151765823364, "learning_rate": 0.0002, "epoch": 0.8757525998905309, "step": 800}, {"loss": 1.0984, "grad_norm": 0.23537468910217285, "learning_rate": 0.0002, "epoch": 0.8866995073891626, "step": 810}, {"loss": 1.1927, "grad_norm": 0.31468865275382996, "learning_rate": 0.0002, "epoch": 0.8976464148877942, "step": 820}, {"loss": 1.0221, "grad_norm": 0.336098313331604, "learning_rate": 0.0002, "epoch": 0.9085933223864259, "step": 830}, {"loss": 0.9768, "grad_norm": 0.2583458721637726, "learning_rate": 0.0002, "epoch": 0.9195402298850575, "step": 840}, {"loss": 1.0669, "grad_norm": 0.3173564076423645, "learning_rate": 0.0002, "epoch": 0.9304871373836892, "step": 850}, {"loss": 1.0359, "grad_norm": 0.27985909581184387, "learning_rate": 0.0002, "epoch": 0.9414340448823207, "step": 860}, {"loss": 1.0416, "grad_norm": 0.3379824459552765, "learning_rate": 0.0002, "epoch": 0.9523809523809523, "step": 870}, {"loss": 1.2637, "grad_norm": 0.4532661736011505, "learning_rate": 0.0002, "epoch": 0.963327859879584, "step": 880}, {"loss": 1.1143, "grad_norm": 0.24834831058979034, "learning_rate": 0.0002, "epoch": 0.9742747673782156, "step": 890}, {"loss": 1.0629, "grad_norm": 0.2924191951751709, "learning_rate": 0.0002, "epoch": 0.9852216748768473, "step": 900}, {"loss": 1.0929, "grad_norm": 0.29473212361335754, "learning_rate": 0.0002, "epoch": 0.9961685823754789, "step": 910}]} +{"epoch": 0.9994526546250684, "step": 913, "epoch_duration": 3111.739435195923, "total_accumulated_duration": 3111.739435195923, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4064, "grad_norm": 0.8315525650978088, "learning_rate": 0.0002, "epoch": 0.010946907498631636, "step": 10}, {"loss": 1.8858, "grad_norm": 0.4768979549407959, "learning_rate": 0.0002, "epoch": 0.021893814997263273, "step": 20}, {"loss": 1.598, "grad_norm": 0.4891189634799957, "learning_rate": 0.0002, "epoch": 0.03284072249589491, "step": 30}, {"loss": 1.3819, "grad_norm": 0.5654677152633667, "learning_rate": 0.0002, "epoch": 0.043787629994526546, "step": 40}, {"loss": 1.3084, "grad_norm": 0.4613725244998932, "learning_rate": 0.0002, "epoch": 0.05473453749315818, "step": 50}, {"loss": 1.2362, "grad_norm": 0.3586723208427429, "learning_rate": 0.0002, "epoch": 0.06568144499178982, "step": 60}, {"loss": 1.3189, "grad_norm": 0.4401360750198364, "learning_rate": 0.0002, "epoch": 0.07662835249042145, "step": 70}, {"loss": 1.1714, "grad_norm": 0.9771654605865479, "learning_rate": 0.0002, "epoch": 0.08757525998905309, "step": 80}, {"loss": 1.1746, "grad_norm": 0.34469878673553467, "learning_rate": 0.0002, "epoch": 0.09852216748768473, "step": 90}, {"loss": 1.2769, "grad_norm": 0.3707507848739624, "learning_rate": 0.0002, "epoch": 0.10946907498631636, "step": 100}, {"loss": 1.2157, "grad_norm": 0.4154447615146637, "learning_rate": 0.0002, "epoch": 0.120415982484948, "step": 110}, {"loss": 1.2554, "grad_norm": 0.7811278700828552, "learning_rate": 0.0002, "epoch": 0.13136288998357964, "step": 120}, {"loss": 1.2418, "grad_norm": 0.3964453339576721, "learning_rate": 0.0002, "epoch": 0.1423097974822113, "step": 130}, {"loss": 1.221, "grad_norm": 0.32972392439842224, "learning_rate": 0.0002, "epoch": 0.1532567049808429, "step": 140}, {"loss": 1.0478, "grad_norm": 0.328593373298645, "learning_rate": 0.0002, "epoch": 0.16420361247947454, "step": 150}, {"loss": 1.1592, "grad_norm": 0.33038654923439026, "learning_rate": 0.0002, "epoch": 0.17515051997810618, "step": 160}, {"loss": 1.2844, "grad_norm": 0.4016525149345398, "learning_rate": 0.0002, "epoch": 0.18609742747673783, "step": 170}, {"loss": 1.172, "grad_norm": 0.38213640451431274, "learning_rate": 0.0002, "epoch": 0.19704433497536947, "step": 180}, {"loss": 1.0633, "grad_norm": 0.2944932281970978, "learning_rate": 0.0002, "epoch": 0.20799124247400108, "step": 190}, {"loss": 1.0803, "grad_norm": 0.2967860698699951, "learning_rate": 0.0002, "epoch": 0.21893814997263272, "step": 200}, {"loss": 1.1353, "grad_norm": 0.6144160032272339, "learning_rate": 0.0002, "epoch": 0.22988505747126436, "step": 210}, {"loss": 1.1559, "grad_norm": 0.3172452747821808, "learning_rate": 0.0002, "epoch": 0.240831964969896, "step": 220}, {"loss": 1.1414, "grad_norm": 0.29329076409339905, "learning_rate": 0.0002, "epoch": 0.25177887246852765, "step": 230}, {"loss": 1.2705, "grad_norm": 0.3799499273300171, "learning_rate": 0.0002, "epoch": 0.2627257799671593, "step": 240}, {"loss": 1.1869, "grad_norm": 0.3606453835964203, "learning_rate": 0.0002, "epoch": 0.27367268746579093, "step": 250}, {"loss": 1.1898, "grad_norm": 0.2818147540092468, "learning_rate": 0.0002, "epoch": 0.2846195949644226, "step": 260}, {"loss": 1.1907, "grad_norm": 0.43860194087028503, "learning_rate": 0.0002, "epoch": 0.2955665024630542, "step": 270}, {"loss": 1.0607, "grad_norm": 0.2921052575111389, "learning_rate": 0.0002, "epoch": 0.3065134099616858, "step": 280}, {"loss": 1.1744, "grad_norm": 0.30841436982154846, "learning_rate": 0.0002, "epoch": 0.31746031746031744, "step": 290}, {"loss": 1.3094, "grad_norm": 0.6104546785354614, "learning_rate": 0.0002, "epoch": 0.3284072249589491, "step": 300}, {"loss": 1.244, "grad_norm": 1.3919378519058228, "learning_rate": 0.0002, "epoch": 0.3393541324575807, "step": 310}, {"loss": 1.1422, "grad_norm": 0.4001498222351074, "learning_rate": 0.0002, "epoch": 0.35030103995621237, "step": 320}, {"loss": 1.1936, "grad_norm": 0.2854520380496979, "learning_rate": 0.0002, "epoch": 0.361247947454844, "step": 330}, {"loss": 1.2009, "grad_norm": 0.361251562833786, "learning_rate": 0.0002, "epoch": 0.37219485495347565, "step": 340}, {"loss": 1.1582, "grad_norm": 0.3750649690628052, "learning_rate": 0.0002, "epoch": 0.3831417624521073, "step": 350}, {"loss": 1.2793, "grad_norm": 0.40161052346229553, "learning_rate": 0.0002, "epoch": 0.39408866995073893, "step": 360}, {"loss": 1.1501, "grad_norm": 0.3158400058746338, "learning_rate": 0.0002, "epoch": 0.4050355774493706, "step": 370}, {"loss": 1.1825, "grad_norm": 0.30031272768974304, "learning_rate": 0.0002, "epoch": 0.41598248494800216, "step": 380}, {"loss": 1.1562, "grad_norm": 0.33606818318367004, "learning_rate": 0.0002, "epoch": 0.4269293924466338, "step": 390}, {"loss": 1.1765, "grad_norm": 0.49571487307548523, "learning_rate": 0.0002, "epoch": 0.43787629994526545, "step": 400}, {"loss": 1.1089, "grad_norm": 0.28612610697746277, "learning_rate": 0.0002, "epoch": 0.4488232074438971, "step": 410}, {"loss": 1.1275, "grad_norm": 0.29380208253860474, "learning_rate": 0.0002, "epoch": 0.45977011494252873, "step": 420}, {"loss": 1.173, "grad_norm": 0.2754780054092407, "learning_rate": 0.0002, "epoch": 0.47071702244116037, "step": 430}, {"loss": 1.1198, "grad_norm": 0.7208526134490967, "learning_rate": 0.0002, "epoch": 0.481663929939792, "step": 440}, {"loss": 1.1645, "grad_norm": 0.5081837773323059, "learning_rate": 0.0002, "epoch": 0.49261083743842365, "step": 450}, {"loss": 1.1279, "grad_norm": 0.20326904952526093, "learning_rate": 0.0002, "epoch": 0.5035577449370553, "step": 460}, {"loss": 1.1823, "grad_norm": 0.368084192276001, "learning_rate": 0.0002, "epoch": 0.5145046524356869, "step": 470}, {"loss": 1.1567, "grad_norm": 0.3025706112384796, "learning_rate": 0.0002, "epoch": 0.5254515599343186, "step": 480}, {"loss": 1.101, "grad_norm": 0.3122895658016205, "learning_rate": 0.0002, "epoch": 0.5363984674329502, "step": 490}, {"loss": 1.1089, "grad_norm": 0.4389672577381134, "learning_rate": 0.0002, "epoch": 0.5473453749315819, "step": 500}, {"loss": 1.2199, "grad_norm": 0.38763660192489624, "learning_rate": 0.0002, "epoch": 0.5582922824302134, "step": 510}, {"loss": 1.042, "grad_norm": 0.32445019483566284, "learning_rate": 0.0002, "epoch": 0.5692391899288451, "step": 520}, {"loss": 1.2558, "grad_norm": 0.40850868821144104, "learning_rate": 0.0002, "epoch": 0.5801860974274767, "step": 530}, {"loss": 1.1671, "grad_norm": 0.4462431073188782, "learning_rate": 0.0002, "epoch": 0.5911330049261084, "step": 540}, {"loss": 1.1108, "grad_norm": 0.3874809741973877, "learning_rate": 0.0002, "epoch": 0.60207991242474, "step": 550}, {"loss": 1.1014, "grad_norm": 0.3020818531513214, "learning_rate": 0.0002, "epoch": 0.6130268199233716, "step": 560}, {"loss": 1.1136, "grad_norm": 0.31303995847702026, "learning_rate": 0.0002, "epoch": 0.6239737274220033, "step": 570}, {"loss": 1.076, "grad_norm": 0.26297080516815186, "learning_rate": 0.0002, "epoch": 0.6349206349206349, "step": 580}, {"loss": 1.1163, "grad_norm": 0.3534374535083771, "learning_rate": 0.0002, "epoch": 0.6458675424192666, "step": 590}, {"loss": 1.1344, "grad_norm": 0.6948974132537842, "learning_rate": 0.0002, "epoch": 0.6568144499178982, "step": 600}, {"loss": 1.0619, "grad_norm": 0.27238744497299194, "learning_rate": 0.0002, "epoch": 0.6677613574165299, "step": 610}, {"loss": 1.2142, "grad_norm": 0.3345783054828644, "learning_rate": 0.0002, "epoch": 0.6787082649151615, "step": 620}, {"loss": 1.2194, "grad_norm": 0.34639596939086914, "learning_rate": 0.0002, "epoch": 0.6896551724137931, "step": 630}, {"loss": 1.2466, "grad_norm": 0.28842809796333313, "learning_rate": 0.0002, "epoch": 0.7006020799124247, "step": 640}, {"loss": 1.0916, "grad_norm": 0.3722135126590729, "learning_rate": 0.0002, "epoch": 0.7115489874110563, "step": 650}, {"loss": 1.1421, "grad_norm": 0.36706414818763733, "learning_rate": 0.0002, "epoch": 0.722495894909688, "step": 660}, {"loss": 1.1063, "grad_norm": 0.2937333881855011, "learning_rate": 0.0002, "epoch": 0.7334428024083196, "step": 670}, {"loss": 1.2621, "grad_norm": 0.3463886082172394, "learning_rate": 0.0002, "epoch": 0.7443897099069513, "step": 680}, {"loss": 1.0862, "grad_norm": 0.2808016538619995, "learning_rate": 0.0002, "epoch": 0.7553366174055829, "step": 690}, {"loss": 1.2038, "grad_norm": 0.3184432089328766, "learning_rate": 0.0002, "epoch": 0.7662835249042146, "step": 700}, {"loss": 0.9983, "grad_norm": 0.29998502135276794, "learning_rate": 0.0002, "epoch": 0.7772304324028462, "step": 710}, {"loss": 1.2272, "grad_norm": 0.32185453176498413, "learning_rate": 0.0002, "epoch": 0.7881773399014779, "step": 720}, {"loss": 1.2452, "grad_norm": 0.27004799246788025, "learning_rate": 0.0002, "epoch": 0.7991242474001095, "step": 730}, {"loss": 1.0642, "grad_norm": 0.3320857882499695, "learning_rate": 0.0002, "epoch": 0.8100711548987412, "step": 740}, {"loss": 1.2558, "grad_norm": 0.37585026025772095, "learning_rate": 0.0002, "epoch": 0.8210180623973727, "step": 750}, {"loss": 1.16, "grad_norm": 0.32140469551086426, "learning_rate": 0.0002, "epoch": 0.8319649698960043, "step": 760}, {"loss": 1.2662, "grad_norm": 0.418083518743515, "learning_rate": 0.0002, "epoch": 0.842911877394636, "step": 770}, {"loss": 1.0936, "grad_norm": 0.42758217453956604, "learning_rate": 0.0002, "epoch": 0.8538587848932676, "step": 780}, {"loss": 1.1041, "grad_norm": 0.27828508615493774, "learning_rate": 0.0002, "epoch": 0.8648056923918993, "step": 790}, {"loss": 1.119, "grad_norm": 0.3195190131664276, "learning_rate": 0.0002, "epoch": 0.8757525998905309, "step": 800}, {"loss": 1.0979, "grad_norm": 0.2453049123287201, "learning_rate": 0.0002, "epoch": 0.8866995073891626, "step": 810}, {"loss": 1.1904, "grad_norm": 0.3614264726638794, "learning_rate": 0.0002, "epoch": 0.8976464148877942, "step": 820}, {"loss": 1.0221, "grad_norm": 0.3742782175540924, "learning_rate": 0.0002, "epoch": 0.9085933223864259, "step": 830}, {"loss": 0.9742, "grad_norm": 0.22376364469528198, "learning_rate": 0.0002, "epoch": 0.9195402298850575, "step": 840}, {"loss": 1.0657, "grad_norm": 0.4903332591056824, "learning_rate": 0.0002, "epoch": 0.9304871373836892, "step": 850}, {"loss": 1.0393, "grad_norm": 0.2924495339393616, "learning_rate": 0.0002, "epoch": 0.9414340448823207, "step": 860}, {"loss": 1.0412, "grad_norm": 0.3286125063896179, "learning_rate": 0.0002, "epoch": 0.9523809523809523, "step": 870}, {"loss": 1.2697, "grad_norm": 0.45306113362312317, "learning_rate": 0.0002, "epoch": 0.963327859879584, "step": 880}, {"loss": 1.1151, "grad_norm": 0.236796036362648, "learning_rate": 0.0002, "epoch": 0.9742747673782156, "step": 890}, {"loss": 1.0625, "grad_norm": 0.25988534092903137, "learning_rate": 0.0002, "epoch": 0.9852216748768473, "step": 900}, {"loss": 1.0953, "grad_norm": 0.2976899743080139, "learning_rate": 0.0002, "epoch": 0.9961685823754789, "step": 910}]} +{"epoch": 0.9994526546250684, "step": 913, "epoch_duration": 4324.058840036392, "total_accumulated_duration": 4324.058840036392, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4025, "grad_norm": 0.8413946628570557, "learning_rate": 0.0002, "epoch": 0.010946907498631636, "step": 10}, {"loss": 1.8765, "grad_norm": 0.4706946611404419, "learning_rate": 0.0002, "epoch": 0.021893814997263273, "step": 20}, {"loss": 1.5774, "grad_norm": 0.592414140701294, "learning_rate": 0.0002, "epoch": 0.03284072249589491, "step": 30}, {"loss": 1.3784, "grad_norm": 0.5266873240470886, "learning_rate": 0.0002, "epoch": 0.043787629994526546, "step": 40}, {"loss": 1.3048, "grad_norm": 0.42714616656303406, "learning_rate": 0.0002, "epoch": 0.05473453749315818, "step": 50}, {"loss": 1.2416, "grad_norm": 0.33732205629348755, "learning_rate": 0.0002, "epoch": 0.06568144499178982, "step": 60}, {"loss": 1.3164, "grad_norm": 0.3919900059700012, "learning_rate": 0.0002, "epoch": 0.07662835249042145, "step": 70}, {"loss": 1.1686, "grad_norm": 0.6971392035484314, "learning_rate": 0.0002, "epoch": 0.08757525998905309, "step": 80}, {"loss": 1.1688, "grad_norm": 0.351304829120636, "learning_rate": 0.0002, "epoch": 0.09852216748768473, "step": 90}, {"loss": 1.2758, "grad_norm": 0.36268818378448486, "learning_rate": 0.0002, "epoch": 0.10946907498631636, "step": 100}, {"loss": 1.2094, "grad_norm": 0.41704121232032776, "learning_rate": 0.0002, "epoch": 0.120415982484948, "step": 110}, {"loss": 1.2561, "grad_norm": 0.3970317542552948, "learning_rate": 0.0002, "epoch": 0.13136288998357964, "step": 120}, {"loss": 1.2395, "grad_norm": 0.3852194845676422, "learning_rate": 0.0002, "epoch": 0.1423097974822113, "step": 130}, {"loss": 1.2199, "grad_norm": 0.30831000208854675, "learning_rate": 0.0002, "epoch": 0.1532567049808429, "step": 140}, {"loss": 1.0462, "grad_norm": 0.31121566891670227, "learning_rate": 0.0002, "epoch": 0.16420361247947454, "step": 150}, {"loss": 1.1639, "grad_norm": 0.34039783477783203, "learning_rate": 0.0002, "epoch": 0.17515051997810618, "step": 160}, {"loss": 1.2867, "grad_norm": 0.36704662442207336, "learning_rate": 0.0002, "epoch": 0.18609742747673783, "step": 170}, {"loss": 1.1745, "grad_norm": 0.36384010314941406, "learning_rate": 0.0002, "epoch": 0.19704433497536947, "step": 180}, {"loss": 1.0667, "grad_norm": 0.31086239218711853, "learning_rate": 0.0002, "epoch": 0.20799124247400108, "step": 190}, {"loss": 1.0798, "grad_norm": 0.35405558347702026, "learning_rate": 0.0002, "epoch": 0.21893814997263272, "step": 200}, {"loss": 1.1348, "grad_norm": 0.6467314958572388, "learning_rate": 0.0002, "epoch": 0.22988505747126436, "step": 210}, {"loss": 1.1532, "grad_norm": 0.32341739535331726, "learning_rate": 0.0002, "epoch": 0.240831964969896, "step": 220}, {"loss": 1.1381, "grad_norm": 0.27438628673553467, "learning_rate": 0.0002, "epoch": 0.25177887246852765, "step": 230}, {"loss": 1.272, "grad_norm": 0.37504860758781433, "learning_rate": 0.0002, "epoch": 0.2627257799671593, "step": 240}, {"loss": 1.1841, "grad_norm": 0.38969483971595764, "learning_rate": 0.0002, "epoch": 0.27367268746579093, "step": 250}, {"loss": 1.1885, "grad_norm": 0.27647948265075684, "learning_rate": 0.0002, "epoch": 0.2846195949644226, "step": 260}, {"loss": 1.1869, "grad_norm": 0.30249834060668945, "learning_rate": 0.0002, "epoch": 0.2955665024630542, "step": 270}, {"loss": 1.058, "grad_norm": 1.0534214973449707, "learning_rate": 0.0002, "epoch": 0.3065134099616858, "step": 280}, {"loss": 1.1712, "grad_norm": 0.30905938148498535, "learning_rate": 0.0002, "epoch": 0.31746031746031744, "step": 290}, {"loss": 1.3124, "grad_norm": 0.8192272186279297, "learning_rate": 0.0002, "epoch": 0.3284072249589491, "step": 300}, {"loss": 1.2442, "grad_norm": 0.7817836999893188, "learning_rate": 0.0002, "epoch": 0.3393541324575807, "step": 310}, {"loss": 1.1415, "grad_norm": 0.36925482749938965, "learning_rate": 0.0002, "epoch": 0.35030103995621237, "step": 320}, {"loss": 1.1926, "grad_norm": 0.2638542652130127, "learning_rate": 0.0002, "epoch": 0.361247947454844, "step": 330}, {"loss": 1.2021, "grad_norm": 0.39350607991218567, "learning_rate": 0.0002, "epoch": 0.37219485495347565, "step": 340}, {"loss": 1.1669, "grad_norm": 0.40172526240348816, "learning_rate": 0.0002, "epoch": 0.3831417624521073, "step": 350}, {"loss": 1.2842, "grad_norm": 0.8559852838516235, "learning_rate": 0.0002, "epoch": 0.39408866995073893, "step": 360}, {"loss": 1.1495, "grad_norm": 0.3007528781890869, "learning_rate": 0.0002, "epoch": 0.4050355774493706, "step": 370}, {"loss": 1.1819, "grad_norm": 0.3273051083087921, "learning_rate": 0.0002, "epoch": 0.41598248494800216, "step": 380}, {"loss": 1.1567, "grad_norm": 0.35510483384132385, "learning_rate": 0.0002, "epoch": 0.4269293924466338, "step": 390}, {"loss": 1.1748, "grad_norm": 0.574571430683136, "learning_rate": 0.0002, "epoch": 0.43787629994526545, "step": 400}, {"loss": 1.105, "grad_norm": 0.26277998089790344, "learning_rate": 0.0002, "epoch": 0.4488232074438971, "step": 410}, {"loss": 1.1307, "grad_norm": 0.2898577153682709, "learning_rate": 0.0002, "epoch": 0.45977011494252873, "step": 420}, {"loss": 1.1721, "grad_norm": 0.26525044441223145, "learning_rate": 0.0002, "epoch": 0.47071702244116037, "step": 430}, {"loss": 1.1192, "grad_norm": 0.26191475987434387, "learning_rate": 0.0002, "epoch": 0.481663929939792, "step": 440}, {"loss": 1.1653, "grad_norm": 0.4236069619655609, "learning_rate": 0.0002, "epoch": 0.49261083743842365, "step": 450}, {"loss": 1.1258, "grad_norm": 0.21045465767383575, "learning_rate": 0.0002, "epoch": 0.5035577449370553, "step": 460}, {"loss": 1.182, "grad_norm": 0.3767169415950775, "learning_rate": 0.0002, "epoch": 0.5145046524356869, "step": 470}, {"loss": 1.1549, "grad_norm": 0.30658408999443054, "learning_rate": 0.0002, "epoch": 0.5254515599343186, "step": 480}, {"loss": 1.1038, "grad_norm": 0.2885505259037018, "learning_rate": 0.0002, "epoch": 0.5363984674329502, "step": 490}, {"loss": 1.1115, "grad_norm": 0.3905075192451477, "learning_rate": 0.0002, "epoch": 0.5473453749315819, "step": 500}, {"loss": 1.219, "grad_norm": 0.3061444163322449, "learning_rate": 0.0002, "epoch": 0.5582922824302134, "step": 510}, {"loss": 1.0396, "grad_norm": 0.34073352813720703, "learning_rate": 0.0002, "epoch": 0.5692391899288451, "step": 520}, {"loss": 1.2557, "grad_norm": 0.43636074662208557, "learning_rate": 0.0002, "epoch": 0.5801860974274767, "step": 530}, {"loss": 1.1711, "grad_norm": 0.48204460740089417, "learning_rate": 0.0002, "epoch": 0.5911330049261084, "step": 540}, {"loss": 1.1158, "grad_norm": 0.375756174325943, "learning_rate": 0.0002, "epoch": 0.60207991242474, "step": 550}, {"loss": 1.0986, "grad_norm": 0.2920822203159332, "learning_rate": 0.0002, "epoch": 0.6130268199233716, "step": 560}, {"loss": 1.1137, "grad_norm": 0.3567981421947479, "learning_rate": 0.0002, "epoch": 0.6239737274220033, "step": 570}, {"loss": 1.0756, "grad_norm": 0.28140270709991455, "learning_rate": 0.0002, "epoch": 0.6349206349206349, "step": 580}, {"loss": 1.1183, "grad_norm": 0.335514098405838, "learning_rate": 0.0002, "epoch": 0.6458675424192666, "step": 590}, {"loss": 1.1349, "grad_norm": 1.211463212966919, "learning_rate": 0.0002, "epoch": 0.6568144499178982, "step": 600}, {"loss": 1.0611, "grad_norm": 0.26986172795295715, "learning_rate": 0.0002, "epoch": 0.6677613574165299, "step": 610}, {"loss": 1.2121, "grad_norm": 0.3156491816043854, "learning_rate": 0.0002, "epoch": 0.6787082649151615, "step": 620}, {"loss": 1.2252, "grad_norm": 0.3593895435333252, "learning_rate": 0.0002, "epoch": 0.6896551724137931, "step": 630}, {"loss": 1.2449, "grad_norm": 0.29723769426345825, "learning_rate": 0.0002, "epoch": 0.7006020799124247, "step": 640}, {"loss": 1.0915, "grad_norm": 0.37067872285842896, "learning_rate": 0.0002, "epoch": 0.7115489874110563, "step": 650}, {"loss": 1.1409, "grad_norm": 0.4170680344104767, "learning_rate": 0.0002, "epoch": 0.722495894909688, "step": 660}, {"loss": 1.1095, "grad_norm": 0.6047097444534302, "learning_rate": 0.0002, "epoch": 0.7334428024083196, "step": 670}, {"loss": 1.2664, "grad_norm": 0.3834376037120819, "learning_rate": 0.0002, "epoch": 0.7443897099069513, "step": 680}, {"loss": 1.0855, "grad_norm": 0.2889885902404785, "learning_rate": 0.0002, "epoch": 0.7553366174055829, "step": 690}, {"loss": 1.205, "grad_norm": 0.32053008675575256, "learning_rate": 0.0002, "epoch": 0.7662835249042146, "step": 700}, {"loss": 0.9979, "grad_norm": 0.2932562828063965, "learning_rate": 0.0002, "epoch": 0.7772304324028462, "step": 710}, {"loss": 1.2304, "grad_norm": 0.6452378034591675, "learning_rate": 0.0002, "epoch": 0.7881773399014779, "step": 720}, {"loss": 1.2512, "grad_norm": 0.32636845111846924, "learning_rate": 0.0002, "epoch": 0.7991242474001095, "step": 730}, {"loss": 1.0591, "grad_norm": 0.544005274772644, "learning_rate": 0.0002, "epoch": 0.8100711548987412, "step": 740}, {"loss": 1.2528, "grad_norm": 0.4918079972267151, "learning_rate": 0.0002, "epoch": 0.8210180623973727, "step": 750}, {"loss": 1.1599, "grad_norm": 0.3194459080696106, "learning_rate": 0.0002, "epoch": 0.8319649698960043, "step": 760}, {"loss": 1.2657, "grad_norm": 0.5542969107627869, "learning_rate": 0.0002, "epoch": 0.842911877394636, "step": 770}, {"loss": 1.0955, "grad_norm": 0.368278443813324, "learning_rate": 0.0002, "epoch": 0.8538587848932676, "step": 780}, {"loss": 1.1052, "grad_norm": 0.287670373916626, "learning_rate": 0.0002, "epoch": 0.8648056923918993, "step": 790}, {"loss": 1.1203, "grad_norm": 0.3100526034832001, "learning_rate": 0.0002, "epoch": 0.8757525998905309, "step": 800}, {"loss": 1.1008, "grad_norm": 0.25668758153915405, "learning_rate": 0.0002, "epoch": 0.8866995073891626, "step": 810}, {"loss": 1.1867, "grad_norm": 0.3156385123729706, "learning_rate": 0.0002, "epoch": 0.8976464148877942, "step": 820}, {"loss": 1.0224, "grad_norm": 0.33307573199272156, "learning_rate": 0.0002, "epoch": 0.9085933223864259, "step": 830}, {"loss": 0.9724, "grad_norm": 0.22630105912685394, "learning_rate": 0.0002, "epoch": 0.9195402298850575, "step": 840}, {"loss": 1.0674, "grad_norm": 0.3279208242893219, "learning_rate": 0.0002, "epoch": 0.9304871373836892, "step": 850}, {"loss": 1.0391, "grad_norm": 0.5596337914466858, "learning_rate": 0.0002, "epoch": 0.9414340448823207, "step": 860}, {"loss": 1.0405, "grad_norm": 0.32892531156539917, "learning_rate": 0.0002, "epoch": 0.9523809523809523, "step": 870}, {"loss": 1.2733, "grad_norm": 0.4221343696117401, "learning_rate": 0.0002, "epoch": 0.963327859879584, "step": 880}, {"loss": 1.1131, "grad_norm": 0.2666907012462616, "learning_rate": 0.0002, "epoch": 0.9742747673782156, "step": 890}, {"loss": 1.0692, "grad_norm": 0.2804788053035736, "learning_rate": 0.0002, "epoch": 0.9852216748768473, "step": 900}, {"loss": 1.0977, "grad_norm": 0.3064863085746765, "learning_rate": 0.0002, "epoch": 0.9961685823754789, "step": 910}]} +{"epoch": 0.9994526546250684, "step": 913, "epoch_duration": 1366.6045315265656, "total_accumulated_duration": 1366.6045315265656, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4039, "grad_norm": 0.7611560821533203, "learning_rate": 0.0002, "epoch": 0.010946907498631636, "step": 10}, {"loss": 1.8818, "grad_norm": 0.4633193612098694, "learning_rate": 0.0002, "epoch": 0.021893814997263273, "step": 20}, {"loss": 1.5927, "grad_norm": 0.49326154589653015, "learning_rate": 0.0002, "epoch": 0.03284072249589491, "step": 30}, {"loss": 1.3859, "grad_norm": 0.3943138122558594, "learning_rate": 0.0002, "epoch": 0.043787629994526546, "step": 40}, {"loss": 1.3036, "grad_norm": 0.43292930722236633, "learning_rate": 0.0002, "epoch": 0.05473453749315818, "step": 50}, {"loss": 1.2427, "grad_norm": 0.3431817591190338, "learning_rate": 0.0002, "epoch": 0.06568144499178982, "step": 60}, {"loss": 1.3076, "grad_norm": 0.38011446595191956, "learning_rate": 0.0002, "epoch": 0.07662835249042145, "step": 70}, {"loss": 1.1641, "grad_norm": 0.7441071271896362, "learning_rate": 0.0002, "epoch": 0.08757525998905309, "step": 80}, {"loss": 1.1721, "grad_norm": 0.3359833061695099, "learning_rate": 0.0002, "epoch": 0.09852216748768473, "step": 90}, {"loss": 1.2774, "grad_norm": 0.3724392354488373, "learning_rate": 0.0002, "epoch": 0.10946907498631636, "step": 100}, {"loss": 1.216, "grad_norm": 0.40673762559890747, "learning_rate": 0.0002, "epoch": 0.120415982484948, "step": 110}, {"loss": 1.2611, "grad_norm": 0.40036800503730774, "learning_rate": 0.0002, "epoch": 0.13136288998357964, "step": 120}, {"loss": 1.2436, "grad_norm": 2.844191312789917, "learning_rate": 0.0002, "epoch": 0.1423097974822113, "step": 130}, {"loss": 1.2254, "grad_norm": 0.3104734420776367, "learning_rate": 0.0002, "epoch": 0.1532567049808429, "step": 140}, {"loss": 1.0498, "grad_norm": 0.3266797959804535, "learning_rate": 0.0002, "epoch": 0.16420361247947454, "step": 150}, {"loss": 1.1644, "grad_norm": 0.3079199194908142, "learning_rate": 0.0002, "epoch": 0.17515051997810618, "step": 160}, {"loss": 1.2923, "grad_norm": 0.3872479498386383, "learning_rate": 0.0002, "epoch": 0.18609742747673783, "step": 170}, {"loss": 1.1809, "grad_norm": 0.38654500246047974, "learning_rate": 0.0002, "epoch": 0.19704433497536947, "step": 180}, {"loss": 1.066, "grad_norm": 0.2913552522659302, "learning_rate": 0.0002, "epoch": 0.20799124247400108, "step": 190}, {"loss": 1.0868, "grad_norm": 0.2960572838783264, "learning_rate": 0.0002, "epoch": 0.21893814997263272, "step": 200}, {"loss": 1.136, "grad_norm": 0.5175501108169556, "learning_rate": 0.0002, "epoch": 0.22988505747126436, "step": 210}, {"loss": 1.1556, "grad_norm": 1.2921574115753174, "learning_rate": 0.0002, "epoch": 0.240831964969896, "step": 220}, {"loss": 1.1488, "grad_norm": 0.2675512135028839, "learning_rate": 0.0002, "epoch": 0.25177887246852765, "step": 230}, {"loss": 1.2764, "grad_norm": 0.3956190049648285, "learning_rate": 0.0002, "epoch": 0.2627257799671593, "step": 240}, {"loss": 1.1889, "grad_norm": 0.6022581458091736, "learning_rate": 0.0002, "epoch": 0.27367268746579093, "step": 250}, {"loss": 1.1981, "grad_norm": 1.1949563026428223, "learning_rate": 0.0002, "epoch": 0.2846195949644226, "step": 260}, {"loss": 1.1877, "grad_norm": 0.31173548102378845, "learning_rate": 0.0002, "epoch": 0.2955665024630542, "step": 270}, {"loss": 1.06, "grad_norm": 0.2808472812175751, "learning_rate": 0.0002, "epoch": 0.3065134099616858, "step": 280}, {"loss": 1.1752, "grad_norm": 0.3042023777961731, "learning_rate": 0.0002, "epoch": 0.31746031746031744, "step": 290}, {"loss": 1.3147, "grad_norm": 0.39915043115615845, "learning_rate": 0.0002, "epoch": 0.3284072249589491, "step": 300}, {"loss": 1.2425, "grad_norm": 0.39118197560310364, "learning_rate": 0.0002, "epoch": 0.3393541324575807, "step": 310}, {"loss": 1.1363, "grad_norm": 0.355010986328125, "learning_rate": 0.0002, "epoch": 0.35030103995621237, "step": 320}, {"loss": 1.1925, "grad_norm": 0.29734086990356445, "learning_rate": 0.0002, "epoch": 0.361247947454844, "step": 330}, {"loss": 1.1974, "grad_norm": 0.346096009016037, "learning_rate": 0.0002, "epoch": 0.37219485495347565, "step": 340}, {"loss": 1.1641, "grad_norm": 0.4829643666744232, "learning_rate": 0.0002, "epoch": 0.3831417624521073, "step": 350}, {"loss": 1.2808, "grad_norm": 0.4726872742176056, "learning_rate": 0.0002, "epoch": 0.39408866995073893, "step": 360}, {"loss": 1.1532, "grad_norm": 0.3130153715610504, "learning_rate": 0.0002, "epoch": 0.4050355774493706, "step": 370}, {"loss": 1.1842, "grad_norm": 0.5123590230941772, "learning_rate": 0.0002, "epoch": 0.41598248494800216, "step": 380}, {"loss": 1.1539, "grad_norm": 0.3444574773311615, "learning_rate": 0.0002, "epoch": 0.4269293924466338, "step": 390}, {"loss": 1.1756, "grad_norm": 0.5302175283432007, "learning_rate": 0.0002, "epoch": 0.43787629994526545, "step": 400}, {"loss": 1.1138, "grad_norm": 0.2713572680950165, "learning_rate": 0.0002, "epoch": 0.4488232074438971, "step": 410}, {"loss": 1.1281, "grad_norm": 0.29530611634254456, "learning_rate": 0.0002, "epoch": 0.45977011494252873, "step": 420}, {"loss": 1.1721, "grad_norm": 0.27282455563545227, "learning_rate": 0.0002, "epoch": 0.47071702244116037, "step": 430}, {"loss": 1.1213, "grad_norm": 0.2647949755191803, "learning_rate": 0.0002, "epoch": 0.481663929939792, "step": 440}, {"loss": 1.1656, "grad_norm": 0.35509347915649414, "learning_rate": 0.0002, "epoch": 0.49261083743842365, "step": 450}, {"loss": 1.1251, "grad_norm": 0.1959609091281891, "learning_rate": 0.0002, "epoch": 0.5035577449370553, "step": 460}, {"loss": 1.1889, "grad_norm": 0.40090155601501465, "learning_rate": 0.0002, "epoch": 0.5145046524356869, "step": 470}, {"loss": 1.156, "grad_norm": 0.3354604244232178, "learning_rate": 0.0002, "epoch": 0.5254515599343186, "step": 480}, {"loss": 1.1024, "grad_norm": 0.2758506238460541, "learning_rate": 0.0002, "epoch": 0.5363984674329502, "step": 490}, {"loss": 1.1108, "grad_norm": 0.3629051744937897, "learning_rate": 0.0002, "epoch": 0.5473453749315819, "step": 500}, {"loss": 1.2236, "grad_norm": 0.30802229046821594, "learning_rate": 0.0002, "epoch": 0.5582922824302134, "step": 510}, {"loss": 1.0424, "grad_norm": 0.3099463880062103, "learning_rate": 0.0002, "epoch": 0.5692391899288451, "step": 520}, {"loss": 1.255, "grad_norm": 0.42299067974090576, "learning_rate": 0.0002, "epoch": 0.5801860974274767, "step": 530}, {"loss": 1.1698, "grad_norm": 0.5392252802848816, "learning_rate": 0.0002, "epoch": 0.5911330049261084, "step": 540}, {"loss": 1.1171, "grad_norm": 0.34768250584602356, "learning_rate": 0.0002, "epoch": 0.60207991242474, "step": 550}, {"loss": 1.102, "grad_norm": 0.28490015864372253, "learning_rate": 0.0002, "epoch": 0.6130268199233716, "step": 560}, {"loss": 1.1111, "grad_norm": 0.34787994623184204, "learning_rate": 0.0002, "epoch": 0.6239737274220033, "step": 570}, {"loss": 1.0759, "grad_norm": 0.29058772325515747, "learning_rate": 0.0002, "epoch": 0.6349206349206349, "step": 580}, {"loss": 1.1157, "grad_norm": 0.4063778817653656, "learning_rate": 0.0002, "epoch": 0.6458675424192666, "step": 590}, {"loss": 1.1432, "grad_norm": 0.9244267344474792, "learning_rate": 0.0002, "epoch": 0.6568144499178982, "step": 600}, {"loss": 1.0591, "grad_norm": 0.27605190873146057, "learning_rate": 0.0002, "epoch": 0.6677613574165299, "step": 610}, {"loss": 1.2123, "grad_norm": 0.34346821904182434, "learning_rate": 0.0002, "epoch": 0.6787082649151615, "step": 620}, {"loss": 1.2195, "grad_norm": 0.3093279302120209, "learning_rate": 0.0002, "epoch": 0.6896551724137931, "step": 630}, {"loss": 1.2461, "grad_norm": 1.0069009065628052, "learning_rate": 0.0002, "epoch": 0.7006020799124247, "step": 640}, {"loss": 1.0892, "grad_norm": 0.5049130916595459, "learning_rate": 0.0002, "epoch": 0.7115489874110563, "step": 650}, {"loss": 1.1404, "grad_norm": 0.3748924732208252, "learning_rate": 0.0002, "epoch": 0.722495894909688, "step": 660}, {"loss": 1.1062, "grad_norm": 0.2964959144592285, "learning_rate": 0.0002, "epoch": 0.7334428024083196, "step": 670}, {"loss": 1.2617, "grad_norm": 0.4599986970424652, "learning_rate": 0.0002, "epoch": 0.7443897099069513, "step": 680}, {"loss": 1.088, "grad_norm": 0.27292951941490173, "learning_rate": 0.0002, "epoch": 0.7553366174055829, "step": 690}, {"loss": 1.2047, "grad_norm": 0.3123566806316376, "learning_rate": 0.0002, "epoch": 0.7662835249042146, "step": 700}, {"loss": 1.0021, "grad_norm": 0.28310710191726685, "learning_rate": 0.0002, "epoch": 0.7772304324028462, "step": 710}, {"loss": 1.2281, "grad_norm": 0.3279992341995239, "learning_rate": 0.0002, "epoch": 0.7881773399014779, "step": 720}, {"loss": 1.25, "grad_norm": 0.28179168701171875, "learning_rate": 0.0002, "epoch": 0.7991242474001095, "step": 730}, {"loss": 1.0602, "grad_norm": 0.31492987275123596, "learning_rate": 0.0002, "epoch": 0.8100711548987412, "step": 740}, {"loss": 1.2518, "grad_norm": 0.41821011900901794, "learning_rate": 0.0002, "epoch": 0.8210180623973727, "step": 750}, {"loss": 1.1612, "grad_norm": 0.325235515832901, "learning_rate": 0.0002, "epoch": 0.8319649698960043, "step": 760}, {"loss": 1.27, "grad_norm": 0.5366070866584778, "learning_rate": 0.0002, "epoch": 0.842911877394636, "step": 770}, {"loss": 1.0921, "grad_norm": 0.32570579648017883, "learning_rate": 0.0002, "epoch": 0.8538587848932676, "step": 780}, {"loss": 1.1032, "grad_norm": 0.3642968237400055, "learning_rate": 0.0002, "epoch": 0.8648056923918993, "step": 790}, {"loss": 1.1234, "grad_norm": 0.29713448882102966, "learning_rate": 0.0002, "epoch": 0.8757525998905309, "step": 800}, {"loss": 1.0978, "grad_norm": 0.23599444329738617, "learning_rate": 0.0002, "epoch": 0.8866995073891626, "step": 810}, {"loss": 1.1867, "grad_norm": 0.31522464752197266, "learning_rate": 0.0002, "epoch": 0.8976464148877942, "step": 820}, {"loss": 1.0208, "grad_norm": 0.32754790782928467, "learning_rate": 0.0002, "epoch": 0.9085933223864259, "step": 830}, {"loss": 0.9786, "grad_norm": 0.22741089761257172, "learning_rate": 0.0002, "epoch": 0.9195402298850575, "step": 840}, {"loss": 1.0689, "grad_norm": 0.3089679777622223, "learning_rate": 0.0002, "epoch": 0.9304871373836892, "step": 850}, {"loss": 1.0354, "grad_norm": 0.27440521121025085, "learning_rate": 0.0002, "epoch": 0.9414340448823207, "step": 860}, {"loss": 1.0417, "grad_norm": 0.3498363792896271, "learning_rate": 0.0002, "epoch": 0.9523809523809523, "step": 870}, {"loss": 1.269, "grad_norm": 0.47151854634284973, "learning_rate": 0.0002, "epoch": 0.963327859879584, "step": 880}, {"loss": 1.1174, "grad_norm": 0.24756591022014618, "learning_rate": 0.0002, "epoch": 0.9742747673782156, "step": 890}, {"loss": 1.0622, "grad_norm": 0.2600938677787781, "learning_rate": 0.0002, "epoch": 0.9852216748768473, "step": 900}, {"loss": 1.0968, "grad_norm": 0.2934586703777313, "learning_rate": 0.0002, "epoch": 0.9961685823754789, "step": 910}]} +{"epoch": 2.0, "step": 1827, "epoch_duration": 1362.2936389446259, "total_accumulated_duration": 2728.8981704711914, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28746.0}, "peak_memory_reserved": {"GPU_0": 28746.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-913", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4039, "grad_norm": 0.7611560821533203, "learning_rate": 0.0002, "epoch": 0.010946907498631636, "step": 10}, {"loss": 1.8818, "grad_norm": 0.4633193612098694, "learning_rate": 0.0002, "epoch": 0.021893814997263273, "step": 20}, {"loss": 1.5927, "grad_norm": 0.49326154589653015, "learning_rate": 0.0002, "epoch": 0.03284072249589491, "step": 30}, {"loss": 1.3859, "grad_norm": 0.3943138122558594, "learning_rate": 0.0002, "epoch": 0.043787629994526546, "step": 40}, {"loss": 1.3036, "grad_norm": 0.43292930722236633, "learning_rate": 0.0002, "epoch": 0.05473453749315818, "step": 50}, {"loss": 1.2427, "grad_norm": 0.3431817591190338, "learning_rate": 0.0002, "epoch": 0.06568144499178982, "step": 60}, {"loss": 1.3076, "grad_norm": 0.38011446595191956, "learning_rate": 0.0002, "epoch": 0.07662835249042145, "step": 70}, {"loss": 1.1641, "grad_norm": 0.7441071271896362, "learning_rate": 0.0002, "epoch": 0.08757525998905309, "step": 80}, {"loss": 1.1721, "grad_norm": 0.3359833061695099, "learning_rate": 0.0002, "epoch": 0.09852216748768473, "step": 90}, {"loss": 1.2774, "grad_norm": 0.3724392354488373, "learning_rate": 0.0002, "epoch": 0.10946907498631636, "step": 100}, {"loss": 1.216, "grad_norm": 0.40673762559890747, "learning_rate": 0.0002, "epoch": 0.120415982484948, "step": 110}, {"loss": 1.2611, "grad_norm": 0.40036800503730774, "learning_rate": 0.0002, "epoch": 0.13136288998357964, "step": 120}, {"loss": 1.2436, "grad_norm": 2.844191312789917, "learning_rate": 0.0002, "epoch": 0.1423097974822113, "step": 130}, {"loss": 1.2254, "grad_norm": 0.3104734420776367, "learning_rate": 0.0002, "epoch": 0.1532567049808429, "step": 140}, {"loss": 1.0498, "grad_norm": 0.3266797959804535, "learning_rate": 0.0002, "epoch": 0.16420361247947454, "step": 150}, {"loss": 1.1644, "grad_norm": 0.3079199194908142, "learning_rate": 0.0002, "epoch": 0.17515051997810618, "step": 160}, {"loss": 1.2923, "grad_norm": 0.3872479498386383, "learning_rate": 0.0002, "epoch": 0.18609742747673783, "step": 170}, {"loss": 1.1809, "grad_norm": 0.38654500246047974, "learning_rate": 0.0002, "epoch": 0.19704433497536947, "step": 180}, {"loss": 1.066, "grad_norm": 0.2913552522659302, "learning_rate": 0.0002, "epoch": 0.20799124247400108, "step": 190}, {"loss": 1.0868, "grad_norm": 0.2960572838783264, "learning_rate": 0.0002, "epoch": 0.21893814997263272, "step": 200}, {"loss": 1.136, "grad_norm": 0.5175501108169556, "learning_rate": 0.0002, "epoch": 0.22988505747126436, "step": 210}, {"loss": 1.1556, "grad_norm": 1.2921574115753174, "learning_rate": 0.0002, "epoch": 0.240831964969896, "step": 220}, {"loss": 1.1488, "grad_norm": 0.2675512135028839, "learning_rate": 0.0002, "epoch": 0.25177887246852765, "step": 230}, {"loss": 1.2764, "grad_norm": 0.3956190049648285, "learning_rate": 0.0002, "epoch": 0.2627257799671593, "step": 240}, {"loss": 1.1889, "grad_norm": 0.6022581458091736, "learning_rate": 0.0002, "epoch": 0.27367268746579093, "step": 250}, {"loss": 1.1981, "grad_norm": 1.1949563026428223, "learning_rate": 0.0002, "epoch": 0.2846195949644226, "step": 260}, {"loss": 1.1877, "grad_norm": 0.31173548102378845, "learning_rate": 0.0002, "epoch": 0.2955665024630542, "step": 270}, {"loss": 1.06, "grad_norm": 0.2808472812175751, "learning_rate": 0.0002, "epoch": 0.3065134099616858, "step": 280}, {"loss": 1.1752, "grad_norm": 0.3042023777961731, "learning_rate": 0.0002, "epoch": 0.31746031746031744, "step": 290}, {"loss": 1.3147, "grad_norm": 0.39915043115615845, "learning_rate": 0.0002, "epoch": 0.3284072249589491, "step": 300}, {"loss": 1.2425, "grad_norm": 0.39118197560310364, "learning_rate": 0.0002, "epoch": 0.3393541324575807, "step": 310}, {"loss": 1.1363, "grad_norm": 0.355010986328125, "learning_rate": 0.0002, "epoch": 0.35030103995621237, "step": 320}, {"loss": 1.1925, "grad_norm": 0.29734086990356445, "learning_rate": 0.0002, "epoch": 0.361247947454844, "step": 330}, {"loss": 1.1974, "grad_norm": 0.346096009016037, "learning_rate": 0.0002, "epoch": 0.37219485495347565, "step": 340}, {"loss": 1.1641, "grad_norm": 0.4829643666744232, "learning_rate": 0.0002, "epoch": 0.3831417624521073, "step": 350}, {"loss": 1.2808, "grad_norm": 0.4726872742176056, "learning_rate": 0.0002, "epoch": 0.39408866995073893, "step": 360}, {"loss": 1.1532, "grad_norm": 0.3130153715610504, "learning_rate": 0.0002, "epoch": 0.4050355774493706, "step": 370}, {"loss": 1.1842, "grad_norm": 0.5123590230941772, "learning_rate": 0.0002, "epoch": 0.41598248494800216, "step": 380}, {"loss": 1.1539, "grad_norm": 0.3444574773311615, "learning_rate": 0.0002, "epoch": 0.4269293924466338, "step": 390}, {"loss": 1.1756, "grad_norm": 0.5302175283432007, "learning_rate": 0.0002, "epoch": 0.43787629994526545, "step": 400}, {"loss": 1.1138, "grad_norm": 0.2713572680950165, "learning_rate": 0.0002, "epoch": 0.4488232074438971, "step": 410}, {"loss": 1.1281, "grad_norm": 0.29530611634254456, "learning_rate": 0.0002, "epoch": 0.45977011494252873, "step": 420}, {"loss": 1.1721, "grad_norm": 0.27282455563545227, "learning_rate": 0.0002, "epoch": 0.47071702244116037, "step": 430}, {"loss": 1.1213, "grad_norm": 0.2647949755191803, "learning_rate": 0.0002, "epoch": 0.481663929939792, "step": 440}, {"loss": 1.1656, "grad_norm": 0.35509347915649414, "learning_rate": 0.0002, "epoch": 0.49261083743842365, "step": 450}, {"loss": 1.1251, "grad_norm": 0.1959609091281891, "learning_rate": 0.0002, "epoch": 0.5035577449370553, "step": 460}, {"loss": 1.1889, "grad_norm": 0.40090155601501465, "learning_rate": 0.0002, "epoch": 0.5145046524356869, "step": 470}, {"loss": 1.156, "grad_norm": 0.3354604244232178, "learning_rate": 0.0002, "epoch": 0.5254515599343186, "step": 480}, {"loss": 1.1024, "grad_norm": 0.2758506238460541, "learning_rate": 0.0002, "epoch": 0.5363984674329502, "step": 490}, {"loss": 1.1108, "grad_norm": 0.3629051744937897, "learning_rate": 0.0002, "epoch": 0.5473453749315819, "step": 500}, {"loss": 1.2236, "grad_norm": 0.30802229046821594, "learning_rate": 0.0002, "epoch": 0.5582922824302134, "step": 510}, {"loss": 1.0424, "grad_norm": 0.3099463880062103, "learning_rate": 0.0002, "epoch": 0.5692391899288451, "step": 520}, {"loss": 1.255, "grad_norm": 0.42299067974090576, "learning_rate": 0.0002, "epoch": 0.5801860974274767, "step": 530}, {"loss": 1.1698, "grad_norm": 0.5392252802848816, "learning_rate": 0.0002, "epoch": 0.5911330049261084, "step": 540}, {"loss": 1.1171, "grad_norm": 0.34768250584602356, "learning_rate": 0.0002, "epoch": 0.60207991242474, "step": 550}, {"loss": 1.102, "grad_norm": 0.28490015864372253, "learning_rate": 0.0002, "epoch": 0.6130268199233716, "step": 560}, {"loss": 1.1111, "grad_norm": 0.34787994623184204, "learning_rate": 0.0002, "epoch": 0.6239737274220033, "step": 570}, {"loss": 1.0759, "grad_norm": 0.29058772325515747, "learning_rate": 0.0002, "epoch": 0.6349206349206349, "step": 580}, {"loss": 1.1157, "grad_norm": 0.4063778817653656, "learning_rate": 0.0002, "epoch": 0.6458675424192666, "step": 590}, {"loss": 1.1432, "grad_norm": 0.9244267344474792, "learning_rate": 0.0002, "epoch": 0.6568144499178982, "step": 600}, {"loss": 1.0591, "grad_norm": 0.27605190873146057, "learning_rate": 0.0002, "epoch": 0.6677613574165299, "step": 610}, {"loss": 1.2123, "grad_norm": 0.34346821904182434, "learning_rate": 0.0002, "epoch": 0.6787082649151615, "step": 620}, {"loss": 1.2195, "grad_norm": 0.3093279302120209, "learning_rate": 0.0002, "epoch": 0.6896551724137931, "step": 630}, {"loss": 1.2461, "grad_norm": 1.0069009065628052, "learning_rate": 0.0002, "epoch": 0.7006020799124247, "step": 640}, {"loss": 1.0892, "grad_norm": 0.5049130916595459, "learning_rate": 0.0002, "epoch": 0.7115489874110563, "step": 650}, {"loss": 1.1404, "grad_norm": 0.3748924732208252, "learning_rate": 0.0002, "epoch": 0.722495894909688, "step": 660}, {"loss": 1.1062, "grad_norm": 0.2964959144592285, "learning_rate": 0.0002, "epoch": 0.7334428024083196, "step": 670}, {"loss": 1.2617, "grad_norm": 0.4599986970424652, "learning_rate": 0.0002, "epoch": 0.7443897099069513, "step": 680}, {"loss": 1.088, "grad_norm": 0.27292951941490173, "learning_rate": 0.0002, "epoch": 0.7553366174055829, "step": 690}, {"loss": 1.2047, "grad_norm": 0.3123566806316376, "learning_rate": 0.0002, "epoch": 0.7662835249042146, "step": 700}, {"loss": 1.0021, "grad_norm": 0.28310710191726685, "learning_rate": 0.0002, "epoch": 0.7772304324028462, "step": 710}, {"loss": 1.2281, "grad_norm": 0.3279992341995239, "learning_rate": 0.0002, "epoch": 0.7881773399014779, "step": 720}, {"loss": 1.25, "grad_norm": 0.28179168701171875, "learning_rate": 0.0002, "epoch": 0.7991242474001095, "step": 730}, {"loss": 1.0602, "grad_norm": 0.31492987275123596, "learning_rate": 0.0002, "epoch": 0.8100711548987412, "step": 740}, {"loss": 1.2518, "grad_norm": 0.41821011900901794, "learning_rate": 0.0002, "epoch": 0.8210180623973727, "step": 750}, {"loss": 1.1612, "grad_norm": 0.325235515832901, "learning_rate": 0.0002, "epoch": 0.8319649698960043, "step": 760}, {"loss": 1.27, "grad_norm": 0.5366070866584778, "learning_rate": 0.0002, "epoch": 0.842911877394636, "step": 770}, {"loss": 1.0921, "grad_norm": 0.32570579648017883, "learning_rate": 0.0002, "epoch": 0.8538587848932676, "step": 780}, {"loss": 1.1032, "grad_norm": 0.3642968237400055, "learning_rate": 0.0002, "epoch": 0.8648056923918993, "step": 790}, {"loss": 1.1234, "grad_norm": 0.29713448882102966, "learning_rate": 0.0002, "epoch": 0.8757525998905309, "step": 800}, {"loss": 1.0978, "grad_norm": 0.23599444329738617, "learning_rate": 0.0002, "epoch": 0.8866995073891626, "step": 810}, {"loss": 1.1867, "grad_norm": 0.31522464752197266, "learning_rate": 0.0002, "epoch": 0.8976464148877942, "step": 820}, {"loss": 1.0208, "grad_norm": 0.32754790782928467, "learning_rate": 0.0002, "epoch": 0.9085933223864259, "step": 830}, {"loss": 0.9786, "grad_norm": 0.22741089761257172, "learning_rate": 0.0002, "epoch": 0.9195402298850575, "step": 840}, {"loss": 1.0689, "grad_norm": 0.3089679777622223, "learning_rate": 0.0002, "epoch": 0.9304871373836892, "step": 850}, {"loss": 1.0354, "grad_norm": 0.27440521121025085, "learning_rate": 0.0002, "epoch": 0.9414340448823207, "step": 860}, {"loss": 1.0417, "grad_norm": 0.3498363792896271, "learning_rate": 0.0002, "epoch": 0.9523809523809523, "step": 870}, {"loss": 1.269, "grad_norm": 0.47151854634284973, "learning_rate": 0.0002, "epoch": 0.963327859879584, "step": 880}, {"loss": 1.1174, "grad_norm": 0.24756591022014618, "learning_rate": 0.0002, "epoch": 0.9742747673782156, "step": 890}, {"loss": 1.0622, "grad_norm": 0.2600938677787781, "learning_rate": 0.0002, "epoch": 0.9852216748768473, "step": 900}, {"loss": 1.0968, "grad_norm": 0.2934586703777313, "learning_rate": 0.0002, "epoch": 0.9961685823754789, "step": 910}, {"eval_loss": 1.158464789390564, "eval_runtime": 46.0774, "eval_samples_per_second": 9.462, "eval_steps_per_second": 1.194, "epoch": 0.9994526546250684, "step": 913}, {"loss": 1.0322, "grad_norm": 0.37776654958724976, "learning_rate": 0.0002, "epoch": 1.0071154898741106, "step": 920}, {"loss": 1.1346, "grad_norm": 0.31784629821777344, "learning_rate": 0.0002, "epoch": 1.0180623973727423, "step": 930}, {"loss": 0.8976, "grad_norm": 0.24244336783885956, "learning_rate": 0.0002, "epoch": 1.0290093048713738, "step": 940}, {"loss": 1.0172, "grad_norm": 0.3185454308986664, "learning_rate": 0.0002, "epoch": 1.0399562123700055, "step": 950}, {"loss": 1.1645, "grad_norm": 0.3589441478252411, "learning_rate": 0.0002, "epoch": 1.0509031198686372, "step": 960}, {"loss": 1.0317, "grad_norm": 0.38593578338623047, "learning_rate": 0.0002, "epoch": 1.0618500273672686, "step": 970}, {"loss": 0.9913, "grad_norm": 0.39694955945014954, "learning_rate": 0.0002, "epoch": 1.0727969348659003, "step": 980}, {"loss": 1.0893, "grad_norm": 0.469817191362381, "learning_rate": 0.0002, "epoch": 1.083743842364532, "step": 990}, {"loss": 0.983, "grad_norm": 0.2634755074977875, "learning_rate": 0.0002, "epoch": 1.0946907498631637, "step": 1000}, {"loss": 1.0144, "grad_norm": 0.43189436197280884, "learning_rate": 0.0002, "epoch": 1.1056376573617952, "step": 1010}, {"loss": 0.9663, "grad_norm": 0.5559977889060974, "learning_rate": 0.0002, "epoch": 1.116584564860427, "step": 1020}, {"loss": 1.0481, "grad_norm": 0.32100191712379456, "learning_rate": 0.0002, "epoch": 1.1275314723590586, "step": 1030}, {"loss": 1.1012, "grad_norm": 0.40179768204689026, "learning_rate": 0.0002, "epoch": 1.1384783798576903, "step": 1040}, {"loss": 1.0029, "grad_norm": 0.3659493029117584, "learning_rate": 0.0002, "epoch": 1.1494252873563218, "step": 1050}, {"loss": 0.9597, "grad_norm": 0.701704204082489, "learning_rate": 0.0002, "epoch": 1.1603721948549535, "step": 1060}, {"loss": 1.0204, "grad_norm": 0.3650563359260559, "learning_rate": 0.0002, "epoch": 1.1713191023535852, "step": 1070}, {"loss": 0.907, "grad_norm": 0.3191976249217987, "learning_rate": 0.0002, "epoch": 1.1822660098522166, "step": 1080}, {"loss": 1.0648, "grad_norm": 0.3615441918373108, "learning_rate": 0.0002, "epoch": 1.1932129173508483, "step": 1090}, {"loss": 1.0067, "grad_norm": 0.39474231004714966, "learning_rate": 0.0002, "epoch": 1.20415982484948, "step": 1100}, {"loss": 0.9852, "grad_norm": 0.3752822279930115, "learning_rate": 0.0002, "epoch": 1.2151067323481117, "step": 1110}, {"loss": 1.0067, "grad_norm": 0.4165991246700287, "learning_rate": 0.0002, "epoch": 1.2260536398467432, "step": 1120}, {"loss": 1.0244, "grad_norm": 0.5326506495475769, "learning_rate": 0.0002, "epoch": 1.237000547345375, "step": 1130}, {"loss": 1.0542, "grad_norm": 0.48845794796943665, "learning_rate": 0.0002, "epoch": 1.2479474548440066, "step": 1140}, {"loss": 1.0885, "grad_norm": 0.29910150170326233, "learning_rate": 0.0002, "epoch": 1.2588943623426383, "step": 1150}, {"loss": 1.2233, "grad_norm": 0.5069725513458252, "learning_rate": 0.0002, "epoch": 1.2698412698412698, "step": 1160}, {"loss": 1.0992, "grad_norm": 0.29500406980514526, "learning_rate": 0.0002, "epoch": 1.2807881773399015, "step": 1170}, {"loss": 1.0291, "grad_norm": 0.4711538851261139, "learning_rate": 0.0002, "epoch": 1.2917350848385332, "step": 1180}, {"loss": 1.0513, "grad_norm": 0.4203340709209442, "learning_rate": 0.0002, "epoch": 1.3026819923371646, "step": 1190}, {"loss": 1.1274, "grad_norm": 0.36101874709129333, "learning_rate": 0.0002, "epoch": 1.3136288998357963, "step": 1200}, {"loss": 1.085, "grad_norm": 0.4608800411224365, "learning_rate": 0.0002, "epoch": 1.324575807334428, "step": 1210}, {"loss": 1.0695, "grad_norm": 0.6570906639099121, "learning_rate": 0.0002, "epoch": 1.3355227148330597, "step": 1220}, {"loss": 0.9786, "grad_norm": 0.5352164506912231, "learning_rate": 0.0002, "epoch": 1.3464696223316914, "step": 1230}, {"loss": 1.0513, "grad_norm": 0.3885001242160797, "learning_rate": 0.0002, "epoch": 1.357416529830323, "step": 1240}, {"loss": 1.0611, "grad_norm": 0.2987913489341736, "learning_rate": 0.0002, "epoch": 1.3683634373289546, "step": 1250}, {"loss": 1.109, "grad_norm": 0.42070427536964417, "learning_rate": 0.0002, "epoch": 1.3793103448275863, "step": 1260}, {"loss": 1.1092, "grad_norm": 0.5957782864570618, "learning_rate": 0.0002, "epoch": 1.3902572523262178, "step": 1270}, {"loss": 1.0554, "grad_norm": 0.32898882031440735, "learning_rate": 0.0002, "epoch": 1.4012041598248495, "step": 1280}, {"loss": 0.909, "grad_norm": 0.27624452114105225, "learning_rate": 0.0002, "epoch": 1.4121510673234812, "step": 1290}, {"loss": 1.024, "grad_norm": 0.49570828676223755, "learning_rate": 0.0002, "epoch": 1.4230979748221126, "step": 1300}, {"loss": 1.0471, "grad_norm": 0.26191383600234985, "learning_rate": 0.0002, "epoch": 1.4340448823207443, "step": 1310}, {"loss": 1.0788, "grad_norm": 0.35664042830467224, "learning_rate": 0.0002, "epoch": 1.444991789819376, "step": 1320}, {"loss": 1.0108, "grad_norm": 0.45126354694366455, "learning_rate": 0.0002, "epoch": 1.4559386973180077, "step": 1330}, {"loss": 0.9571, "grad_norm": 0.37318357825279236, "learning_rate": 0.0002, "epoch": 1.4668856048166394, "step": 1340}, {"loss": 1.0507, "grad_norm": 0.6428970098495483, "learning_rate": 0.0002, "epoch": 1.477832512315271, "step": 1350}, {"loss": 1.0367, "grad_norm": 0.43256187438964844, "learning_rate": 0.0002, "epoch": 1.4887794198139026, "step": 1360}, {"loss": 1.1321, "grad_norm": 0.5343793630599976, "learning_rate": 0.0002, "epoch": 1.4997263273125343, "step": 1370}, {"loss": 1.1054, "grad_norm": 0.315437376499176, "learning_rate": 0.0002, "epoch": 1.5106732348111658, "step": 1380}, {"loss": 0.8916, "grad_norm": 0.41561153531074524, "learning_rate": 0.0002, "epoch": 1.5216201423097975, "step": 1390}, {"loss": 1.0391, "grad_norm": 0.3201070725917816, "learning_rate": 0.0002, "epoch": 1.5325670498084292, "step": 1400}, {"loss": 1.14, "grad_norm": 0.505537211894989, "learning_rate": 0.0002, "epoch": 1.5435139573070606, "step": 1410}, {"loss": 1.0775, "grad_norm": 0.3747410178184509, "learning_rate": 0.0002, "epoch": 1.5544608648056923, "step": 1420}, {"loss": 1.1171, "grad_norm": 0.49385908246040344, "learning_rate": 0.0002, "epoch": 1.565407772304324, "step": 1430}, {"loss": 1.1182, "grad_norm": 0.49831628799438477, "learning_rate": 0.0002, "epoch": 1.5763546798029555, "step": 1440}, {"loss": 1.0079, "grad_norm": 0.372127890586853, "learning_rate": 0.0002, "epoch": 1.5873015873015874, "step": 1450}, {"loss": 0.9931, "grad_norm": 0.40070840716362, "learning_rate": 0.0002, "epoch": 1.598248494800219, "step": 1460}, {"loss": 0.8954, "grad_norm": 0.34907400608062744, "learning_rate": 0.0002, "epoch": 1.6091954022988506, "step": 1470}, {"loss": 0.9743, "grad_norm": 0.4632418751716614, "learning_rate": 0.0002, "epoch": 1.6201423097974823, "step": 1480}, {"loss": 1.0103, "grad_norm": 0.40164515376091003, "learning_rate": 0.0002, "epoch": 1.6310892172961138, "step": 1490}, {"loss": 0.9523, "grad_norm": 0.3214994966983795, "learning_rate": 0.0002, "epoch": 1.6420361247947455, "step": 1500}, {"loss": 1.0161, "grad_norm": 0.3727897107601166, "learning_rate": 0.0002, "epoch": 1.6529830322933772, "step": 1510}, {"loss": 1.0443, "grad_norm": 0.3817640542984009, "learning_rate": 0.0002, "epoch": 1.6639299397920086, "step": 1520}, {"loss": 1.0511, "grad_norm": 0.5592136979103088, "learning_rate": 0.0002, "epoch": 1.6748768472906403, "step": 1530}, {"loss": 0.9682, "grad_norm": 0.44636598229408264, "learning_rate": 0.0002, "epoch": 1.685823754789272, "step": 1540}, {"loss": 1.033, "grad_norm": 0.40441709756851196, "learning_rate": 0.0002, "epoch": 1.6967706622879035, "step": 1550}, {"loss": 1.0857, "grad_norm": 0.3243522644042969, "learning_rate": 0.0002, "epoch": 1.7077175697865354, "step": 1560}, {"loss": 0.9482, "grad_norm": 0.34277570247650146, "learning_rate": 0.0002, "epoch": 1.718664477285167, "step": 1570}, {"loss": 0.974, "grad_norm": 0.3279995024204254, "learning_rate": 0.0002, "epoch": 1.7296113847837986, "step": 1580}, {"loss": 0.9414, "grad_norm": 0.41968777775764465, "learning_rate": 0.0002, "epoch": 1.7405582922824303, "step": 1590}, {"loss": 0.9768, "grad_norm": 0.39464613795280457, "learning_rate": 0.0002, "epoch": 1.7515051997810618, "step": 1600}, {"loss": 1.0347, "grad_norm": 0.3839009404182434, "learning_rate": 0.0002, "epoch": 1.7624521072796935, "step": 1610}, {"loss": 0.9195, "grad_norm": 0.3250715434551239, "learning_rate": 0.0002, "epoch": 1.7733990147783252, "step": 1620}, {"loss": 1.0049, "grad_norm": 0.5166561007499695, "learning_rate": 0.0002, "epoch": 1.7843459222769567, "step": 1630}, {"loss": 1.0364, "grad_norm": 0.4115183353424072, "learning_rate": 0.0002, "epoch": 1.7952928297755884, "step": 1640}, {"loss": 1.0248, "grad_norm": 0.373780220746994, "learning_rate": 0.0002, "epoch": 1.80623973727422, "step": 1650}, {"loss": 1.0984, "grad_norm": 0.49697014689445496, "learning_rate": 0.0002, "epoch": 1.8171866447728515, "step": 1660}, {"loss": 1.0089, "grad_norm": 1.0308938026428223, "learning_rate": 0.0002, "epoch": 1.8281335522714834, "step": 1670}, {"loss": 1.0853, "grad_norm": 0.4851366877555847, "learning_rate": 0.0002, "epoch": 1.839080459770115, "step": 1680}, {"loss": 0.9533, "grad_norm": 0.3262481391429901, "learning_rate": 0.0002, "epoch": 1.8500273672687466, "step": 1690}, {"loss": 1.048, "grad_norm": 0.6904496550559998, "learning_rate": 0.0002, "epoch": 1.8609742747673783, "step": 1700}, {"loss": 1.0577, "grad_norm": 0.49789851903915405, "learning_rate": 0.0002, "epoch": 1.8719211822660098, "step": 1710}, {"loss": 1.0258, "grad_norm": 0.3035794198513031, "learning_rate": 0.0002, "epoch": 1.8828680897646415, "step": 1720}, {"loss": 0.9916, "grad_norm": 0.4588414430618286, "learning_rate": 0.0002, "epoch": 1.8938149972632732, "step": 1730}, {"loss": 0.9526, "grad_norm": 0.4313034117221832, "learning_rate": 0.0002, "epoch": 1.9047619047619047, "step": 1740}, {"loss": 1.0857, "grad_norm": 0.38562044501304626, "learning_rate": 0.0002, "epoch": 1.9157088122605364, "step": 1750}, {"loss": 0.9763, "grad_norm": 0.46947410702705383, "learning_rate": 0.0002, "epoch": 1.926655719759168, "step": 1760}, {"loss": 0.9967, "grad_norm": 0.3848404884338379, "learning_rate": 0.0002, "epoch": 1.9376026272577995, "step": 1770}, {"loss": 1.0474, "grad_norm": 0.30422744154930115, "learning_rate": 0.0002, "epoch": 1.9485495347564314, "step": 1780}, {"loss": 1.022, "grad_norm": 0.41100990772247314, "learning_rate": 0.0002, "epoch": 1.959496442255063, "step": 1790}, {"loss": 1.003, "grad_norm": 0.3492335081100464, "learning_rate": 0.0002, "epoch": 1.9704433497536946, "step": 1800}, {"loss": 0.9891, "grad_norm": 0.364577978849411, "learning_rate": 0.0002, "epoch": 1.9813902572523263, "step": 1810}, {"loss": 1.0699, "grad_norm": 0.4312075674533844, "learning_rate": 0.0002, "epoch": 1.9923371647509578, "step": 1820}]} +{"epoch": 2.9994526546250686, "step": 2740, "epoch_duration": 1360.6272654533386, "total_accumulated_duration": 4089.52543592453, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28746.0}, "peak_memory_reserved": {"GPU_0": 28746.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4039, "grad_norm": 0.7611560821533203, "learning_rate": 0.0002, "epoch": 0.010946907498631636, "step": 10}, {"loss": 1.8818, "grad_norm": 0.4633193612098694, "learning_rate": 0.0002, "epoch": 0.021893814997263273, "step": 20}, {"loss": 1.5927, "grad_norm": 0.49326154589653015, "learning_rate": 0.0002, "epoch": 0.03284072249589491, "step": 30}, {"loss": 1.3859, "grad_norm": 0.3943138122558594, "learning_rate": 0.0002, "epoch": 0.043787629994526546, "step": 40}, {"loss": 1.3036, "grad_norm": 0.43292930722236633, "learning_rate": 0.0002, "epoch": 0.05473453749315818, "step": 50}, {"loss": 1.2427, "grad_norm": 0.3431817591190338, "learning_rate": 0.0002, "epoch": 0.06568144499178982, "step": 60}, {"loss": 1.3076, "grad_norm": 0.38011446595191956, "learning_rate": 0.0002, "epoch": 0.07662835249042145, "step": 70}, {"loss": 1.1641, "grad_norm": 0.7441071271896362, "learning_rate": 0.0002, "epoch": 0.08757525998905309, "step": 80}, {"loss": 1.1721, "grad_norm": 0.3359833061695099, "learning_rate": 0.0002, "epoch": 0.09852216748768473, "step": 90}, {"loss": 1.2774, "grad_norm": 0.3724392354488373, "learning_rate": 0.0002, "epoch": 0.10946907498631636, "step": 100}, {"loss": 1.216, "grad_norm": 0.40673762559890747, "learning_rate": 0.0002, "epoch": 0.120415982484948, "step": 110}, {"loss": 1.2611, "grad_norm": 0.40036800503730774, "learning_rate": 0.0002, "epoch": 0.13136288998357964, "step": 120}, {"loss": 1.2436, "grad_norm": 2.844191312789917, "learning_rate": 0.0002, "epoch": 0.1423097974822113, "step": 130}, {"loss": 1.2254, "grad_norm": 0.3104734420776367, "learning_rate": 0.0002, "epoch": 0.1532567049808429, "step": 140}, {"loss": 1.0498, "grad_norm": 0.3266797959804535, "learning_rate": 0.0002, "epoch": 0.16420361247947454, "step": 150}, {"loss": 1.1644, "grad_norm": 0.3079199194908142, "learning_rate": 0.0002, "epoch": 0.17515051997810618, "step": 160}, {"loss": 1.2923, "grad_norm": 0.3872479498386383, "learning_rate": 0.0002, "epoch": 0.18609742747673783, "step": 170}, {"loss": 1.1809, "grad_norm": 0.38654500246047974, "learning_rate": 0.0002, "epoch": 0.19704433497536947, "step": 180}, {"loss": 1.066, "grad_norm": 0.2913552522659302, "learning_rate": 0.0002, "epoch": 0.20799124247400108, "step": 190}, {"loss": 1.0868, "grad_norm": 0.2960572838783264, "learning_rate": 0.0002, "epoch": 0.21893814997263272, "step": 200}, {"loss": 1.136, "grad_norm": 0.5175501108169556, "learning_rate": 0.0002, "epoch": 0.22988505747126436, "step": 210}, {"loss": 1.1556, "grad_norm": 1.2921574115753174, "learning_rate": 0.0002, "epoch": 0.240831964969896, "step": 220}, {"loss": 1.1488, "grad_norm": 0.2675512135028839, "learning_rate": 0.0002, "epoch": 0.25177887246852765, "step": 230}, {"loss": 1.2764, "grad_norm": 0.3956190049648285, "learning_rate": 0.0002, "epoch": 0.2627257799671593, "step": 240}, {"loss": 1.1889, "grad_norm": 0.6022581458091736, "learning_rate": 0.0002, "epoch": 0.27367268746579093, "step": 250}, {"loss": 1.1981, "grad_norm": 1.1949563026428223, "learning_rate": 0.0002, "epoch": 0.2846195949644226, "step": 260}, {"loss": 1.1877, "grad_norm": 0.31173548102378845, "learning_rate": 0.0002, "epoch": 0.2955665024630542, "step": 270}, {"loss": 1.06, "grad_norm": 0.2808472812175751, "learning_rate": 0.0002, "epoch": 0.3065134099616858, "step": 280}, {"loss": 1.1752, "grad_norm": 0.3042023777961731, "learning_rate": 0.0002, "epoch": 0.31746031746031744, "step": 290}, {"loss": 1.3147, "grad_norm": 0.39915043115615845, "learning_rate": 0.0002, "epoch": 0.3284072249589491, "step": 300}, {"loss": 1.2425, "grad_norm": 0.39118197560310364, "learning_rate": 0.0002, "epoch": 0.3393541324575807, "step": 310}, {"loss": 1.1363, "grad_norm": 0.355010986328125, "learning_rate": 0.0002, "epoch": 0.35030103995621237, "step": 320}, {"loss": 1.1925, "grad_norm": 0.29734086990356445, "learning_rate": 0.0002, "epoch": 0.361247947454844, "step": 330}, {"loss": 1.1974, "grad_norm": 0.346096009016037, "learning_rate": 0.0002, "epoch": 0.37219485495347565, "step": 340}, {"loss": 1.1641, "grad_norm": 0.4829643666744232, "learning_rate": 0.0002, "epoch": 0.3831417624521073, "step": 350}, {"loss": 1.2808, "grad_norm": 0.4726872742176056, "learning_rate": 0.0002, "epoch": 0.39408866995073893, "step": 360}, {"loss": 1.1532, "grad_norm": 0.3130153715610504, "learning_rate": 0.0002, "epoch": 0.4050355774493706, "step": 370}, {"loss": 1.1842, "grad_norm": 0.5123590230941772, "learning_rate": 0.0002, "epoch": 0.41598248494800216, "step": 380}, {"loss": 1.1539, "grad_norm": 0.3444574773311615, "learning_rate": 0.0002, "epoch": 0.4269293924466338, "step": 390}, {"loss": 1.1756, "grad_norm": 0.5302175283432007, "learning_rate": 0.0002, "epoch": 0.43787629994526545, "step": 400}, {"loss": 1.1138, "grad_norm": 0.2713572680950165, "learning_rate": 0.0002, "epoch": 0.4488232074438971, "step": 410}, {"loss": 1.1281, "grad_norm": 0.29530611634254456, "learning_rate": 0.0002, "epoch": 0.45977011494252873, "step": 420}, {"loss": 1.1721, "grad_norm": 0.27282455563545227, "learning_rate": 0.0002, "epoch": 0.47071702244116037, "step": 430}, {"loss": 1.1213, "grad_norm": 0.2647949755191803, "learning_rate": 0.0002, "epoch": 0.481663929939792, "step": 440}, {"loss": 1.1656, "grad_norm": 0.35509347915649414, "learning_rate": 0.0002, "epoch": 0.49261083743842365, "step": 450}, {"loss": 1.1251, "grad_norm": 0.1959609091281891, "learning_rate": 0.0002, "epoch": 0.5035577449370553, "step": 460}, {"loss": 1.1889, "grad_norm": 0.40090155601501465, "learning_rate": 0.0002, "epoch": 0.5145046524356869, "step": 470}, {"loss": 1.156, "grad_norm": 0.3354604244232178, "learning_rate": 0.0002, "epoch": 0.5254515599343186, "step": 480}, {"loss": 1.1024, "grad_norm": 0.2758506238460541, "learning_rate": 0.0002, "epoch": 0.5363984674329502, "step": 490}, {"loss": 1.1108, "grad_norm": 0.3629051744937897, "learning_rate": 0.0002, "epoch": 0.5473453749315819, "step": 500}, {"loss": 1.2236, "grad_norm": 0.30802229046821594, "learning_rate": 0.0002, "epoch": 0.5582922824302134, "step": 510}, {"loss": 1.0424, "grad_norm": 0.3099463880062103, "learning_rate": 0.0002, "epoch": 0.5692391899288451, "step": 520}, {"loss": 1.255, "grad_norm": 0.42299067974090576, "learning_rate": 0.0002, "epoch": 0.5801860974274767, "step": 530}, {"loss": 1.1698, "grad_norm": 0.5392252802848816, "learning_rate": 0.0002, "epoch": 0.5911330049261084, "step": 540}, {"loss": 1.1171, "grad_norm": 0.34768250584602356, "learning_rate": 0.0002, "epoch": 0.60207991242474, "step": 550}, {"loss": 1.102, "grad_norm": 0.28490015864372253, "learning_rate": 0.0002, "epoch": 0.6130268199233716, "step": 560}, {"loss": 1.1111, "grad_norm": 0.34787994623184204, "learning_rate": 0.0002, "epoch": 0.6239737274220033, "step": 570}, {"loss": 1.0759, "grad_norm": 0.29058772325515747, "learning_rate": 0.0002, "epoch": 0.6349206349206349, "step": 580}, {"loss": 1.1157, "grad_norm": 0.4063778817653656, "learning_rate": 0.0002, "epoch": 0.6458675424192666, "step": 590}, {"loss": 1.1432, "grad_norm": 0.9244267344474792, "learning_rate": 0.0002, "epoch": 0.6568144499178982, "step": 600}, {"loss": 1.0591, "grad_norm": 0.27605190873146057, "learning_rate": 0.0002, "epoch": 0.6677613574165299, "step": 610}, {"loss": 1.2123, "grad_norm": 0.34346821904182434, "learning_rate": 0.0002, "epoch": 0.6787082649151615, "step": 620}, {"loss": 1.2195, "grad_norm": 0.3093279302120209, "learning_rate": 0.0002, "epoch": 0.6896551724137931, "step": 630}, {"loss": 1.2461, "grad_norm": 1.0069009065628052, "learning_rate": 0.0002, "epoch": 0.7006020799124247, "step": 640}, {"loss": 1.0892, "grad_norm": 0.5049130916595459, "learning_rate": 0.0002, "epoch": 0.7115489874110563, "step": 650}, {"loss": 1.1404, "grad_norm": 0.3748924732208252, "learning_rate": 0.0002, "epoch": 0.722495894909688, "step": 660}, {"loss": 1.1062, "grad_norm": 0.2964959144592285, "learning_rate": 0.0002, "epoch": 0.7334428024083196, "step": 670}, {"loss": 1.2617, "grad_norm": 0.4599986970424652, "learning_rate": 0.0002, "epoch": 0.7443897099069513, "step": 680}, {"loss": 1.088, "grad_norm": 0.27292951941490173, "learning_rate": 0.0002, "epoch": 0.7553366174055829, "step": 690}, {"loss": 1.2047, "grad_norm": 0.3123566806316376, "learning_rate": 0.0002, "epoch": 0.7662835249042146, "step": 700}, {"loss": 1.0021, "grad_norm": 0.28310710191726685, "learning_rate": 0.0002, "epoch": 0.7772304324028462, "step": 710}, {"loss": 1.2281, "grad_norm": 0.3279992341995239, "learning_rate": 0.0002, "epoch": 0.7881773399014779, "step": 720}, {"loss": 1.25, "grad_norm": 0.28179168701171875, "learning_rate": 0.0002, "epoch": 0.7991242474001095, "step": 730}, {"loss": 1.0602, "grad_norm": 0.31492987275123596, "learning_rate": 0.0002, "epoch": 0.8100711548987412, "step": 740}, {"loss": 1.2518, "grad_norm": 0.41821011900901794, "learning_rate": 0.0002, "epoch": 0.8210180623973727, "step": 750}, {"loss": 1.1612, "grad_norm": 0.325235515832901, "learning_rate": 0.0002, "epoch": 0.8319649698960043, "step": 760}, {"loss": 1.27, "grad_norm": 0.5366070866584778, "learning_rate": 0.0002, "epoch": 0.842911877394636, "step": 770}, {"loss": 1.0921, "grad_norm": 0.32570579648017883, "learning_rate": 0.0002, "epoch": 0.8538587848932676, "step": 780}, {"loss": 1.1032, "grad_norm": 0.3642968237400055, "learning_rate": 0.0002, "epoch": 0.8648056923918993, "step": 790}, {"loss": 1.1234, "grad_norm": 0.29713448882102966, "learning_rate": 0.0002, "epoch": 0.8757525998905309, "step": 800}, {"loss": 1.0978, "grad_norm": 0.23599444329738617, "learning_rate": 0.0002, "epoch": 0.8866995073891626, "step": 810}, {"loss": 1.1867, "grad_norm": 0.31522464752197266, "learning_rate": 0.0002, "epoch": 0.8976464148877942, "step": 820}, {"loss": 1.0208, "grad_norm": 0.32754790782928467, "learning_rate": 0.0002, "epoch": 0.9085933223864259, "step": 830}, {"loss": 0.9786, "grad_norm": 0.22741089761257172, "learning_rate": 0.0002, "epoch": 0.9195402298850575, "step": 840}, {"loss": 1.0689, "grad_norm": 0.3089679777622223, "learning_rate": 0.0002, "epoch": 0.9304871373836892, "step": 850}, {"loss": 1.0354, "grad_norm": 0.27440521121025085, "learning_rate": 0.0002, "epoch": 0.9414340448823207, "step": 860}, {"loss": 1.0417, "grad_norm": 0.3498363792896271, "learning_rate": 0.0002, "epoch": 0.9523809523809523, "step": 870}, {"loss": 1.269, "grad_norm": 0.47151854634284973, "learning_rate": 0.0002, "epoch": 0.963327859879584, "step": 880}, {"loss": 1.1174, "grad_norm": 0.24756591022014618, "learning_rate": 0.0002, "epoch": 0.9742747673782156, "step": 890}, {"loss": 1.0622, "grad_norm": 0.2600938677787781, "learning_rate": 0.0002, "epoch": 0.9852216748768473, "step": 900}, {"loss": 1.0968, "grad_norm": 0.2934586703777313, "learning_rate": 0.0002, "epoch": 0.9961685823754789, "step": 910}, {"eval_loss": 1.158464789390564, "eval_runtime": 46.0774, "eval_samples_per_second": 9.462, "eval_steps_per_second": 1.194, "epoch": 0.9994526546250684, "step": 913}, {"loss": 1.0322, "grad_norm": 0.37776654958724976, "learning_rate": 0.0002, "epoch": 1.0071154898741106, "step": 920}, {"loss": 1.1346, "grad_norm": 0.31784629821777344, "learning_rate": 0.0002, "epoch": 1.0180623973727423, "step": 930}, {"loss": 0.8976, "grad_norm": 0.24244336783885956, "learning_rate": 0.0002, "epoch": 1.0290093048713738, "step": 940}, {"loss": 1.0172, "grad_norm": 0.3185454308986664, "learning_rate": 0.0002, "epoch": 1.0399562123700055, "step": 950}, {"loss": 1.1645, "grad_norm": 0.3589441478252411, "learning_rate": 0.0002, "epoch": 1.0509031198686372, "step": 960}, {"loss": 1.0317, "grad_norm": 0.38593578338623047, "learning_rate": 0.0002, "epoch": 1.0618500273672686, "step": 970}, {"loss": 0.9913, "grad_norm": 0.39694955945014954, "learning_rate": 0.0002, "epoch": 1.0727969348659003, "step": 980}, {"loss": 1.0893, "grad_norm": 0.469817191362381, "learning_rate": 0.0002, "epoch": 1.083743842364532, "step": 990}, {"loss": 0.983, "grad_norm": 0.2634755074977875, "learning_rate": 0.0002, "epoch": 1.0946907498631637, "step": 1000}, {"loss": 1.0144, "grad_norm": 0.43189436197280884, "learning_rate": 0.0002, "epoch": 1.1056376573617952, "step": 1010}, {"loss": 0.9663, "grad_norm": 0.5559977889060974, "learning_rate": 0.0002, "epoch": 1.116584564860427, "step": 1020}, {"loss": 1.0481, "grad_norm": 0.32100191712379456, "learning_rate": 0.0002, "epoch": 1.1275314723590586, "step": 1030}, {"loss": 1.1012, "grad_norm": 0.40179768204689026, "learning_rate": 0.0002, "epoch": 1.1384783798576903, "step": 1040}, {"loss": 1.0029, "grad_norm": 0.3659493029117584, "learning_rate": 0.0002, "epoch": 1.1494252873563218, "step": 1050}, {"loss": 0.9597, "grad_norm": 0.701704204082489, "learning_rate": 0.0002, "epoch": 1.1603721948549535, "step": 1060}, {"loss": 1.0204, "grad_norm": 0.3650563359260559, "learning_rate": 0.0002, "epoch": 1.1713191023535852, "step": 1070}, {"loss": 0.907, "grad_norm": 0.3191976249217987, "learning_rate": 0.0002, "epoch": 1.1822660098522166, "step": 1080}, {"loss": 1.0648, "grad_norm": 0.3615441918373108, "learning_rate": 0.0002, "epoch": 1.1932129173508483, "step": 1090}, {"loss": 1.0067, "grad_norm": 0.39474231004714966, "learning_rate": 0.0002, "epoch": 1.20415982484948, "step": 1100}, {"loss": 0.9852, "grad_norm": 0.3752822279930115, "learning_rate": 0.0002, "epoch": 1.2151067323481117, "step": 1110}, {"loss": 1.0067, "grad_norm": 0.4165991246700287, "learning_rate": 0.0002, "epoch": 1.2260536398467432, "step": 1120}, {"loss": 1.0244, "grad_norm": 0.5326506495475769, "learning_rate": 0.0002, "epoch": 1.237000547345375, "step": 1130}, {"loss": 1.0542, "grad_norm": 0.48845794796943665, "learning_rate": 0.0002, "epoch": 1.2479474548440066, "step": 1140}, {"loss": 1.0885, "grad_norm": 0.29910150170326233, "learning_rate": 0.0002, "epoch": 1.2588943623426383, "step": 1150}, {"loss": 1.2233, "grad_norm": 0.5069725513458252, "learning_rate": 0.0002, "epoch": 1.2698412698412698, "step": 1160}, {"loss": 1.0992, "grad_norm": 0.29500406980514526, "learning_rate": 0.0002, "epoch": 1.2807881773399015, "step": 1170}, {"loss": 1.0291, "grad_norm": 0.4711538851261139, "learning_rate": 0.0002, "epoch": 1.2917350848385332, "step": 1180}, {"loss": 1.0513, "grad_norm": 0.4203340709209442, "learning_rate": 0.0002, "epoch": 1.3026819923371646, "step": 1190}, {"loss": 1.1274, "grad_norm": 0.36101874709129333, "learning_rate": 0.0002, "epoch": 1.3136288998357963, "step": 1200}, {"loss": 1.085, "grad_norm": 0.4608800411224365, "learning_rate": 0.0002, "epoch": 1.324575807334428, "step": 1210}, {"loss": 1.0695, "grad_norm": 0.6570906639099121, "learning_rate": 0.0002, "epoch": 1.3355227148330597, "step": 1220}, {"loss": 0.9786, "grad_norm": 0.5352164506912231, "learning_rate": 0.0002, "epoch": 1.3464696223316914, "step": 1230}, {"loss": 1.0513, "grad_norm": 0.3885001242160797, "learning_rate": 0.0002, "epoch": 1.357416529830323, "step": 1240}, {"loss": 1.0611, "grad_norm": 0.2987913489341736, "learning_rate": 0.0002, "epoch": 1.3683634373289546, "step": 1250}, {"loss": 1.109, "grad_norm": 0.42070427536964417, "learning_rate": 0.0002, "epoch": 1.3793103448275863, "step": 1260}, {"loss": 1.1092, "grad_norm": 0.5957782864570618, "learning_rate": 0.0002, "epoch": 1.3902572523262178, "step": 1270}, {"loss": 1.0554, "grad_norm": 0.32898882031440735, "learning_rate": 0.0002, "epoch": 1.4012041598248495, "step": 1280}, {"loss": 0.909, "grad_norm": 0.27624452114105225, "learning_rate": 0.0002, "epoch": 1.4121510673234812, "step": 1290}, {"loss": 1.024, "grad_norm": 0.49570828676223755, "learning_rate": 0.0002, "epoch": 1.4230979748221126, "step": 1300}, {"loss": 1.0471, "grad_norm": 0.26191383600234985, "learning_rate": 0.0002, "epoch": 1.4340448823207443, "step": 1310}, {"loss": 1.0788, "grad_norm": 0.35664042830467224, "learning_rate": 0.0002, "epoch": 1.444991789819376, "step": 1320}, {"loss": 1.0108, "grad_norm": 0.45126354694366455, "learning_rate": 0.0002, "epoch": 1.4559386973180077, "step": 1330}, {"loss": 0.9571, "grad_norm": 0.37318357825279236, "learning_rate": 0.0002, "epoch": 1.4668856048166394, "step": 1340}, {"loss": 1.0507, "grad_norm": 0.6428970098495483, "learning_rate": 0.0002, "epoch": 1.477832512315271, "step": 1350}, {"loss": 1.0367, "grad_norm": 0.43256187438964844, "learning_rate": 0.0002, "epoch": 1.4887794198139026, "step": 1360}, {"loss": 1.1321, "grad_norm": 0.5343793630599976, "learning_rate": 0.0002, "epoch": 1.4997263273125343, "step": 1370}, {"loss": 1.1054, "grad_norm": 0.315437376499176, "learning_rate": 0.0002, "epoch": 1.5106732348111658, "step": 1380}, {"loss": 0.8916, "grad_norm": 0.41561153531074524, "learning_rate": 0.0002, "epoch": 1.5216201423097975, "step": 1390}, {"loss": 1.0391, "grad_norm": 0.3201070725917816, "learning_rate": 0.0002, "epoch": 1.5325670498084292, "step": 1400}, {"loss": 1.14, "grad_norm": 0.505537211894989, "learning_rate": 0.0002, "epoch": 1.5435139573070606, "step": 1410}, {"loss": 1.0775, "grad_norm": 0.3747410178184509, "learning_rate": 0.0002, "epoch": 1.5544608648056923, "step": 1420}, {"loss": 1.1171, "grad_norm": 0.49385908246040344, "learning_rate": 0.0002, "epoch": 1.565407772304324, "step": 1430}, {"loss": 1.1182, "grad_norm": 0.49831628799438477, "learning_rate": 0.0002, "epoch": 1.5763546798029555, "step": 1440}, {"loss": 1.0079, "grad_norm": 0.372127890586853, "learning_rate": 0.0002, "epoch": 1.5873015873015874, "step": 1450}, {"loss": 0.9931, "grad_norm": 0.40070840716362, "learning_rate": 0.0002, "epoch": 1.598248494800219, "step": 1460}, {"loss": 0.8954, "grad_norm": 0.34907400608062744, "learning_rate": 0.0002, "epoch": 1.6091954022988506, "step": 1470}, {"loss": 0.9743, "grad_norm": 0.4632418751716614, "learning_rate": 0.0002, "epoch": 1.6201423097974823, "step": 1480}, {"loss": 1.0103, "grad_norm": 0.40164515376091003, "learning_rate": 0.0002, "epoch": 1.6310892172961138, "step": 1490}, {"loss": 0.9523, "grad_norm": 0.3214994966983795, "learning_rate": 0.0002, "epoch": 1.6420361247947455, "step": 1500}, {"loss": 1.0161, "grad_norm": 0.3727897107601166, "learning_rate": 0.0002, "epoch": 1.6529830322933772, "step": 1510}, {"loss": 1.0443, "grad_norm": 0.3817640542984009, "learning_rate": 0.0002, "epoch": 1.6639299397920086, "step": 1520}, {"loss": 1.0511, "grad_norm": 0.5592136979103088, "learning_rate": 0.0002, "epoch": 1.6748768472906403, "step": 1530}, {"loss": 0.9682, "grad_norm": 0.44636598229408264, "learning_rate": 0.0002, "epoch": 1.685823754789272, "step": 1540}, {"loss": 1.033, "grad_norm": 0.40441709756851196, "learning_rate": 0.0002, "epoch": 1.6967706622879035, "step": 1550}, {"loss": 1.0857, "grad_norm": 0.3243522644042969, "learning_rate": 0.0002, "epoch": 1.7077175697865354, "step": 1560}, {"loss": 0.9482, "grad_norm": 0.34277570247650146, "learning_rate": 0.0002, "epoch": 1.718664477285167, "step": 1570}, {"loss": 0.974, "grad_norm": 0.3279995024204254, "learning_rate": 0.0002, "epoch": 1.7296113847837986, "step": 1580}, {"loss": 0.9414, "grad_norm": 0.41968777775764465, "learning_rate": 0.0002, "epoch": 1.7405582922824303, "step": 1590}, {"loss": 0.9768, "grad_norm": 0.39464613795280457, "learning_rate": 0.0002, "epoch": 1.7515051997810618, "step": 1600}, {"loss": 1.0347, "grad_norm": 0.3839009404182434, "learning_rate": 0.0002, "epoch": 1.7624521072796935, "step": 1610}, {"loss": 0.9195, "grad_norm": 0.3250715434551239, "learning_rate": 0.0002, "epoch": 1.7733990147783252, "step": 1620}, {"loss": 1.0049, "grad_norm": 0.5166561007499695, "learning_rate": 0.0002, "epoch": 1.7843459222769567, "step": 1630}, {"loss": 1.0364, "grad_norm": 0.4115183353424072, "learning_rate": 0.0002, "epoch": 1.7952928297755884, "step": 1640}, {"loss": 1.0248, "grad_norm": 0.373780220746994, "learning_rate": 0.0002, "epoch": 1.80623973727422, "step": 1650}, {"loss": 1.0984, "grad_norm": 0.49697014689445496, "learning_rate": 0.0002, "epoch": 1.8171866447728515, "step": 1660}, {"loss": 1.0089, "grad_norm": 1.0308938026428223, "learning_rate": 0.0002, "epoch": 1.8281335522714834, "step": 1670}, {"loss": 1.0853, "grad_norm": 0.4851366877555847, "learning_rate": 0.0002, "epoch": 1.839080459770115, "step": 1680}, {"loss": 0.9533, "grad_norm": 0.3262481391429901, "learning_rate": 0.0002, "epoch": 1.8500273672687466, "step": 1690}, {"loss": 1.048, "grad_norm": 0.6904496550559998, "learning_rate": 0.0002, "epoch": 1.8609742747673783, "step": 1700}, {"loss": 1.0577, "grad_norm": 0.49789851903915405, "learning_rate": 0.0002, "epoch": 1.8719211822660098, "step": 1710}, {"loss": 1.0258, "grad_norm": 0.3035794198513031, "learning_rate": 0.0002, "epoch": 1.8828680897646415, "step": 1720}, {"loss": 0.9916, "grad_norm": 0.4588414430618286, "learning_rate": 0.0002, "epoch": 1.8938149972632732, "step": 1730}, {"loss": 0.9526, "grad_norm": 0.4313034117221832, "learning_rate": 0.0002, "epoch": 1.9047619047619047, "step": 1740}, {"loss": 1.0857, "grad_norm": 0.38562044501304626, "learning_rate": 0.0002, "epoch": 1.9157088122605364, "step": 1750}, {"loss": 0.9763, "grad_norm": 0.46947410702705383, "learning_rate": 0.0002, "epoch": 1.926655719759168, "step": 1760}, {"loss": 0.9967, "grad_norm": 0.3848404884338379, "learning_rate": 0.0002, "epoch": 1.9376026272577995, "step": 1770}, {"loss": 1.0474, "grad_norm": 0.30422744154930115, "learning_rate": 0.0002, "epoch": 1.9485495347564314, "step": 1780}, {"loss": 1.022, "grad_norm": 0.41100990772247314, "learning_rate": 0.0002, "epoch": 1.959496442255063, "step": 1790}, {"loss": 1.003, "grad_norm": 0.3492335081100464, "learning_rate": 0.0002, "epoch": 1.9704433497536946, "step": 1800}, {"loss": 0.9891, "grad_norm": 0.364577978849411, "learning_rate": 0.0002, "epoch": 1.9813902572523263, "step": 1810}, {"loss": 1.0699, "grad_norm": 0.4312075674533844, "learning_rate": 0.0002, "epoch": 1.9923371647509578, "step": 1820}, {"eval_loss": 1.14472496509552, "eval_runtime": 46.0786, "eval_samples_per_second": 9.462, "eval_steps_per_second": 1.194, "epoch": 2.0, "step": 1827}, {"loss": 1.0218, "grad_norm": 0.5989689230918884, "learning_rate": 0.0002, "epoch": 2.0032840722495897, "step": 1830}, {"loss": 0.9384, "grad_norm": 0.49720922112464905, "learning_rate": 0.0002, "epoch": 2.014230979748221, "step": 1840}, {"loss": 0.8482, "grad_norm": 0.42675456404685974, "learning_rate": 0.0002, "epoch": 2.0251778872468527, "step": 1850}, {"loss": 0.8487, "grad_norm": 0.4637208580970764, "learning_rate": 0.0002, "epoch": 2.0361247947454846, "step": 1860}, {"loss": 0.8212, "grad_norm": 0.8329976797103882, "learning_rate": 0.0002, "epoch": 2.047071702244116, "step": 1870}, {"loss": 0.9547, "grad_norm": 0.7869427800178528, "learning_rate": 0.0002, "epoch": 2.0580186097427475, "step": 1880}, {"loss": 0.9351, "grad_norm": 0.4927455186843872, "learning_rate": 0.0002, "epoch": 2.0689655172413794, "step": 1890}, {"loss": 0.8652, "grad_norm": 0.6264246702194214, "learning_rate": 0.0002, "epoch": 2.079912424740011, "step": 1900}, {"loss": 0.9095, "grad_norm": 1.1164122819900513, "learning_rate": 0.0002, "epoch": 2.0908593322386424, "step": 1910}, {"loss": 0.7823, "grad_norm": 0.5283981561660767, "learning_rate": 0.0002, "epoch": 2.1018062397372743, "step": 1920}, {"loss": 0.8065, "grad_norm": 0.45621731877326965, "learning_rate": 0.0002, "epoch": 2.112753147235906, "step": 1930}, {"loss": 0.9184, "grad_norm": 1.381791591644287, "learning_rate": 0.0002, "epoch": 2.1237000547345373, "step": 1940}, {"loss": 0.9006, "grad_norm": 0.5151259899139404, "learning_rate": 0.0002, "epoch": 2.134646962233169, "step": 1950}, {"loss": 0.8436, "grad_norm": 0.9806339740753174, "learning_rate": 0.0002, "epoch": 2.1455938697318007, "step": 1960}, {"loss": 0.8749, "grad_norm": 0.4734154939651489, "learning_rate": 0.0002, "epoch": 2.1565407772304326, "step": 1970}, {"loss": 0.9172, "grad_norm": 0.9553168416023254, "learning_rate": 0.0002, "epoch": 2.167487684729064, "step": 1980}, {"loss": 0.8047, "grad_norm": 0.5895838141441345, "learning_rate": 0.0002, "epoch": 2.1784345922276955, "step": 1990}, {"loss": 0.7841, "grad_norm": 0.4488855302333832, "learning_rate": 0.0002, "epoch": 2.1893814997263275, "step": 2000}, {"loss": 0.8205, "grad_norm": 1.0760235786437988, "learning_rate": 0.0002, "epoch": 2.200328407224959, "step": 2010}, {"loss": 0.7923, "grad_norm": 0.5038785338401794, "learning_rate": 0.0002, "epoch": 2.2112753147235904, "step": 2020}, {"loss": 0.8973, "grad_norm": 0.59819495677948, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 2030}, {"loss": 0.8009, "grad_norm": 0.5012075304985046, "learning_rate": 0.0002, "epoch": 2.233169129720854, "step": 2040}, {"loss": 0.9087, "grad_norm": 0.44978439807891846, "learning_rate": 0.0002, "epoch": 2.2441160372194853, "step": 2050}, {"loss": 0.823, "grad_norm": 0.5350462198257446, "learning_rate": 0.0002, "epoch": 2.255062944718117, "step": 2060}, {"loss": 0.8335, "grad_norm": 0.6020669937133789, "learning_rate": 0.0002, "epoch": 2.2660098522167487, "step": 2070}, {"loss": 0.8023, "grad_norm": 0.5246821045875549, "learning_rate": 0.0002, "epoch": 2.2769567597153806, "step": 2080}, {"loss": 0.8984, "grad_norm": 0.5711268782615662, "learning_rate": 0.0002, "epoch": 2.287903667214012, "step": 2090}, {"loss": 0.9093, "grad_norm": 0.617317259311676, "learning_rate": 0.0002, "epoch": 2.2988505747126435, "step": 2100}, {"loss": 0.8311, "grad_norm": 0.8608947396278381, "learning_rate": 0.0002, "epoch": 2.3097974822112755, "step": 2110}, {"loss": 0.7839, "grad_norm": 0.4739076793193817, "learning_rate": 0.0002, "epoch": 2.320744389709907, "step": 2120}, {"loss": 0.84, "grad_norm": 0.5538856983184814, "learning_rate": 0.0002, "epoch": 2.3316912972085384, "step": 2130}, {"loss": 0.8994, "grad_norm": 0.6064935326576233, "learning_rate": 0.0002, "epoch": 2.3426382047071703, "step": 2140}, {"loss": 0.7765, "grad_norm": 0.5019068121910095, "learning_rate": 0.0002, "epoch": 2.353585112205802, "step": 2150}, {"loss": 0.9576, "grad_norm": 0.45340514183044434, "learning_rate": 0.0002, "epoch": 2.3645320197044333, "step": 2160}, {"loss": 0.9254, "grad_norm": 0.7347203493118286, "learning_rate": 0.0002, "epoch": 2.375478927203065, "step": 2170}, {"loss": 0.8507, "grad_norm": 0.46922534704208374, "learning_rate": 0.0002, "epoch": 2.3864258347016967, "step": 2180}, {"loss": 0.8325, "grad_norm": 0.5507845878601074, "learning_rate": 0.0002, "epoch": 2.3973727422003286, "step": 2190}, {"loss": 0.8178, "grad_norm": 0.5621911883354187, "learning_rate": 0.0002, "epoch": 2.40831964969896, "step": 2200}, {"loss": 0.8569, "grad_norm": 0.5023514032363892, "learning_rate": 0.0002, "epoch": 2.4192665571975915, "step": 2210}, {"loss": 0.8355, "grad_norm": 0.6124861240386963, "learning_rate": 0.0002, "epoch": 2.4302134646962235, "step": 2220}, {"loss": 0.885, "grad_norm": 0.49614205956459045, "learning_rate": 0.0002, "epoch": 2.441160372194855, "step": 2230}, {"loss": 0.8008, "grad_norm": 0.6477900743484497, "learning_rate": 0.0002, "epoch": 2.4521072796934864, "step": 2240}, {"loss": 0.8622, "grad_norm": 0.5868843793869019, "learning_rate": 0.0002, "epoch": 2.4630541871921183, "step": 2250}, {"loss": 0.8498, "grad_norm": 0.4364610016345978, "learning_rate": 0.0002, "epoch": 2.47400109469075, "step": 2260}, {"loss": 0.8378, "grad_norm": 0.5792964696884155, "learning_rate": 0.0002, "epoch": 2.4849480021893813, "step": 2270}, {"loss": 0.8743, "grad_norm": 0.5421269536018372, "learning_rate": 0.0002, "epoch": 2.495894909688013, "step": 2280}, {"loss": 0.9637, "grad_norm": 0.5525493025779724, "learning_rate": 0.0002, "epoch": 2.5068418171866447, "step": 2290}, {"loss": 0.8075, "grad_norm": 0.6463850140571594, "learning_rate": 0.0002, "epoch": 2.5177887246852766, "step": 2300}, {"loss": 0.8591, "grad_norm": 0.6861311793327332, "learning_rate": 0.0002, "epoch": 2.528735632183908, "step": 2310}, {"loss": 0.9287, "grad_norm": 0.5563555359840393, "learning_rate": 0.0002, "epoch": 2.5396825396825395, "step": 2320}, {"loss": 0.945, "grad_norm": 0.5721169114112854, "learning_rate": 0.0002, "epoch": 2.5506294471811715, "step": 2330}, {"loss": 0.8271, "grad_norm": 0.5258274674415588, "learning_rate": 0.0002, "epoch": 2.561576354679803, "step": 2340}, {"loss": 0.8515, "grad_norm": 0.7057380676269531, "learning_rate": 0.0002, "epoch": 2.572523262178435, "step": 2350}, {"loss": 0.8615, "grad_norm": 0.6869027614593506, "learning_rate": 0.0002, "epoch": 2.5834701696770663, "step": 2360}, {"loss": 0.8043, "grad_norm": 0.4960809648036957, "learning_rate": 0.0002, "epoch": 2.594417077175698, "step": 2370}, {"loss": 0.8476, "grad_norm": 0.9288380146026611, "learning_rate": 0.0002, "epoch": 2.6053639846743293, "step": 2380}, {"loss": 0.873, "grad_norm": 0.3765334188938141, "learning_rate": 0.0002, "epoch": 2.616310892172961, "step": 2390}, {"loss": 0.8764, "grad_norm": 0.7487865686416626, "learning_rate": 0.0002, "epoch": 2.6272577996715927, "step": 2400}, {"loss": 0.7577, "grad_norm": 0.6141156554222107, "learning_rate": 0.0002, "epoch": 2.6382047071702246, "step": 2410}, {"loss": 0.8534, "grad_norm": 0.8420507907867432, "learning_rate": 0.0002, "epoch": 2.649151614668856, "step": 2420}, {"loss": 0.8311, "grad_norm": 0.53386390209198, "learning_rate": 0.0002, "epoch": 2.6600985221674875, "step": 2430}, {"loss": 0.8486, "grad_norm": 0.5520607233047485, "learning_rate": 0.0002, "epoch": 2.6710454296661195, "step": 2440}, {"loss": 0.8686, "grad_norm": 0.5337599515914917, "learning_rate": 0.0002, "epoch": 2.681992337164751, "step": 2450}, {"loss": 0.792, "grad_norm": 0.48790836334228516, "learning_rate": 0.0002, "epoch": 2.692939244663383, "step": 2460}, {"loss": 0.7864, "grad_norm": 0.8287786245346069, "learning_rate": 0.0002, "epoch": 2.7038861521620143, "step": 2470}, {"loss": 0.8244, "grad_norm": 0.5876168608665466, "learning_rate": 0.0002, "epoch": 2.714833059660646, "step": 2480}, {"loss": 0.8773, "grad_norm": 0.5206760764122009, "learning_rate": 0.0002, "epoch": 2.7257799671592773, "step": 2490}, {"loss": 0.8097, "grad_norm": 0.5619136691093445, "learning_rate": 0.0002, "epoch": 2.736726874657909, "step": 2500}, {"loss": 0.8377, "grad_norm": 0.5614883899688721, "learning_rate": 0.0002, "epoch": 2.7476737821565407, "step": 2510}, {"loss": 0.8817, "grad_norm": 0.6157700419425964, "learning_rate": 0.0002, "epoch": 2.7586206896551726, "step": 2520}, {"loss": 0.846, "grad_norm": 0.5529953837394714, "learning_rate": 0.0002, "epoch": 2.769567597153804, "step": 2530}, {"loss": 0.8881, "grad_norm": 0.6731224060058594, "learning_rate": 0.0002, "epoch": 2.7805145046524355, "step": 2540}, {"loss": 0.9125, "grad_norm": 0.6960386633872986, "learning_rate": 0.0002, "epoch": 2.7914614121510675, "step": 2550}, {"loss": 0.7823, "grad_norm": 0.5203493237495422, "learning_rate": 0.0002, "epoch": 2.802408319649699, "step": 2560}, {"loss": 0.7951, "grad_norm": 1.036837100982666, "learning_rate": 0.0002, "epoch": 2.813355227148331, "step": 2570}, {"loss": 0.8427, "grad_norm": 0.6125805377960205, "learning_rate": 0.0002, "epoch": 2.8243021346469623, "step": 2580}, {"loss": 0.8939, "grad_norm": 0.6298092603683472, "learning_rate": 0.0002, "epoch": 2.835249042145594, "step": 2590}, {"loss": 0.8845, "grad_norm": 0.5882203578948975, "learning_rate": 0.0002, "epoch": 2.8461959496442253, "step": 2600}, {"loss": 0.8921, "grad_norm": 0.8619399666786194, "learning_rate": 0.0002, "epoch": 2.857142857142857, "step": 2610}, {"loss": 0.8213, "grad_norm": 0.4722687304019928, "learning_rate": 0.0002, "epoch": 2.8680897646414887, "step": 2620}, {"loss": 0.9147, "grad_norm": 0.47399574518203735, "learning_rate": 0.0002, "epoch": 2.8790366721401206, "step": 2630}, {"loss": 0.8325, "grad_norm": 0.5639172792434692, "learning_rate": 0.0002, "epoch": 2.889983579638752, "step": 2640}, {"loss": 0.8653, "grad_norm": 0.4676816761493683, "learning_rate": 0.0002, "epoch": 2.9009304871373836, "step": 2650}, {"loss": 0.8966, "grad_norm": 0.6906291246414185, "learning_rate": 0.0002, "epoch": 2.9118773946360155, "step": 2660}, {"loss": 0.8966, "grad_norm": 0.4369746148586273, "learning_rate": 0.0002, "epoch": 2.922824302134647, "step": 2670}, {"loss": 0.9173, "grad_norm": 0.46423083543777466, "learning_rate": 0.0002, "epoch": 2.933771209633279, "step": 2680}, {"loss": 0.7961, "grad_norm": 0.5700525045394897, "learning_rate": 0.0002, "epoch": 2.9447181171319103, "step": 2690}, {"loss": 0.8172, "grad_norm": 0.6221476793289185, "learning_rate": 0.0002, "epoch": 2.955665024630542, "step": 2700}, {"loss": 0.8538, "grad_norm": 0.6102682948112488, "learning_rate": 0.0002, "epoch": 2.9666119321291733, "step": 2710}, {"loss": 0.9779, "grad_norm": 0.5317878723144531, "learning_rate": 0.0002, "epoch": 2.977558839627805, "step": 2720}, {"loss": 0.9314, "grad_norm": 0.4438510835170746, "learning_rate": 0.0002, "epoch": 2.9885057471264367, "step": 2730}, {"loss": 0.8553, "grad_norm": 0.5022130012512207, "learning_rate": 0.0002, "epoch": 2.9994526546250686, "step": 2740}]} +{"epoch": 4.0, "step": 3654, "epoch_duration": 1363.5929102897644, "total_accumulated_duration": 5453.118346214294, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28746.0}, "peak_memory_reserved": {"GPU_0": 28746.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4039, "grad_norm": 0.7611560821533203, "learning_rate": 0.0002, "epoch": 0.010946907498631636, "step": 10}, {"loss": 1.8818, "grad_norm": 0.4633193612098694, "learning_rate": 0.0002, "epoch": 0.021893814997263273, "step": 20}, {"loss": 1.5927, "grad_norm": 0.49326154589653015, "learning_rate": 0.0002, "epoch": 0.03284072249589491, "step": 30}, {"loss": 1.3859, "grad_norm": 0.3943138122558594, "learning_rate": 0.0002, "epoch": 0.043787629994526546, "step": 40}, {"loss": 1.3036, "grad_norm": 0.43292930722236633, "learning_rate": 0.0002, "epoch": 0.05473453749315818, "step": 50}, {"loss": 1.2427, "grad_norm": 0.3431817591190338, "learning_rate": 0.0002, "epoch": 0.06568144499178982, "step": 60}, {"loss": 1.3076, "grad_norm": 0.38011446595191956, "learning_rate": 0.0002, "epoch": 0.07662835249042145, "step": 70}, {"loss": 1.1641, "grad_norm": 0.7441071271896362, "learning_rate": 0.0002, "epoch": 0.08757525998905309, "step": 80}, {"loss": 1.1721, "grad_norm": 0.3359833061695099, "learning_rate": 0.0002, "epoch": 0.09852216748768473, "step": 90}, {"loss": 1.2774, "grad_norm": 0.3724392354488373, "learning_rate": 0.0002, "epoch": 0.10946907498631636, "step": 100}, {"loss": 1.216, "grad_norm": 0.40673762559890747, "learning_rate": 0.0002, "epoch": 0.120415982484948, "step": 110}, {"loss": 1.2611, "grad_norm": 0.40036800503730774, "learning_rate": 0.0002, "epoch": 0.13136288998357964, "step": 120}, {"loss": 1.2436, "grad_norm": 2.844191312789917, "learning_rate": 0.0002, "epoch": 0.1423097974822113, "step": 130}, {"loss": 1.2254, "grad_norm": 0.3104734420776367, "learning_rate": 0.0002, "epoch": 0.1532567049808429, "step": 140}, {"loss": 1.0498, "grad_norm": 0.3266797959804535, "learning_rate": 0.0002, "epoch": 0.16420361247947454, "step": 150}, {"loss": 1.1644, "grad_norm": 0.3079199194908142, "learning_rate": 0.0002, "epoch": 0.17515051997810618, "step": 160}, {"loss": 1.2923, "grad_norm": 0.3872479498386383, "learning_rate": 0.0002, "epoch": 0.18609742747673783, "step": 170}, {"loss": 1.1809, "grad_norm": 0.38654500246047974, "learning_rate": 0.0002, "epoch": 0.19704433497536947, "step": 180}, {"loss": 1.066, "grad_norm": 0.2913552522659302, "learning_rate": 0.0002, "epoch": 0.20799124247400108, "step": 190}, {"loss": 1.0868, "grad_norm": 0.2960572838783264, "learning_rate": 0.0002, "epoch": 0.21893814997263272, "step": 200}, {"loss": 1.136, "grad_norm": 0.5175501108169556, "learning_rate": 0.0002, "epoch": 0.22988505747126436, "step": 210}, {"loss": 1.1556, "grad_norm": 1.2921574115753174, "learning_rate": 0.0002, "epoch": 0.240831964969896, "step": 220}, {"loss": 1.1488, "grad_norm": 0.2675512135028839, "learning_rate": 0.0002, "epoch": 0.25177887246852765, "step": 230}, {"loss": 1.2764, "grad_norm": 0.3956190049648285, "learning_rate": 0.0002, "epoch": 0.2627257799671593, "step": 240}, {"loss": 1.1889, "grad_norm": 0.6022581458091736, "learning_rate": 0.0002, "epoch": 0.27367268746579093, "step": 250}, {"loss": 1.1981, "grad_norm": 1.1949563026428223, "learning_rate": 0.0002, "epoch": 0.2846195949644226, "step": 260}, {"loss": 1.1877, "grad_norm": 0.31173548102378845, "learning_rate": 0.0002, "epoch": 0.2955665024630542, "step": 270}, {"loss": 1.06, "grad_norm": 0.2808472812175751, "learning_rate": 0.0002, "epoch": 0.3065134099616858, "step": 280}, {"loss": 1.1752, "grad_norm": 0.3042023777961731, "learning_rate": 0.0002, "epoch": 0.31746031746031744, "step": 290}, {"loss": 1.3147, "grad_norm": 0.39915043115615845, "learning_rate": 0.0002, "epoch": 0.3284072249589491, "step": 300}, {"loss": 1.2425, "grad_norm": 0.39118197560310364, "learning_rate": 0.0002, "epoch": 0.3393541324575807, "step": 310}, {"loss": 1.1363, "grad_norm": 0.355010986328125, "learning_rate": 0.0002, "epoch": 0.35030103995621237, "step": 320}, {"loss": 1.1925, "grad_norm": 0.29734086990356445, "learning_rate": 0.0002, "epoch": 0.361247947454844, "step": 330}, {"loss": 1.1974, "grad_norm": 0.346096009016037, "learning_rate": 0.0002, "epoch": 0.37219485495347565, "step": 340}, {"loss": 1.1641, "grad_norm": 0.4829643666744232, "learning_rate": 0.0002, "epoch": 0.3831417624521073, "step": 350}, {"loss": 1.2808, "grad_norm": 0.4726872742176056, "learning_rate": 0.0002, "epoch": 0.39408866995073893, "step": 360}, {"loss": 1.1532, "grad_norm": 0.3130153715610504, "learning_rate": 0.0002, "epoch": 0.4050355774493706, "step": 370}, {"loss": 1.1842, "grad_norm": 0.5123590230941772, "learning_rate": 0.0002, "epoch": 0.41598248494800216, "step": 380}, {"loss": 1.1539, "grad_norm": 0.3444574773311615, "learning_rate": 0.0002, "epoch": 0.4269293924466338, "step": 390}, {"loss": 1.1756, "grad_norm": 0.5302175283432007, "learning_rate": 0.0002, "epoch": 0.43787629994526545, "step": 400}, {"loss": 1.1138, "grad_norm": 0.2713572680950165, "learning_rate": 0.0002, "epoch": 0.4488232074438971, "step": 410}, {"loss": 1.1281, "grad_norm": 0.29530611634254456, "learning_rate": 0.0002, "epoch": 0.45977011494252873, "step": 420}, {"loss": 1.1721, "grad_norm": 0.27282455563545227, "learning_rate": 0.0002, "epoch": 0.47071702244116037, "step": 430}, {"loss": 1.1213, "grad_norm": 0.2647949755191803, "learning_rate": 0.0002, "epoch": 0.481663929939792, "step": 440}, {"loss": 1.1656, "grad_norm": 0.35509347915649414, "learning_rate": 0.0002, "epoch": 0.49261083743842365, "step": 450}, {"loss": 1.1251, "grad_norm": 0.1959609091281891, "learning_rate": 0.0002, "epoch": 0.5035577449370553, "step": 460}, {"loss": 1.1889, "grad_norm": 0.40090155601501465, "learning_rate": 0.0002, "epoch": 0.5145046524356869, "step": 470}, {"loss": 1.156, "grad_norm": 0.3354604244232178, "learning_rate": 0.0002, "epoch": 0.5254515599343186, "step": 480}, {"loss": 1.1024, "grad_norm": 0.2758506238460541, "learning_rate": 0.0002, "epoch": 0.5363984674329502, "step": 490}, {"loss": 1.1108, "grad_norm": 0.3629051744937897, "learning_rate": 0.0002, "epoch": 0.5473453749315819, "step": 500}, {"loss": 1.2236, "grad_norm": 0.30802229046821594, "learning_rate": 0.0002, "epoch": 0.5582922824302134, "step": 510}, {"loss": 1.0424, "grad_norm": 0.3099463880062103, "learning_rate": 0.0002, "epoch": 0.5692391899288451, "step": 520}, {"loss": 1.255, "grad_norm": 0.42299067974090576, "learning_rate": 0.0002, "epoch": 0.5801860974274767, "step": 530}, {"loss": 1.1698, "grad_norm": 0.5392252802848816, "learning_rate": 0.0002, "epoch": 0.5911330049261084, "step": 540}, {"loss": 1.1171, "grad_norm": 0.34768250584602356, "learning_rate": 0.0002, "epoch": 0.60207991242474, "step": 550}, {"loss": 1.102, "grad_norm": 0.28490015864372253, "learning_rate": 0.0002, "epoch": 0.6130268199233716, "step": 560}, {"loss": 1.1111, "grad_norm": 0.34787994623184204, "learning_rate": 0.0002, "epoch": 0.6239737274220033, "step": 570}, {"loss": 1.0759, "grad_norm": 0.29058772325515747, "learning_rate": 0.0002, "epoch": 0.6349206349206349, "step": 580}, {"loss": 1.1157, "grad_norm": 0.4063778817653656, "learning_rate": 0.0002, "epoch": 0.6458675424192666, "step": 590}, {"loss": 1.1432, "grad_norm": 0.9244267344474792, "learning_rate": 0.0002, "epoch": 0.6568144499178982, "step": 600}, {"loss": 1.0591, "grad_norm": 0.27605190873146057, "learning_rate": 0.0002, "epoch": 0.6677613574165299, "step": 610}, {"loss": 1.2123, "grad_norm": 0.34346821904182434, "learning_rate": 0.0002, "epoch": 0.6787082649151615, "step": 620}, {"loss": 1.2195, "grad_norm": 0.3093279302120209, "learning_rate": 0.0002, "epoch": 0.6896551724137931, "step": 630}, {"loss": 1.2461, "grad_norm": 1.0069009065628052, "learning_rate": 0.0002, "epoch": 0.7006020799124247, "step": 640}, {"loss": 1.0892, "grad_norm": 0.5049130916595459, "learning_rate": 0.0002, "epoch": 0.7115489874110563, "step": 650}, {"loss": 1.1404, "grad_norm": 0.3748924732208252, "learning_rate": 0.0002, "epoch": 0.722495894909688, "step": 660}, {"loss": 1.1062, "grad_norm": 0.2964959144592285, "learning_rate": 0.0002, "epoch": 0.7334428024083196, "step": 670}, {"loss": 1.2617, "grad_norm": 0.4599986970424652, "learning_rate": 0.0002, "epoch": 0.7443897099069513, "step": 680}, {"loss": 1.088, "grad_norm": 0.27292951941490173, "learning_rate": 0.0002, "epoch": 0.7553366174055829, "step": 690}, {"loss": 1.2047, "grad_norm": 0.3123566806316376, "learning_rate": 0.0002, "epoch": 0.7662835249042146, "step": 700}, {"loss": 1.0021, "grad_norm": 0.28310710191726685, "learning_rate": 0.0002, "epoch": 0.7772304324028462, "step": 710}, {"loss": 1.2281, "grad_norm": 0.3279992341995239, "learning_rate": 0.0002, "epoch": 0.7881773399014779, "step": 720}, {"loss": 1.25, "grad_norm": 0.28179168701171875, "learning_rate": 0.0002, "epoch": 0.7991242474001095, "step": 730}, {"loss": 1.0602, "grad_norm": 0.31492987275123596, "learning_rate": 0.0002, "epoch": 0.8100711548987412, "step": 740}, {"loss": 1.2518, "grad_norm": 0.41821011900901794, "learning_rate": 0.0002, "epoch": 0.8210180623973727, "step": 750}, {"loss": 1.1612, "grad_norm": 0.325235515832901, "learning_rate": 0.0002, "epoch": 0.8319649698960043, "step": 760}, {"loss": 1.27, "grad_norm": 0.5366070866584778, "learning_rate": 0.0002, "epoch": 0.842911877394636, "step": 770}, {"loss": 1.0921, "grad_norm": 0.32570579648017883, "learning_rate": 0.0002, "epoch": 0.8538587848932676, "step": 780}, {"loss": 1.1032, "grad_norm": 0.3642968237400055, "learning_rate": 0.0002, "epoch": 0.8648056923918993, "step": 790}, {"loss": 1.1234, "grad_norm": 0.29713448882102966, "learning_rate": 0.0002, "epoch": 0.8757525998905309, "step": 800}, {"loss": 1.0978, "grad_norm": 0.23599444329738617, "learning_rate": 0.0002, "epoch": 0.8866995073891626, "step": 810}, {"loss": 1.1867, "grad_norm": 0.31522464752197266, "learning_rate": 0.0002, "epoch": 0.8976464148877942, "step": 820}, {"loss": 1.0208, "grad_norm": 0.32754790782928467, "learning_rate": 0.0002, "epoch": 0.9085933223864259, "step": 830}, {"loss": 0.9786, "grad_norm": 0.22741089761257172, "learning_rate": 0.0002, "epoch": 0.9195402298850575, "step": 840}, {"loss": 1.0689, "grad_norm": 0.3089679777622223, "learning_rate": 0.0002, "epoch": 0.9304871373836892, "step": 850}, {"loss": 1.0354, "grad_norm": 0.27440521121025085, "learning_rate": 0.0002, "epoch": 0.9414340448823207, "step": 860}, {"loss": 1.0417, "grad_norm": 0.3498363792896271, "learning_rate": 0.0002, "epoch": 0.9523809523809523, "step": 870}, {"loss": 1.269, "grad_norm": 0.47151854634284973, "learning_rate": 0.0002, "epoch": 0.963327859879584, "step": 880}, {"loss": 1.1174, "grad_norm": 0.24756591022014618, "learning_rate": 0.0002, "epoch": 0.9742747673782156, "step": 890}, {"loss": 1.0622, "grad_norm": 0.2600938677787781, "learning_rate": 0.0002, "epoch": 0.9852216748768473, "step": 900}, {"loss": 1.0968, "grad_norm": 0.2934586703777313, "learning_rate": 0.0002, "epoch": 0.9961685823754789, "step": 910}, {"eval_loss": 1.158464789390564, "eval_runtime": 46.0774, "eval_samples_per_second": 9.462, "eval_steps_per_second": 1.194, "epoch": 0.9994526546250684, "step": 913}, {"loss": 1.0322, "grad_norm": 0.37776654958724976, "learning_rate": 0.0002, "epoch": 1.0071154898741106, "step": 920}, {"loss": 1.1346, "grad_norm": 0.31784629821777344, "learning_rate": 0.0002, "epoch": 1.0180623973727423, "step": 930}, {"loss": 0.8976, "grad_norm": 0.24244336783885956, "learning_rate": 0.0002, "epoch": 1.0290093048713738, "step": 940}, {"loss": 1.0172, "grad_norm": 0.3185454308986664, "learning_rate": 0.0002, "epoch": 1.0399562123700055, "step": 950}, {"loss": 1.1645, "grad_norm": 0.3589441478252411, "learning_rate": 0.0002, "epoch": 1.0509031198686372, "step": 960}, {"loss": 1.0317, "grad_norm": 0.38593578338623047, "learning_rate": 0.0002, "epoch": 1.0618500273672686, "step": 970}, {"loss": 0.9913, "grad_norm": 0.39694955945014954, "learning_rate": 0.0002, "epoch": 1.0727969348659003, "step": 980}, {"loss": 1.0893, "grad_norm": 0.469817191362381, "learning_rate": 0.0002, "epoch": 1.083743842364532, "step": 990}, {"loss": 0.983, "grad_norm": 0.2634755074977875, "learning_rate": 0.0002, "epoch": 1.0946907498631637, "step": 1000}, {"loss": 1.0144, "grad_norm": 0.43189436197280884, "learning_rate": 0.0002, "epoch": 1.1056376573617952, "step": 1010}, {"loss": 0.9663, "grad_norm": 0.5559977889060974, "learning_rate": 0.0002, "epoch": 1.116584564860427, "step": 1020}, {"loss": 1.0481, "grad_norm": 0.32100191712379456, "learning_rate": 0.0002, "epoch": 1.1275314723590586, "step": 1030}, {"loss": 1.1012, "grad_norm": 0.40179768204689026, "learning_rate": 0.0002, "epoch": 1.1384783798576903, "step": 1040}, {"loss": 1.0029, "grad_norm": 0.3659493029117584, "learning_rate": 0.0002, "epoch": 1.1494252873563218, "step": 1050}, {"loss": 0.9597, "grad_norm": 0.701704204082489, "learning_rate": 0.0002, "epoch": 1.1603721948549535, "step": 1060}, {"loss": 1.0204, "grad_norm": 0.3650563359260559, "learning_rate": 0.0002, "epoch": 1.1713191023535852, "step": 1070}, {"loss": 0.907, "grad_norm": 0.3191976249217987, "learning_rate": 0.0002, "epoch": 1.1822660098522166, "step": 1080}, {"loss": 1.0648, "grad_norm": 0.3615441918373108, "learning_rate": 0.0002, "epoch": 1.1932129173508483, "step": 1090}, {"loss": 1.0067, "grad_norm": 0.39474231004714966, "learning_rate": 0.0002, "epoch": 1.20415982484948, "step": 1100}, {"loss": 0.9852, "grad_norm": 0.3752822279930115, "learning_rate": 0.0002, "epoch": 1.2151067323481117, "step": 1110}, {"loss": 1.0067, "grad_norm": 0.4165991246700287, "learning_rate": 0.0002, "epoch": 1.2260536398467432, "step": 1120}, {"loss": 1.0244, "grad_norm": 0.5326506495475769, "learning_rate": 0.0002, "epoch": 1.237000547345375, "step": 1130}, {"loss": 1.0542, "grad_norm": 0.48845794796943665, "learning_rate": 0.0002, "epoch": 1.2479474548440066, "step": 1140}, {"loss": 1.0885, "grad_norm": 0.29910150170326233, "learning_rate": 0.0002, "epoch": 1.2588943623426383, "step": 1150}, {"loss": 1.2233, "grad_norm": 0.5069725513458252, "learning_rate": 0.0002, "epoch": 1.2698412698412698, "step": 1160}, {"loss": 1.0992, "grad_norm": 0.29500406980514526, "learning_rate": 0.0002, "epoch": 1.2807881773399015, "step": 1170}, {"loss": 1.0291, "grad_norm": 0.4711538851261139, "learning_rate": 0.0002, "epoch": 1.2917350848385332, "step": 1180}, {"loss": 1.0513, "grad_norm": 0.4203340709209442, "learning_rate": 0.0002, "epoch": 1.3026819923371646, "step": 1190}, {"loss": 1.1274, "grad_norm": 0.36101874709129333, "learning_rate": 0.0002, "epoch": 1.3136288998357963, "step": 1200}, {"loss": 1.085, "grad_norm": 0.4608800411224365, "learning_rate": 0.0002, "epoch": 1.324575807334428, "step": 1210}, {"loss": 1.0695, "grad_norm": 0.6570906639099121, "learning_rate": 0.0002, "epoch": 1.3355227148330597, "step": 1220}, {"loss": 0.9786, "grad_norm": 0.5352164506912231, "learning_rate": 0.0002, "epoch": 1.3464696223316914, "step": 1230}, {"loss": 1.0513, "grad_norm": 0.3885001242160797, "learning_rate": 0.0002, "epoch": 1.357416529830323, "step": 1240}, {"loss": 1.0611, "grad_norm": 0.2987913489341736, "learning_rate": 0.0002, "epoch": 1.3683634373289546, "step": 1250}, {"loss": 1.109, "grad_norm": 0.42070427536964417, "learning_rate": 0.0002, "epoch": 1.3793103448275863, "step": 1260}, {"loss": 1.1092, "grad_norm": 0.5957782864570618, "learning_rate": 0.0002, "epoch": 1.3902572523262178, "step": 1270}, {"loss": 1.0554, "grad_norm": 0.32898882031440735, "learning_rate": 0.0002, "epoch": 1.4012041598248495, "step": 1280}, {"loss": 0.909, "grad_norm": 0.27624452114105225, "learning_rate": 0.0002, "epoch": 1.4121510673234812, "step": 1290}, {"loss": 1.024, "grad_norm": 0.49570828676223755, "learning_rate": 0.0002, "epoch": 1.4230979748221126, "step": 1300}, {"loss": 1.0471, "grad_norm": 0.26191383600234985, "learning_rate": 0.0002, "epoch": 1.4340448823207443, "step": 1310}, {"loss": 1.0788, "grad_norm": 0.35664042830467224, "learning_rate": 0.0002, "epoch": 1.444991789819376, "step": 1320}, {"loss": 1.0108, "grad_norm": 0.45126354694366455, "learning_rate": 0.0002, "epoch": 1.4559386973180077, "step": 1330}, {"loss": 0.9571, "grad_norm": 0.37318357825279236, "learning_rate": 0.0002, "epoch": 1.4668856048166394, "step": 1340}, {"loss": 1.0507, "grad_norm": 0.6428970098495483, "learning_rate": 0.0002, "epoch": 1.477832512315271, "step": 1350}, {"loss": 1.0367, "grad_norm": 0.43256187438964844, "learning_rate": 0.0002, "epoch": 1.4887794198139026, "step": 1360}, {"loss": 1.1321, "grad_norm": 0.5343793630599976, "learning_rate": 0.0002, "epoch": 1.4997263273125343, "step": 1370}, {"loss": 1.1054, "grad_norm": 0.315437376499176, "learning_rate": 0.0002, "epoch": 1.5106732348111658, "step": 1380}, {"loss": 0.8916, "grad_norm": 0.41561153531074524, "learning_rate": 0.0002, "epoch": 1.5216201423097975, "step": 1390}, {"loss": 1.0391, "grad_norm": 0.3201070725917816, "learning_rate": 0.0002, "epoch": 1.5325670498084292, "step": 1400}, {"loss": 1.14, "grad_norm": 0.505537211894989, "learning_rate": 0.0002, "epoch": 1.5435139573070606, "step": 1410}, {"loss": 1.0775, "grad_norm": 0.3747410178184509, "learning_rate": 0.0002, "epoch": 1.5544608648056923, "step": 1420}, {"loss": 1.1171, "grad_norm": 0.49385908246040344, "learning_rate": 0.0002, "epoch": 1.565407772304324, "step": 1430}, {"loss": 1.1182, "grad_norm": 0.49831628799438477, "learning_rate": 0.0002, "epoch": 1.5763546798029555, "step": 1440}, {"loss": 1.0079, "grad_norm": 0.372127890586853, "learning_rate": 0.0002, "epoch": 1.5873015873015874, "step": 1450}, {"loss": 0.9931, "grad_norm": 0.40070840716362, "learning_rate": 0.0002, "epoch": 1.598248494800219, "step": 1460}, {"loss": 0.8954, "grad_norm": 0.34907400608062744, "learning_rate": 0.0002, "epoch": 1.6091954022988506, "step": 1470}, {"loss": 0.9743, "grad_norm": 0.4632418751716614, "learning_rate": 0.0002, "epoch": 1.6201423097974823, "step": 1480}, {"loss": 1.0103, "grad_norm": 0.40164515376091003, "learning_rate": 0.0002, "epoch": 1.6310892172961138, "step": 1490}, {"loss": 0.9523, "grad_norm": 0.3214994966983795, "learning_rate": 0.0002, "epoch": 1.6420361247947455, "step": 1500}, {"loss": 1.0161, "grad_norm": 0.3727897107601166, "learning_rate": 0.0002, "epoch": 1.6529830322933772, "step": 1510}, {"loss": 1.0443, "grad_norm": 0.3817640542984009, "learning_rate": 0.0002, "epoch": 1.6639299397920086, "step": 1520}, {"loss": 1.0511, "grad_norm": 0.5592136979103088, "learning_rate": 0.0002, "epoch": 1.6748768472906403, "step": 1530}, {"loss": 0.9682, "grad_norm": 0.44636598229408264, "learning_rate": 0.0002, "epoch": 1.685823754789272, "step": 1540}, {"loss": 1.033, "grad_norm": 0.40441709756851196, "learning_rate": 0.0002, "epoch": 1.6967706622879035, "step": 1550}, {"loss": 1.0857, "grad_norm": 0.3243522644042969, "learning_rate": 0.0002, "epoch": 1.7077175697865354, "step": 1560}, {"loss": 0.9482, "grad_norm": 0.34277570247650146, "learning_rate": 0.0002, "epoch": 1.718664477285167, "step": 1570}, {"loss": 0.974, "grad_norm": 0.3279995024204254, "learning_rate": 0.0002, "epoch": 1.7296113847837986, "step": 1580}, {"loss": 0.9414, "grad_norm": 0.41968777775764465, "learning_rate": 0.0002, "epoch": 1.7405582922824303, "step": 1590}, {"loss": 0.9768, "grad_norm": 0.39464613795280457, "learning_rate": 0.0002, "epoch": 1.7515051997810618, "step": 1600}, {"loss": 1.0347, "grad_norm": 0.3839009404182434, "learning_rate": 0.0002, "epoch": 1.7624521072796935, "step": 1610}, {"loss": 0.9195, "grad_norm": 0.3250715434551239, "learning_rate": 0.0002, "epoch": 1.7733990147783252, "step": 1620}, {"loss": 1.0049, "grad_norm": 0.5166561007499695, "learning_rate": 0.0002, "epoch": 1.7843459222769567, "step": 1630}, {"loss": 1.0364, "grad_norm": 0.4115183353424072, "learning_rate": 0.0002, "epoch": 1.7952928297755884, "step": 1640}, {"loss": 1.0248, "grad_norm": 0.373780220746994, "learning_rate": 0.0002, "epoch": 1.80623973727422, "step": 1650}, {"loss": 1.0984, "grad_norm": 0.49697014689445496, "learning_rate": 0.0002, "epoch": 1.8171866447728515, "step": 1660}, {"loss": 1.0089, "grad_norm": 1.0308938026428223, "learning_rate": 0.0002, "epoch": 1.8281335522714834, "step": 1670}, {"loss": 1.0853, "grad_norm": 0.4851366877555847, "learning_rate": 0.0002, "epoch": 1.839080459770115, "step": 1680}, {"loss": 0.9533, "grad_norm": 0.3262481391429901, "learning_rate": 0.0002, "epoch": 1.8500273672687466, "step": 1690}, {"loss": 1.048, "grad_norm": 0.6904496550559998, "learning_rate": 0.0002, "epoch": 1.8609742747673783, "step": 1700}, {"loss": 1.0577, "grad_norm": 0.49789851903915405, "learning_rate": 0.0002, "epoch": 1.8719211822660098, "step": 1710}, {"loss": 1.0258, "grad_norm": 0.3035794198513031, "learning_rate": 0.0002, "epoch": 1.8828680897646415, "step": 1720}, {"loss": 0.9916, "grad_norm": 0.4588414430618286, "learning_rate": 0.0002, "epoch": 1.8938149972632732, "step": 1730}, {"loss": 0.9526, "grad_norm": 0.4313034117221832, "learning_rate": 0.0002, "epoch": 1.9047619047619047, "step": 1740}, {"loss": 1.0857, "grad_norm": 0.38562044501304626, "learning_rate": 0.0002, "epoch": 1.9157088122605364, "step": 1750}, {"loss": 0.9763, "grad_norm": 0.46947410702705383, "learning_rate": 0.0002, "epoch": 1.926655719759168, "step": 1760}, {"loss": 0.9967, "grad_norm": 0.3848404884338379, "learning_rate": 0.0002, "epoch": 1.9376026272577995, "step": 1770}, {"loss": 1.0474, "grad_norm": 0.30422744154930115, "learning_rate": 0.0002, "epoch": 1.9485495347564314, "step": 1780}, {"loss": 1.022, "grad_norm": 0.41100990772247314, "learning_rate": 0.0002, "epoch": 1.959496442255063, "step": 1790}, {"loss": 1.003, "grad_norm": 0.3492335081100464, "learning_rate": 0.0002, "epoch": 1.9704433497536946, "step": 1800}, {"loss": 0.9891, "grad_norm": 0.364577978849411, "learning_rate": 0.0002, "epoch": 1.9813902572523263, "step": 1810}, {"loss": 1.0699, "grad_norm": 0.4312075674533844, "learning_rate": 0.0002, "epoch": 1.9923371647509578, "step": 1820}, {"eval_loss": 1.14472496509552, "eval_runtime": 46.0786, "eval_samples_per_second": 9.462, "eval_steps_per_second": 1.194, "epoch": 2.0, "step": 1827}, {"loss": 1.0218, "grad_norm": 0.5989689230918884, "learning_rate": 0.0002, "epoch": 2.0032840722495897, "step": 1830}, {"loss": 0.9384, "grad_norm": 0.49720922112464905, "learning_rate": 0.0002, "epoch": 2.014230979748221, "step": 1840}, {"loss": 0.8482, "grad_norm": 0.42675456404685974, "learning_rate": 0.0002, "epoch": 2.0251778872468527, "step": 1850}, {"loss": 0.8487, "grad_norm": 0.4637208580970764, "learning_rate": 0.0002, "epoch": 2.0361247947454846, "step": 1860}, {"loss": 0.8212, "grad_norm": 0.8329976797103882, "learning_rate": 0.0002, "epoch": 2.047071702244116, "step": 1870}, {"loss": 0.9547, "grad_norm": 0.7869427800178528, "learning_rate": 0.0002, "epoch": 2.0580186097427475, "step": 1880}, {"loss": 0.9351, "grad_norm": 0.4927455186843872, "learning_rate": 0.0002, "epoch": 2.0689655172413794, "step": 1890}, {"loss": 0.8652, "grad_norm": 0.6264246702194214, "learning_rate": 0.0002, "epoch": 2.079912424740011, "step": 1900}, {"loss": 0.9095, "grad_norm": 1.1164122819900513, "learning_rate": 0.0002, "epoch": 2.0908593322386424, "step": 1910}, {"loss": 0.7823, "grad_norm": 0.5283981561660767, "learning_rate": 0.0002, "epoch": 2.1018062397372743, "step": 1920}, {"loss": 0.8065, "grad_norm": 0.45621731877326965, "learning_rate": 0.0002, "epoch": 2.112753147235906, "step": 1930}, {"loss": 0.9184, "grad_norm": 1.381791591644287, "learning_rate": 0.0002, "epoch": 2.1237000547345373, "step": 1940}, {"loss": 0.9006, "grad_norm": 0.5151259899139404, "learning_rate": 0.0002, "epoch": 2.134646962233169, "step": 1950}, {"loss": 0.8436, "grad_norm": 0.9806339740753174, "learning_rate": 0.0002, "epoch": 2.1455938697318007, "step": 1960}, {"loss": 0.8749, "grad_norm": 0.4734154939651489, "learning_rate": 0.0002, "epoch": 2.1565407772304326, "step": 1970}, {"loss": 0.9172, "grad_norm": 0.9553168416023254, "learning_rate": 0.0002, "epoch": 2.167487684729064, "step": 1980}, {"loss": 0.8047, "grad_norm": 0.5895838141441345, "learning_rate": 0.0002, "epoch": 2.1784345922276955, "step": 1990}, {"loss": 0.7841, "grad_norm": 0.4488855302333832, "learning_rate": 0.0002, "epoch": 2.1893814997263275, "step": 2000}, {"loss": 0.8205, "grad_norm": 1.0760235786437988, "learning_rate": 0.0002, "epoch": 2.200328407224959, "step": 2010}, {"loss": 0.7923, "grad_norm": 0.5038785338401794, "learning_rate": 0.0002, "epoch": 2.2112753147235904, "step": 2020}, {"loss": 0.8973, "grad_norm": 0.59819495677948, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 2030}, {"loss": 0.8009, "grad_norm": 0.5012075304985046, "learning_rate": 0.0002, "epoch": 2.233169129720854, "step": 2040}, {"loss": 0.9087, "grad_norm": 0.44978439807891846, "learning_rate": 0.0002, "epoch": 2.2441160372194853, "step": 2050}, {"loss": 0.823, "grad_norm": 0.5350462198257446, "learning_rate": 0.0002, "epoch": 2.255062944718117, "step": 2060}, {"loss": 0.8335, "grad_norm": 0.6020669937133789, "learning_rate": 0.0002, "epoch": 2.2660098522167487, "step": 2070}, {"loss": 0.8023, "grad_norm": 0.5246821045875549, "learning_rate": 0.0002, "epoch": 2.2769567597153806, "step": 2080}, {"loss": 0.8984, "grad_norm": 0.5711268782615662, "learning_rate": 0.0002, "epoch": 2.287903667214012, "step": 2090}, {"loss": 0.9093, "grad_norm": 0.617317259311676, "learning_rate": 0.0002, "epoch": 2.2988505747126435, "step": 2100}, {"loss": 0.8311, "grad_norm": 0.8608947396278381, "learning_rate": 0.0002, "epoch": 2.3097974822112755, "step": 2110}, {"loss": 0.7839, "grad_norm": 0.4739076793193817, "learning_rate": 0.0002, "epoch": 2.320744389709907, "step": 2120}, {"loss": 0.84, "grad_norm": 0.5538856983184814, "learning_rate": 0.0002, "epoch": 2.3316912972085384, "step": 2130}, {"loss": 0.8994, "grad_norm": 0.6064935326576233, "learning_rate": 0.0002, "epoch": 2.3426382047071703, "step": 2140}, {"loss": 0.7765, "grad_norm": 0.5019068121910095, "learning_rate": 0.0002, "epoch": 2.353585112205802, "step": 2150}, {"loss": 0.9576, "grad_norm": 0.45340514183044434, "learning_rate": 0.0002, "epoch": 2.3645320197044333, "step": 2160}, {"loss": 0.9254, "grad_norm": 0.7347203493118286, "learning_rate": 0.0002, "epoch": 2.375478927203065, "step": 2170}, {"loss": 0.8507, "grad_norm": 0.46922534704208374, "learning_rate": 0.0002, "epoch": 2.3864258347016967, "step": 2180}, {"loss": 0.8325, "grad_norm": 0.5507845878601074, "learning_rate": 0.0002, "epoch": 2.3973727422003286, "step": 2190}, {"loss": 0.8178, "grad_norm": 0.5621911883354187, "learning_rate": 0.0002, "epoch": 2.40831964969896, "step": 2200}, {"loss": 0.8569, "grad_norm": 0.5023514032363892, "learning_rate": 0.0002, "epoch": 2.4192665571975915, "step": 2210}, {"loss": 0.8355, "grad_norm": 0.6124861240386963, "learning_rate": 0.0002, "epoch": 2.4302134646962235, "step": 2220}, {"loss": 0.885, "grad_norm": 0.49614205956459045, "learning_rate": 0.0002, "epoch": 2.441160372194855, "step": 2230}, {"loss": 0.8008, "grad_norm": 0.6477900743484497, "learning_rate": 0.0002, "epoch": 2.4521072796934864, "step": 2240}, {"loss": 0.8622, "grad_norm": 0.5868843793869019, "learning_rate": 0.0002, "epoch": 2.4630541871921183, "step": 2250}, {"loss": 0.8498, "grad_norm": 0.4364610016345978, "learning_rate": 0.0002, "epoch": 2.47400109469075, "step": 2260}, {"loss": 0.8378, "grad_norm": 0.5792964696884155, "learning_rate": 0.0002, "epoch": 2.4849480021893813, "step": 2270}, {"loss": 0.8743, "grad_norm": 0.5421269536018372, "learning_rate": 0.0002, "epoch": 2.495894909688013, "step": 2280}, {"loss": 0.9637, "grad_norm": 0.5525493025779724, "learning_rate": 0.0002, "epoch": 2.5068418171866447, "step": 2290}, {"loss": 0.8075, "grad_norm": 0.6463850140571594, "learning_rate": 0.0002, "epoch": 2.5177887246852766, "step": 2300}, {"loss": 0.8591, "grad_norm": 0.6861311793327332, "learning_rate": 0.0002, "epoch": 2.528735632183908, "step": 2310}, {"loss": 0.9287, "grad_norm": 0.5563555359840393, "learning_rate": 0.0002, "epoch": 2.5396825396825395, "step": 2320}, {"loss": 0.945, "grad_norm": 0.5721169114112854, "learning_rate": 0.0002, "epoch": 2.5506294471811715, "step": 2330}, {"loss": 0.8271, "grad_norm": 0.5258274674415588, "learning_rate": 0.0002, "epoch": 2.561576354679803, "step": 2340}, {"loss": 0.8515, "grad_norm": 0.7057380676269531, "learning_rate": 0.0002, "epoch": 2.572523262178435, "step": 2350}, {"loss": 0.8615, "grad_norm": 0.6869027614593506, "learning_rate": 0.0002, "epoch": 2.5834701696770663, "step": 2360}, {"loss": 0.8043, "grad_norm": 0.4960809648036957, "learning_rate": 0.0002, "epoch": 2.594417077175698, "step": 2370}, {"loss": 0.8476, "grad_norm": 0.9288380146026611, "learning_rate": 0.0002, "epoch": 2.6053639846743293, "step": 2380}, {"loss": 0.873, "grad_norm": 0.3765334188938141, "learning_rate": 0.0002, "epoch": 2.616310892172961, "step": 2390}, {"loss": 0.8764, "grad_norm": 0.7487865686416626, "learning_rate": 0.0002, "epoch": 2.6272577996715927, "step": 2400}, {"loss": 0.7577, "grad_norm": 0.6141156554222107, "learning_rate": 0.0002, "epoch": 2.6382047071702246, "step": 2410}, {"loss": 0.8534, "grad_norm": 0.8420507907867432, "learning_rate": 0.0002, "epoch": 2.649151614668856, "step": 2420}, {"loss": 0.8311, "grad_norm": 0.53386390209198, "learning_rate": 0.0002, "epoch": 2.6600985221674875, "step": 2430}, {"loss": 0.8486, "grad_norm": 0.5520607233047485, "learning_rate": 0.0002, "epoch": 2.6710454296661195, "step": 2440}, {"loss": 0.8686, "grad_norm": 0.5337599515914917, "learning_rate": 0.0002, "epoch": 2.681992337164751, "step": 2450}, {"loss": 0.792, "grad_norm": 0.48790836334228516, "learning_rate": 0.0002, "epoch": 2.692939244663383, "step": 2460}, {"loss": 0.7864, "grad_norm": 0.8287786245346069, "learning_rate": 0.0002, "epoch": 2.7038861521620143, "step": 2470}, {"loss": 0.8244, "grad_norm": 0.5876168608665466, "learning_rate": 0.0002, "epoch": 2.714833059660646, "step": 2480}, {"loss": 0.8773, "grad_norm": 0.5206760764122009, "learning_rate": 0.0002, "epoch": 2.7257799671592773, "step": 2490}, {"loss": 0.8097, "grad_norm": 0.5619136691093445, "learning_rate": 0.0002, "epoch": 2.736726874657909, "step": 2500}, {"loss": 0.8377, "grad_norm": 0.5614883899688721, "learning_rate": 0.0002, "epoch": 2.7476737821565407, "step": 2510}, {"loss": 0.8817, "grad_norm": 0.6157700419425964, "learning_rate": 0.0002, "epoch": 2.7586206896551726, "step": 2520}, {"loss": 0.846, "grad_norm": 0.5529953837394714, "learning_rate": 0.0002, "epoch": 2.769567597153804, "step": 2530}, {"loss": 0.8881, "grad_norm": 0.6731224060058594, "learning_rate": 0.0002, "epoch": 2.7805145046524355, "step": 2540}, {"loss": 0.9125, "grad_norm": 0.6960386633872986, "learning_rate": 0.0002, "epoch": 2.7914614121510675, "step": 2550}, {"loss": 0.7823, "grad_norm": 0.5203493237495422, "learning_rate": 0.0002, "epoch": 2.802408319649699, "step": 2560}, {"loss": 0.7951, "grad_norm": 1.036837100982666, "learning_rate": 0.0002, "epoch": 2.813355227148331, "step": 2570}, {"loss": 0.8427, "grad_norm": 0.6125805377960205, "learning_rate": 0.0002, "epoch": 2.8243021346469623, "step": 2580}, {"loss": 0.8939, "grad_norm": 0.6298092603683472, "learning_rate": 0.0002, "epoch": 2.835249042145594, "step": 2590}, {"loss": 0.8845, "grad_norm": 0.5882203578948975, "learning_rate": 0.0002, "epoch": 2.8461959496442253, "step": 2600}, {"loss": 0.8921, "grad_norm": 0.8619399666786194, "learning_rate": 0.0002, "epoch": 2.857142857142857, "step": 2610}, {"loss": 0.8213, "grad_norm": 0.4722687304019928, "learning_rate": 0.0002, "epoch": 2.8680897646414887, "step": 2620}, {"loss": 0.9147, "grad_norm": 0.47399574518203735, "learning_rate": 0.0002, "epoch": 2.8790366721401206, "step": 2630}, {"loss": 0.8325, "grad_norm": 0.5639172792434692, "learning_rate": 0.0002, "epoch": 2.889983579638752, "step": 2640}, {"loss": 0.8653, "grad_norm": 0.4676816761493683, "learning_rate": 0.0002, "epoch": 2.9009304871373836, "step": 2650}, {"loss": 0.8966, "grad_norm": 0.6906291246414185, "learning_rate": 0.0002, "epoch": 2.9118773946360155, "step": 2660}, {"loss": 0.8966, "grad_norm": 0.4369746148586273, "learning_rate": 0.0002, "epoch": 2.922824302134647, "step": 2670}, {"loss": 0.9173, "grad_norm": 0.46423083543777466, "learning_rate": 0.0002, "epoch": 2.933771209633279, "step": 2680}, {"loss": 0.7961, "grad_norm": 0.5700525045394897, "learning_rate": 0.0002, "epoch": 2.9447181171319103, "step": 2690}, {"loss": 0.8172, "grad_norm": 0.6221476793289185, "learning_rate": 0.0002, "epoch": 2.955665024630542, "step": 2700}, {"loss": 0.8538, "grad_norm": 0.6102682948112488, "learning_rate": 0.0002, "epoch": 2.9666119321291733, "step": 2710}, {"loss": 0.9779, "grad_norm": 0.5317878723144531, "learning_rate": 0.0002, "epoch": 2.977558839627805, "step": 2720}, {"loss": 0.9314, "grad_norm": 0.4438510835170746, "learning_rate": 0.0002, "epoch": 2.9885057471264367, "step": 2730}, {"loss": 0.8553, "grad_norm": 0.5022130012512207, "learning_rate": 0.0002, "epoch": 2.9994526546250686, "step": 2740}, {"eval_loss": 1.1722838878631592, "eval_runtime": 46.0829, "eval_samples_per_second": 9.461, "eval_steps_per_second": 1.194, "epoch": 2.9994526546250686, "step": 2740}, {"loss": 0.6443, "grad_norm": 0.6384502053260803, "learning_rate": 0.0002, "epoch": 3.0103995621237, "step": 2750}, {"loss": 0.7123, "grad_norm": 0.9928722381591797, "learning_rate": 0.0002, "epoch": 3.0213464696223316, "step": 2760}, {"loss": 0.6045, "grad_norm": 0.7813051342964172, "learning_rate": 0.0002, "epoch": 3.0322933771209635, "step": 2770}, {"loss": 0.6042, "grad_norm": 1.0202556848526, "learning_rate": 0.0002, "epoch": 3.043240284619595, "step": 2780}, {"loss": 0.6356, "grad_norm": 0.7581062316894531, "learning_rate": 0.0002, "epoch": 3.0541871921182264, "step": 2790}, {"loss": 0.6349, "grad_norm": 0.6252710223197937, "learning_rate": 0.0002, "epoch": 3.0651340996168583, "step": 2800}, {"loss": 0.645, "grad_norm": 0.7738662958145142, "learning_rate": 0.0002, "epoch": 3.07608100711549, "step": 2810}, {"loss": 0.627, "grad_norm": 0.7381885051727295, "learning_rate": 0.0002, "epoch": 3.0870279146141213, "step": 2820}, {"loss": 0.6371, "grad_norm": 0.9197564721107483, "learning_rate": 0.0002, "epoch": 3.097974822112753, "step": 2830}, {"loss": 0.723, "grad_norm": 1.000976800918579, "learning_rate": 0.0002, "epoch": 3.1089217296113847, "step": 2840}, {"loss": 0.6631, "grad_norm": 0.7559131383895874, "learning_rate": 0.0002, "epoch": 3.1198686371100166, "step": 2850}, {"loss": 0.6252, "grad_norm": 0.7213780879974365, "learning_rate": 0.0002, "epoch": 3.130815544608648, "step": 2860}, {"loss": 0.6501, "grad_norm": 0.945939838886261, "learning_rate": 0.0002, "epoch": 3.1417624521072796, "step": 2870}, {"loss": 0.6129, "grad_norm": 0.7277454137802124, "learning_rate": 0.0002, "epoch": 3.1527093596059115, "step": 2880}, {"loss": 0.6423, "grad_norm": 0.762026846408844, "learning_rate": 0.0002, "epoch": 3.163656267104543, "step": 2890}, {"loss": 0.5332, "grad_norm": 0.6471221446990967, "learning_rate": 0.0002, "epoch": 3.1746031746031744, "step": 2900}, {"loss": 0.7981, "grad_norm": 0.6018978357315063, "learning_rate": 0.0002, "epoch": 3.1855500821018063, "step": 2910}, {"loss": 0.7274, "grad_norm": 0.8607320785522461, "learning_rate": 0.0002, "epoch": 3.196496989600438, "step": 2920}, {"loss": 0.6139, "grad_norm": 0.8854126334190369, "learning_rate": 0.0002, "epoch": 3.2074438970990693, "step": 2930}, {"loss": 0.6485, "grad_norm": 0.6620870232582092, "learning_rate": 0.0002, "epoch": 3.218390804597701, "step": 2940}, {"loss": 0.6969, "grad_norm": 0.7377511858940125, "learning_rate": 0.0002, "epoch": 3.2293377120963327, "step": 2950}, {"loss": 0.6798, "grad_norm": 0.7803301811218262, "learning_rate": 0.0002, "epoch": 3.2402846195949646, "step": 2960}, {"loss": 0.6697, "grad_norm": 0.834061861038208, "learning_rate": 0.0002, "epoch": 3.251231527093596, "step": 2970}, {"loss": 0.6894, "grad_norm": 0.8496041893959045, "learning_rate": 0.0002, "epoch": 3.2621784345922276, "step": 2980}, {"loss": 0.6591, "grad_norm": 0.7967984676361084, "learning_rate": 0.0002, "epoch": 3.2731253420908595, "step": 2990}, {"loss": 0.7266, "grad_norm": 1.0207016468048096, "learning_rate": 0.0002, "epoch": 3.284072249589491, "step": 3000}, {"loss": 0.6586, "grad_norm": 0.7855771780014038, "learning_rate": 0.0002, "epoch": 3.2950191570881224, "step": 3010}, {"loss": 0.5711, "grad_norm": 0.9427546858787537, "learning_rate": 0.0002, "epoch": 3.3059660645867543, "step": 3020}, {"loss": 0.6277, "grad_norm": 0.823542594909668, "learning_rate": 0.0002, "epoch": 3.316912972085386, "step": 3030}, {"loss": 0.7109, "grad_norm": 0.9826635122299194, "learning_rate": 0.0002, "epoch": 3.3278598795840173, "step": 3040}, {"loss": 0.6564, "grad_norm": 0.7259827852249146, "learning_rate": 0.0002, "epoch": 3.338806787082649, "step": 3050}, {"loss": 0.653, "grad_norm": 0.7774739861488342, "learning_rate": 0.0002, "epoch": 3.3497536945812807, "step": 3060}, {"loss": 0.7529, "grad_norm": 0.7394293546676636, "learning_rate": 0.0002, "epoch": 3.3607006020799126, "step": 3070}, {"loss": 0.5987, "grad_norm": 0.9017578959465027, "learning_rate": 0.0002, "epoch": 3.371647509578544, "step": 3080}, {"loss": 0.6953, "grad_norm": 0.7451054453849792, "learning_rate": 0.0002, "epoch": 3.3825944170771756, "step": 3090}, {"loss": 0.6759, "grad_norm": 0.7321506142616272, "learning_rate": 0.0002, "epoch": 3.3935413245758075, "step": 3100}, {"loss": 0.6555, "grad_norm": 0.6721828579902649, "learning_rate": 0.0002, "epoch": 3.404488232074439, "step": 3110}, {"loss": 0.6559, "grad_norm": 0.774022102355957, "learning_rate": 0.0002, "epoch": 3.4154351395730704, "step": 3120}, {"loss": 0.7449, "grad_norm": 0.9143537282943726, "learning_rate": 0.0002, "epoch": 3.4263820470717024, "step": 3130}, {"loss": 0.6899, "grad_norm": 1.226087212562561, "learning_rate": 0.0002, "epoch": 3.437328954570334, "step": 3140}, {"loss": 0.6719, "grad_norm": 0.7545496225357056, "learning_rate": 0.0002, "epoch": 3.4482758620689653, "step": 3150}, {"loss": 0.6153, "grad_norm": 0.6515635848045349, "learning_rate": 0.0002, "epoch": 3.4592227695675972, "step": 3160}, {"loss": 0.6926, "grad_norm": 0.9297090172767639, "learning_rate": 0.0002, "epoch": 3.4701696770662287, "step": 3170}, {"loss": 0.6071, "grad_norm": 1.0130730867385864, "learning_rate": 0.0002, "epoch": 3.4811165845648606, "step": 3180}, {"loss": 0.5959, "grad_norm": 0.7654589414596558, "learning_rate": 0.0002, "epoch": 3.492063492063492, "step": 3190}, {"loss": 0.7401, "grad_norm": 0.9954977631568909, "learning_rate": 0.0002, "epoch": 3.5030103995621236, "step": 3200}, {"loss": 0.6661, "grad_norm": 0.6027487516403198, "learning_rate": 0.0002, "epoch": 3.5139573070607555, "step": 3210}, {"loss": 0.6963, "grad_norm": 0.741770327091217, "learning_rate": 0.0002, "epoch": 3.524904214559387, "step": 3220}, {"loss": 0.8112, "grad_norm": 1.0534909963607788, "learning_rate": 0.0002, "epoch": 3.535851122058019, "step": 3230}, {"loss": 0.6813, "grad_norm": 0.937772274017334, "learning_rate": 0.0002, "epoch": 3.5467980295566504, "step": 3240}, {"loss": 0.6681, "grad_norm": 0.8504213690757751, "learning_rate": 0.0002, "epoch": 3.557744937055282, "step": 3250}, {"loss": 0.6436, "grad_norm": 0.7755007147789001, "learning_rate": 0.0002, "epoch": 3.5686918445539133, "step": 3260}, {"loss": 0.6213, "grad_norm": 1.0193358659744263, "learning_rate": 0.0002, "epoch": 3.5796387520525452, "step": 3270}, {"loss": 0.671, "grad_norm": 0.8440536856651306, "learning_rate": 0.0002, "epoch": 3.5905856595511767, "step": 3280}, {"loss": 0.6859, "grad_norm": 0.6195939183235168, "learning_rate": 0.0002, "epoch": 3.6015325670498086, "step": 3290}, {"loss": 0.7446, "grad_norm": 0.8608590960502625, "learning_rate": 0.0002, "epoch": 3.61247947454844, "step": 3300}, {"loss": 0.7301, "grad_norm": 0.6772327423095703, "learning_rate": 0.0002, "epoch": 3.6234263820470716, "step": 3310}, {"loss": 0.6298, "grad_norm": 0.8031839728355408, "learning_rate": 0.0002, "epoch": 3.6343732895457035, "step": 3320}, {"loss": 0.7041, "grad_norm": 0.6080502271652222, "learning_rate": 0.0002, "epoch": 3.645320197044335, "step": 3330}, {"loss": 0.7431, "grad_norm": 0.8007240891456604, "learning_rate": 0.0002, "epoch": 3.656267104542967, "step": 3340}, {"loss": 0.7446, "grad_norm": 0.8060704469680786, "learning_rate": 0.0002, "epoch": 3.6672140120415984, "step": 3350}, {"loss": 0.6304, "grad_norm": 0.7547586560249329, "learning_rate": 0.0002, "epoch": 3.67816091954023, "step": 3360}, {"loss": 0.7066, "grad_norm": 0.686851978302002, "learning_rate": 0.0002, "epoch": 3.6891078270388613, "step": 3370}, {"loss": 0.6748, "grad_norm": 0.9429075717926025, "learning_rate": 0.0002, "epoch": 3.7000547345374932, "step": 3380}, {"loss": 0.6673, "grad_norm": 0.7283591032028198, "learning_rate": 0.0002, "epoch": 3.7110016420361247, "step": 3390}, {"loss": 0.7502, "grad_norm": 0.8323085904121399, "learning_rate": 0.0002, "epoch": 3.7219485495347566, "step": 3400}, {"loss": 0.7779, "grad_norm": 0.8529590964317322, "learning_rate": 0.0002, "epoch": 3.732895457033388, "step": 3410}, {"loss": 0.6555, "grad_norm": 0.731752872467041, "learning_rate": 0.0002, "epoch": 3.7438423645320196, "step": 3420}, {"loss": 0.6928, "grad_norm": 0.8572278618812561, "learning_rate": 0.0002, "epoch": 3.7547892720306515, "step": 3430}, {"loss": 0.6215, "grad_norm": 0.7408691048622131, "learning_rate": 0.0002, "epoch": 3.765736179529283, "step": 3440}, {"loss": 0.622, "grad_norm": 0.7470445036888123, "learning_rate": 0.0002, "epoch": 3.776683087027915, "step": 3450}, {"loss": 0.7241, "grad_norm": 0.6806244254112244, "learning_rate": 0.0002, "epoch": 3.7876299945265464, "step": 3460}, {"loss": 0.7739, "grad_norm": 0.9129069447517395, "learning_rate": 0.0002, "epoch": 3.798576902025178, "step": 3470}, {"loss": 0.6826, "grad_norm": 0.8717501759529114, "learning_rate": 0.0002, "epoch": 3.8095238095238093, "step": 3480}, {"loss": 0.6188, "grad_norm": 0.6761979460716248, "learning_rate": 0.0002, "epoch": 3.8204707170224412, "step": 3490}, {"loss": 0.7601, "grad_norm": 1.0054380893707275, "learning_rate": 0.0002, "epoch": 3.8314176245210727, "step": 3500}, {"loss": 0.623, "grad_norm": 1.1224009990692139, "learning_rate": 0.0002, "epoch": 3.8423645320197046, "step": 3510}, {"loss": 0.6918, "grad_norm": 0.8997692465782166, "learning_rate": 0.0002, "epoch": 3.853311439518336, "step": 3520}, {"loss": 0.6357, "grad_norm": 1.0086902379989624, "learning_rate": 0.0002, "epoch": 3.8642583470169676, "step": 3530}, {"loss": 0.6379, "grad_norm": 0.772739589214325, "learning_rate": 0.0002, "epoch": 3.8752052545155995, "step": 3540}, {"loss": 0.7423, "grad_norm": 1.211774230003357, "learning_rate": 0.0002, "epoch": 3.886152162014231, "step": 3550}, {"loss": 0.7321, "grad_norm": 0.9572356939315796, "learning_rate": 0.0002, "epoch": 3.897099069512863, "step": 3560}, {"loss": 0.6836, "grad_norm": 0.7887842655181885, "learning_rate": 0.0002, "epoch": 3.9080459770114944, "step": 3570}, {"loss": 0.7576, "grad_norm": 0.7308389544487, "learning_rate": 0.0002, "epoch": 3.918992884510126, "step": 3580}, {"loss": 0.6001, "grad_norm": 1.0182650089263916, "learning_rate": 0.0002, "epoch": 3.9299397920087573, "step": 3590}, {"loss": 0.6942, "grad_norm": 0.8000147342681885, "learning_rate": 0.0002, "epoch": 3.9408866995073892, "step": 3600}, {"loss": 0.6244, "grad_norm": 0.7385728359222412, "learning_rate": 0.0002, "epoch": 3.9518336070060207, "step": 3610}, {"loss": 0.6718, "grad_norm": 0.9233261942863464, "learning_rate": 0.0002, "epoch": 3.9627805145046526, "step": 3620}, {"loss": 0.6508, "grad_norm": 0.8486751914024353, "learning_rate": 0.0002, "epoch": 3.973727422003284, "step": 3630}, {"loss": 0.6928, "grad_norm": 0.7593663334846497, "learning_rate": 0.0002, "epoch": 3.9846743295019156, "step": 3640}, {"loss": 0.6847, "grad_norm": 0.7885415554046631, "learning_rate": 0.0002, "epoch": 3.9956212370005475, "step": 3650}]} +{"epoch": 4.999452654625069, "step": 4567, "epoch_duration": 1358.2625460624695, "total_accumulated_duration": 6811.380892276764, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28746.0}, "peak_memory_reserved": {"GPU_0": 28746.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4039, "grad_norm": 0.7611560821533203, "learning_rate": 0.0002, "epoch": 0.010946907498631636, "step": 10}, {"loss": 1.8818, "grad_norm": 0.4633193612098694, "learning_rate": 0.0002, "epoch": 0.021893814997263273, "step": 20}, {"loss": 1.5927, "grad_norm": 0.49326154589653015, "learning_rate": 0.0002, "epoch": 0.03284072249589491, "step": 30}, {"loss": 1.3859, "grad_norm": 0.3943138122558594, "learning_rate": 0.0002, "epoch": 0.043787629994526546, "step": 40}, {"loss": 1.3036, "grad_norm": 0.43292930722236633, "learning_rate": 0.0002, "epoch": 0.05473453749315818, "step": 50}, {"loss": 1.2427, "grad_norm": 0.3431817591190338, "learning_rate": 0.0002, "epoch": 0.06568144499178982, "step": 60}, {"loss": 1.3076, "grad_norm": 0.38011446595191956, "learning_rate": 0.0002, "epoch": 0.07662835249042145, "step": 70}, {"loss": 1.1641, "grad_norm": 0.7441071271896362, "learning_rate": 0.0002, "epoch": 0.08757525998905309, "step": 80}, {"loss": 1.1721, "grad_norm": 0.3359833061695099, "learning_rate": 0.0002, "epoch": 0.09852216748768473, "step": 90}, {"loss": 1.2774, "grad_norm": 0.3724392354488373, "learning_rate": 0.0002, "epoch": 0.10946907498631636, "step": 100}, {"loss": 1.216, "grad_norm": 0.40673762559890747, "learning_rate": 0.0002, "epoch": 0.120415982484948, "step": 110}, {"loss": 1.2611, "grad_norm": 0.40036800503730774, "learning_rate": 0.0002, "epoch": 0.13136288998357964, "step": 120}, {"loss": 1.2436, "grad_norm": 2.844191312789917, "learning_rate": 0.0002, "epoch": 0.1423097974822113, "step": 130}, {"loss": 1.2254, "grad_norm": 0.3104734420776367, "learning_rate": 0.0002, "epoch": 0.1532567049808429, "step": 140}, {"loss": 1.0498, "grad_norm": 0.3266797959804535, "learning_rate": 0.0002, "epoch": 0.16420361247947454, "step": 150}, {"loss": 1.1644, "grad_norm": 0.3079199194908142, "learning_rate": 0.0002, "epoch": 0.17515051997810618, "step": 160}, {"loss": 1.2923, "grad_norm": 0.3872479498386383, "learning_rate": 0.0002, "epoch": 0.18609742747673783, "step": 170}, {"loss": 1.1809, "grad_norm": 0.38654500246047974, "learning_rate": 0.0002, "epoch": 0.19704433497536947, "step": 180}, {"loss": 1.066, "grad_norm": 0.2913552522659302, "learning_rate": 0.0002, "epoch": 0.20799124247400108, "step": 190}, {"loss": 1.0868, "grad_norm": 0.2960572838783264, "learning_rate": 0.0002, "epoch": 0.21893814997263272, "step": 200}, {"loss": 1.136, "grad_norm": 0.5175501108169556, "learning_rate": 0.0002, "epoch": 0.22988505747126436, "step": 210}, {"loss": 1.1556, "grad_norm": 1.2921574115753174, "learning_rate": 0.0002, "epoch": 0.240831964969896, "step": 220}, {"loss": 1.1488, "grad_norm": 0.2675512135028839, "learning_rate": 0.0002, "epoch": 0.25177887246852765, "step": 230}, {"loss": 1.2764, "grad_norm": 0.3956190049648285, "learning_rate": 0.0002, "epoch": 0.2627257799671593, "step": 240}, {"loss": 1.1889, "grad_norm": 0.6022581458091736, "learning_rate": 0.0002, "epoch": 0.27367268746579093, "step": 250}, {"loss": 1.1981, "grad_norm": 1.1949563026428223, "learning_rate": 0.0002, "epoch": 0.2846195949644226, "step": 260}, {"loss": 1.1877, "grad_norm": 0.31173548102378845, "learning_rate": 0.0002, "epoch": 0.2955665024630542, "step": 270}, {"loss": 1.06, "grad_norm": 0.2808472812175751, "learning_rate": 0.0002, "epoch": 0.3065134099616858, "step": 280}, {"loss": 1.1752, "grad_norm": 0.3042023777961731, "learning_rate": 0.0002, "epoch": 0.31746031746031744, "step": 290}, {"loss": 1.3147, "grad_norm": 0.39915043115615845, "learning_rate": 0.0002, "epoch": 0.3284072249589491, "step": 300}, {"loss": 1.2425, "grad_norm": 0.39118197560310364, "learning_rate": 0.0002, "epoch": 0.3393541324575807, "step": 310}, {"loss": 1.1363, "grad_norm": 0.355010986328125, "learning_rate": 0.0002, "epoch": 0.35030103995621237, "step": 320}, {"loss": 1.1925, "grad_norm": 0.29734086990356445, "learning_rate": 0.0002, "epoch": 0.361247947454844, "step": 330}, {"loss": 1.1974, "grad_norm": 0.346096009016037, "learning_rate": 0.0002, "epoch": 0.37219485495347565, "step": 340}, {"loss": 1.1641, "grad_norm": 0.4829643666744232, "learning_rate": 0.0002, "epoch": 0.3831417624521073, "step": 350}, {"loss": 1.2808, "grad_norm": 0.4726872742176056, "learning_rate": 0.0002, "epoch": 0.39408866995073893, "step": 360}, {"loss": 1.1532, "grad_norm": 0.3130153715610504, "learning_rate": 0.0002, "epoch": 0.4050355774493706, "step": 370}, {"loss": 1.1842, "grad_norm": 0.5123590230941772, "learning_rate": 0.0002, "epoch": 0.41598248494800216, "step": 380}, {"loss": 1.1539, "grad_norm": 0.3444574773311615, "learning_rate": 0.0002, "epoch": 0.4269293924466338, "step": 390}, {"loss": 1.1756, "grad_norm": 0.5302175283432007, "learning_rate": 0.0002, "epoch": 0.43787629994526545, "step": 400}, {"loss": 1.1138, "grad_norm": 0.2713572680950165, "learning_rate": 0.0002, "epoch": 0.4488232074438971, "step": 410}, {"loss": 1.1281, "grad_norm": 0.29530611634254456, "learning_rate": 0.0002, "epoch": 0.45977011494252873, "step": 420}, {"loss": 1.1721, "grad_norm": 0.27282455563545227, "learning_rate": 0.0002, "epoch": 0.47071702244116037, "step": 430}, {"loss": 1.1213, "grad_norm": 0.2647949755191803, "learning_rate": 0.0002, "epoch": 0.481663929939792, "step": 440}, {"loss": 1.1656, "grad_norm": 0.35509347915649414, "learning_rate": 0.0002, "epoch": 0.49261083743842365, "step": 450}, {"loss": 1.1251, "grad_norm": 0.1959609091281891, "learning_rate": 0.0002, "epoch": 0.5035577449370553, "step": 460}, {"loss": 1.1889, "grad_norm": 0.40090155601501465, "learning_rate": 0.0002, "epoch": 0.5145046524356869, "step": 470}, {"loss": 1.156, "grad_norm": 0.3354604244232178, "learning_rate": 0.0002, "epoch": 0.5254515599343186, "step": 480}, {"loss": 1.1024, "grad_norm": 0.2758506238460541, "learning_rate": 0.0002, "epoch": 0.5363984674329502, "step": 490}, {"loss": 1.1108, "grad_norm": 0.3629051744937897, "learning_rate": 0.0002, "epoch": 0.5473453749315819, "step": 500}, {"loss": 1.2236, "grad_norm": 0.30802229046821594, "learning_rate": 0.0002, "epoch": 0.5582922824302134, "step": 510}, {"loss": 1.0424, "grad_norm": 0.3099463880062103, "learning_rate": 0.0002, "epoch": 0.5692391899288451, "step": 520}, {"loss": 1.255, "grad_norm": 0.42299067974090576, "learning_rate": 0.0002, "epoch": 0.5801860974274767, "step": 530}, {"loss": 1.1698, "grad_norm": 0.5392252802848816, "learning_rate": 0.0002, "epoch": 0.5911330049261084, "step": 540}, {"loss": 1.1171, "grad_norm": 0.34768250584602356, "learning_rate": 0.0002, "epoch": 0.60207991242474, "step": 550}, {"loss": 1.102, "grad_norm": 0.28490015864372253, "learning_rate": 0.0002, "epoch": 0.6130268199233716, "step": 560}, {"loss": 1.1111, "grad_norm": 0.34787994623184204, "learning_rate": 0.0002, "epoch": 0.6239737274220033, "step": 570}, {"loss": 1.0759, "grad_norm": 0.29058772325515747, "learning_rate": 0.0002, "epoch": 0.6349206349206349, "step": 580}, {"loss": 1.1157, "grad_norm": 0.4063778817653656, "learning_rate": 0.0002, "epoch": 0.6458675424192666, "step": 590}, {"loss": 1.1432, "grad_norm": 0.9244267344474792, "learning_rate": 0.0002, "epoch": 0.6568144499178982, "step": 600}, {"loss": 1.0591, "grad_norm": 0.27605190873146057, "learning_rate": 0.0002, "epoch": 0.6677613574165299, "step": 610}, {"loss": 1.2123, "grad_norm": 0.34346821904182434, "learning_rate": 0.0002, "epoch": 0.6787082649151615, "step": 620}, {"loss": 1.2195, "grad_norm": 0.3093279302120209, "learning_rate": 0.0002, "epoch": 0.6896551724137931, "step": 630}, {"loss": 1.2461, "grad_norm": 1.0069009065628052, "learning_rate": 0.0002, "epoch": 0.7006020799124247, "step": 640}, {"loss": 1.0892, "grad_norm": 0.5049130916595459, "learning_rate": 0.0002, "epoch": 0.7115489874110563, "step": 650}, {"loss": 1.1404, "grad_norm": 0.3748924732208252, "learning_rate": 0.0002, "epoch": 0.722495894909688, "step": 660}, {"loss": 1.1062, "grad_norm": 0.2964959144592285, "learning_rate": 0.0002, "epoch": 0.7334428024083196, "step": 670}, {"loss": 1.2617, "grad_norm": 0.4599986970424652, "learning_rate": 0.0002, "epoch": 0.7443897099069513, "step": 680}, {"loss": 1.088, "grad_norm": 0.27292951941490173, "learning_rate": 0.0002, "epoch": 0.7553366174055829, "step": 690}, {"loss": 1.2047, "grad_norm": 0.3123566806316376, "learning_rate": 0.0002, "epoch": 0.7662835249042146, "step": 700}, {"loss": 1.0021, "grad_norm": 0.28310710191726685, "learning_rate": 0.0002, "epoch": 0.7772304324028462, "step": 710}, {"loss": 1.2281, "grad_norm": 0.3279992341995239, "learning_rate": 0.0002, "epoch": 0.7881773399014779, "step": 720}, {"loss": 1.25, "grad_norm": 0.28179168701171875, "learning_rate": 0.0002, "epoch": 0.7991242474001095, "step": 730}, {"loss": 1.0602, "grad_norm": 0.31492987275123596, "learning_rate": 0.0002, "epoch": 0.8100711548987412, "step": 740}, {"loss": 1.2518, "grad_norm": 0.41821011900901794, "learning_rate": 0.0002, "epoch": 0.8210180623973727, "step": 750}, {"loss": 1.1612, "grad_norm": 0.325235515832901, "learning_rate": 0.0002, "epoch": 0.8319649698960043, "step": 760}, {"loss": 1.27, "grad_norm": 0.5366070866584778, "learning_rate": 0.0002, "epoch": 0.842911877394636, "step": 770}, {"loss": 1.0921, "grad_norm": 0.32570579648017883, "learning_rate": 0.0002, "epoch": 0.8538587848932676, "step": 780}, {"loss": 1.1032, "grad_norm": 0.3642968237400055, "learning_rate": 0.0002, "epoch": 0.8648056923918993, "step": 790}, {"loss": 1.1234, "grad_norm": 0.29713448882102966, "learning_rate": 0.0002, "epoch": 0.8757525998905309, "step": 800}, {"loss": 1.0978, "grad_norm": 0.23599444329738617, "learning_rate": 0.0002, "epoch": 0.8866995073891626, "step": 810}, {"loss": 1.1867, "grad_norm": 0.31522464752197266, "learning_rate": 0.0002, "epoch": 0.8976464148877942, "step": 820}, {"loss": 1.0208, "grad_norm": 0.32754790782928467, "learning_rate": 0.0002, "epoch": 0.9085933223864259, "step": 830}, {"loss": 0.9786, "grad_norm": 0.22741089761257172, "learning_rate": 0.0002, "epoch": 0.9195402298850575, "step": 840}, {"loss": 1.0689, "grad_norm": 0.3089679777622223, "learning_rate": 0.0002, "epoch": 0.9304871373836892, "step": 850}, {"loss": 1.0354, "grad_norm": 0.27440521121025085, "learning_rate": 0.0002, "epoch": 0.9414340448823207, "step": 860}, {"loss": 1.0417, "grad_norm": 0.3498363792896271, "learning_rate": 0.0002, "epoch": 0.9523809523809523, "step": 870}, {"loss": 1.269, "grad_norm": 0.47151854634284973, "learning_rate": 0.0002, "epoch": 0.963327859879584, "step": 880}, {"loss": 1.1174, "grad_norm": 0.24756591022014618, "learning_rate": 0.0002, "epoch": 0.9742747673782156, "step": 890}, {"loss": 1.0622, "grad_norm": 0.2600938677787781, "learning_rate": 0.0002, "epoch": 0.9852216748768473, "step": 900}, {"loss": 1.0968, "grad_norm": 0.2934586703777313, "learning_rate": 0.0002, "epoch": 0.9961685823754789, "step": 910}, {"eval_loss": 1.158464789390564, "eval_runtime": 46.0774, "eval_samples_per_second": 9.462, "eval_steps_per_second": 1.194, "epoch": 0.9994526546250684, "step": 913}, {"loss": 1.0322, "grad_norm": 0.37776654958724976, "learning_rate": 0.0002, "epoch": 1.0071154898741106, "step": 920}, {"loss": 1.1346, "grad_norm": 0.31784629821777344, "learning_rate": 0.0002, "epoch": 1.0180623973727423, "step": 930}, {"loss": 0.8976, "grad_norm": 0.24244336783885956, "learning_rate": 0.0002, "epoch": 1.0290093048713738, "step": 940}, {"loss": 1.0172, "grad_norm": 0.3185454308986664, "learning_rate": 0.0002, "epoch": 1.0399562123700055, "step": 950}, {"loss": 1.1645, "grad_norm": 0.3589441478252411, "learning_rate": 0.0002, "epoch": 1.0509031198686372, "step": 960}, {"loss": 1.0317, "grad_norm": 0.38593578338623047, "learning_rate": 0.0002, "epoch": 1.0618500273672686, "step": 970}, {"loss": 0.9913, "grad_norm": 0.39694955945014954, "learning_rate": 0.0002, "epoch": 1.0727969348659003, "step": 980}, {"loss": 1.0893, "grad_norm": 0.469817191362381, "learning_rate": 0.0002, "epoch": 1.083743842364532, "step": 990}, {"loss": 0.983, "grad_norm": 0.2634755074977875, "learning_rate": 0.0002, "epoch": 1.0946907498631637, "step": 1000}, {"loss": 1.0144, "grad_norm": 0.43189436197280884, "learning_rate": 0.0002, "epoch": 1.1056376573617952, "step": 1010}, {"loss": 0.9663, "grad_norm": 0.5559977889060974, "learning_rate": 0.0002, "epoch": 1.116584564860427, "step": 1020}, {"loss": 1.0481, "grad_norm": 0.32100191712379456, "learning_rate": 0.0002, "epoch": 1.1275314723590586, "step": 1030}, {"loss": 1.1012, "grad_norm": 0.40179768204689026, "learning_rate": 0.0002, "epoch": 1.1384783798576903, "step": 1040}, {"loss": 1.0029, "grad_norm": 0.3659493029117584, "learning_rate": 0.0002, "epoch": 1.1494252873563218, "step": 1050}, {"loss": 0.9597, "grad_norm": 0.701704204082489, "learning_rate": 0.0002, "epoch": 1.1603721948549535, "step": 1060}, {"loss": 1.0204, "grad_norm": 0.3650563359260559, "learning_rate": 0.0002, "epoch": 1.1713191023535852, "step": 1070}, {"loss": 0.907, "grad_norm": 0.3191976249217987, "learning_rate": 0.0002, "epoch": 1.1822660098522166, "step": 1080}, {"loss": 1.0648, "grad_norm": 0.3615441918373108, "learning_rate": 0.0002, "epoch": 1.1932129173508483, "step": 1090}, {"loss": 1.0067, "grad_norm": 0.39474231004714966, "learning_rate": 0.0002, "epoch": 1.20415982484948, "step": 1100}, {"loss": 0.9852, "grad_norm": 0.3752822279930115, "learning_rate": 0.0002, "epoch": 1.2151067323481117, "step": 1110}, {"loss": 1.0067, "grad_norm": 0.4165991246700287, "learning_rate": 0.0002, "epoch": 1.2260536398467432, "step": 1120}, {"loss": 1.0244, "grad_norm": 0.5326506495475769, "learning_rate": 0.0002, "epoch": 1.237000547345375, "step": 1130}, {"loss": 1.0542, "grad_norm": 0.48845794796943665, "learning_rate": 0.0002, "epoch": 1.2479474548440066, "step": 1140}, {"loss": 1.0885, "grad_norm": 0.29910150170326233, "learning_rate": 0.0002, "epoch": 1.2588943623426383, "step": 1150}, {"loss": 1.2233, "grad_norm": 0.5069725513458252, "learning_rate": 0.0002, "epoch": 1.2698412698412698, "step": 1160}, {"loss": 1.0992, "grad_norm": 0.29500406980514526, "learning_rate": 0.0002, "epoch": 1.2807881773399015, "step": 1170}, {"loss": 1.0291, "grad_norm": 0.4711538851261139, "learning_rate": 0.0002, "epoch": 1.2917350848385332, "step": 1180}, {"loss": 1.0513, "grad_norm": 0.4203340709209442, "learning_rate": 0.0002, "epoch": 1.3026819923371646, "step": 1190}, {"loss": 1.1274, "grad_norm": 0.36101874709129333, "learning_rate": 0.0002, "epoch": 1.3136288998357963, "step": 1200}, {"loss": 1.085, "grad_norm": 0.4608800411224365, "learning_rate": 0.0002, "epoch": 1.324575807334428, "step": 1210}, {"loss": 1.0695, "grad_norm": 0.6570906639099121, "learning_rate": 0.0002, "epoch": 1.3355227148330597, "step": 1220}, {"loss": 0.9786, "grad_norm": 0.5352164506912231, "learning_rate": 0.0002, "epoch": 1.3464696223316914, "step": 1230}, {"loss": 1.0513, "grad_norm": 0.3885001242160797, "learning_rate": 0.0002, "epoch": 1.357416529830323, "step": 1240}, {"loss": 1.0611, "grad_norm": 0.2987913489341736, "learning_rate": 0.0002, "epoch": 1.3683634373289546, "step": 1250}, {"loss": 1.109, "grad_norm": 0.42070427536964417, "learning_rate": 0.0002, "epoch": 1.3793103448275863, "step": 1260}, {"loss": 1.1092, "grad_norm": 0.5957782864570618, "learning_rate": 0.0002, "epoch": 1.3902572523262178, "step": 1270}, {"loss": 1.0554, "grad_norm": 0.32898882031440735, "learning_rate": 0.0002, "epoch": 1.4012041598248495, "step": 1280}, {"loss": 0.909, "grad_norm": 0.27624452114105225, "learning_rate": 0.0002, "epoch": 1.4121510673234812, "step": 1290}, {"loss": 1.024, "grad_norm": 0.49570828676223755, "learning_rate": 0.0002, "epoch": 1.4230979748221126, "step": 1300}, {"loss": 1.0471, "grad_norm": 0.26191383600234985, "learning_rate": 0.0002, "epoch": 1.4340448823207443, "step": 1310}, {"loss": 1.0788, "grad_norm": 0.35664042830467224, "learning_rate": 0.0002, "epoch": 1.444991789819376, "step": 1320}, {"loss": 1.0108, "grad_norm": 0.45126354694366455, "learning_rate": 0.0002, "epoch": 1.4559386973180077, "step": 1330}, {"loss": 0.9571, "grad_norm": 0.37318357825279236, "learning_rate": 0.0002, "epoch": 1.4668856048166394, "step": 1340}, {"loss": 1.0507, "grad_norm": 0.6428970098495483, "learning_rate": 0.0002, "epoch": 1.477832512315271, "step": 1350}, {"loss": 1.0367, "grad_norm": 0.43256187438964844, "learning_rate": 0.0002, "epoch": 1.4887794198139026, "step": 1360}, {"loss": 1.1321, "grad_norm": 0.5343793630599976, "learning_rate": 0.0002, "epoch": 1.4997263273125343, "step": 1370}, {"loss": 1.1054, "grad_norm": 0.315437376499176, "learning_rate": 0.0002, "epoch": 1.5106732348111658, "step": 1380}, {"loss": 0.8916, "grad_norm": 0.41561153531074524, "learning_rate": 0.0002, "epoch": 1.5216201423097975, "step": 1390}, {"loss": 1.0391, "grad_norm": 0.3201070725917816, "learning_rate": 0.0002, "epoch": 1.5325670498084292, "step": 1400}, {"loss": 1.14, "grad_norm": 0.505537211894989, "learning_rate": 0.0002, "epoch": 1.5435139573070606, "step": 1410}, {"loss": 1.0775, "grad_norm": 0.3747410178184509, "learning_rate": 0.0002, "epoch": 1.5544608648056923, "step": 1420}, {"loss": 1.1171, "grad_norm": 0.49385908246040344, "learning_rate": 0.0002, "epoch": 1.565407772304324, "step": 1430}, {"loss": 1.1182, "grad_norm": 0.49831628799438477, "learning_rate": 0.0002, "epoch": 1.5763546798029555, "step": 1440}, {"loss": 1.0079, "grad_norm": 0.372127890586853, "learning_rate": 0.0002, "epoch": 1.5873015873015874, "step": 1450}, {"loss": 0.9931, "grad_norm": 0.40070840716362, "learning_rate": 0.0002, "epoch": 1.598248494800219, "step": 1460}, {"loss": 0.8954, "grad_norm": 0.34907400608062744, "learning_rate": 0.0002, "epoch": 1.6091954022988506, "step": 1470}, {"loss": 0.9743, "grad_norm": 0.4632418751716614, "learning_rate": 0.0002, "epoch": 1.6201423097974823, "step": 1480}, {"loss": 1.0103, "grad_norm": 0.40164515376091003, "learning_rate": 0.0002, "epoch": 1.6310892172961138, "step": 1490}, {"loss": 0.9523, "grad_norm": 0.3214994966983795, "learning_rate": 0.0002, "epoch": 1.6420361247947455, "step": 1500}, {"loss": 1.0161, "grad_norm": 0.3727897107601166, "learning_rate": 0.0002, "epoch": 1.6529830322933772, "step": 1510}, {"loss": 1.0443, "grad_norm": 0.3817640542984009, "learning_rate": 0.0002, "epoch": 1.6639299397920086, "step": 1520}, {"loss": 1.0511, "grad_norm": 0.5592136979103088, "learning_rate": 0.0002, "epoch": 1.6748768472906403, "step": 1530}, {"loss": 0.9682, "grad_norm": 0.44636598229408264, "learning_rate": 0.0002, "epoch": 1.685823754789272, "step": 1540}, {"loss": 1.033, "grad_norm": 0.40441709756851196, "learning_rate": 0.0002, "epoch": 1.6967706622879035, "step": 1550}, {"loss": 1.0857, "grad_norm": 0.3243522644042969, "learning_rate": 0.0002, "epoch": 1.7077175697865354, "step": 1560}, {"loss": 0.9482, "grad_norm": 0.34277570247650146, "learning_rate": 0.0002, "epoch": 1.718664477285167, "step": 1570}, {"loss": 0.974, "grad_norm": 0.3279995024204254, "learning_rate": 0.0002, "epoch": 1.7296113847837986, "step": 1580}, {"loss": 0.9414, "grad_norm": 0.41968777775764465, "learning_rate": 0.0002, "epoch": 1.7405582922824303, "step": 1590}, {"loss": 0.9768, "grad_norm": 0.39464613795280457, "learning_rate": 0.0002, "epoch": 1.7515051997810618, "step": 1600}, {"loss": 1.0347, "grad_norm": 0.3839009404182434, "learning_rate": 0.0002, "epoch": 1.7624521072796935, "step": 1610}, {"loss": 0.9195, "grad_norm": 0.3250715434551239, "learning_rate": 0.0002, "epoch": 1.7733990147783252, "step": 1620}, {"loss": 1.0049, "grad_norm": 0.5166561007499695, "learning_rate": 0.0002, "epoch": 1.7843459222769567, "step": 1630}, {"loss": 1.0364, "grad_norm": 0.4115183353424072, "learning_rate": 0.0002, "epoch": 1.7952928297755884, "step": 1640}, {"loss": 1.0248, "grad_norm": 0.373780220746994, "learning_rate": 0.0002, "epoch": 1.80623973727422, "step": 1650}, {"loss": 1.0984, "grad_norm": 0.49697014689445496, "learning_rate": 0.0002, "epoch": 1.8171866447728515, "step": 1660}, {"loss": 1.0089, "grad_norm": 1.0308938026428223, "learning_rate": 0.0002, "epoch": 1.8281335522714834, "step": 1670}, {"loss": 1.0853, "grad_norm": 0.4851366877555847, "learning_rate": 0.0002, "epoch": 1.839080459770115, "step": 1680}, {"loss": 0.9533, "grad_norm": 0.3262481391429901, "learning_rate": 0.0002, "epoch": 1.8500273672687466, "step": 1690}, {"loss": 1.048, "grad_norm": 0.6904496550559998, "learning_rate": 0.0002, "epoch": 1.8609742747673783, "step": 1700}, {"loss": 1.0577, "grad_norm": 0.49789851903915405, "learning_rate": 0.0002, "epoch": 1.8719211822660098, "step": 1710}, {"loss": 1.0258, "grad_norm": 0.3035794198513031, "learning_rate": 0.0002, "epoch": 1.8828680897646415, "step": 1720}, {"loss": 0.9916, "grad_norm": 0.4588414430618286, "learning_rate": 0.0002, "epoch": 1.8938149972632732, "step": 1730}, {"loss": 0.9526, "grad_norm": 0.4313034117221832, "learning_rate": 0.0002, "epoch": 1.9047619047619047, "step": 1740}, {"loss": 1.0857, "grad_norm": 0.38562044501304626, "learning_rate": 0.0002, "epoch": 1.9157088122605364, "step": 1750}, {"loss": 0.9763, "grad_norm": 0.46947410702705383, "learning_rate": 0.0002, "epoch": 1.926655719759168, "step": 1760}, {"loss": 0.9967, "grad_norm": 0.3848404884338379, "learning_rate": 0.0002, "epoch": 1.9376026272577995, "step": 1770}, {"loss": 1.0474, "grad_norm": 0.30422744154930115, "learning_rate": 0.0002, "epoch": 1.9485495347564314, "step": 1780}, {"loss": 1.022, "grad_norm": 0.41100990772247314, "learning_rate": 0.0002, "epoch": 1.959496442255063, "step": 1790}, {"loss": 1.003, "grad_norm": 0.3492335081100464, "learning_rate": 0.0002, "epoch": 1.9704433497536946, "step": 1800}, {"loss": 0.9891, "grad_norm": 0.364577978849411, "learning_rate": 0.0002, "epoch": 1.9813902572523263, "step": 1810}, {"loss": 1.0699, "grad_norm": 0.4312075674533844, "learning_rate": 0.0002, "epoch": 1.9923371647509578, "step": 1820}, {"eval_loss": 1.14472496509552, "eval_runtime": 46.0786, "eval_samples_per_second": 9.462, "eval_steps_per_second": 1.194, "epoch": 2.0, "step": 1827}, {"loss": 1.0218, "grad_norm": 0.5989689230918884, "learning_rate": 0.0002, "epoch": 2.0032840722495897, "step": 1830}, {"loss": 0.9384, "grad_norm": 0.49720922112464905, "learning_rate": 0.0002, "epoch": 2.014230979748221, "step": 1840}, {"loss": 0.8482, "grad_norm": 0.42675456404685974, "learning_rate": 0.0002, "epoch": 2.0251778872468527, "step": 1850}, {"loss": 0.8487, "grad_norm": 0.4637208580970764, "learning_rate": 0.0002, "epoch": 2.0361247947454846, "step": 1860}, {"loss": 0.8212, "grad_norm": 0.8329976797103882, "learning_rate": 0.0002, "epoch": 2.047071702244116, "step": 1870}, {"loss": 0.9547, "grad_norm": 0.7869427800178528, "learning_rate": 0.0002, "epoch": 2.0580186097427475, "step": 1880}, {"loss": 0.9351, "grad_norm": 0.4927455186843872, "learning_rate": 0.0002, "epoch": 2.0689655172413794, "step": 1890}, {"loss": 0.8652, "grad_norm": 0.6264246702194214, "learning_rate": 0.0002, "epoch": 2.079912424740011, "step": 1900}, {"loss": 0.9095, "grad_norm": 1.1164122819900513, "learning_rate": 0.0002, "epoch": 2.0908593322386424, "step": 1910}, {"loss": 0.7823, "grad_norm": 0.5283981561660767, "learning_rate": 0.0002, "epoch": 2.1018062397372743, "step": 1920}, {"loss": 0.8065, "grad_norm": 0.45621731877326965, "learning_rate": 0.0002, "epoch": 2.112753147235906, "step": 1930}, {"loss": 0.9184, "grad_norm": 1.381791591644287, "learning_rate": 0.0002, "epoch": 2.1237000547345373, "step": 1940}, {"loss": 0.9006, "grad_norm": 0.5151259899139404, "learning_rate": 0.0002, "epoch": 2.134646962233169, "step": 1950}, {"loss": 0.8436, "grad_norm": 0.9806339740753174, "learning_rate": 0.0002, "epoch": 2.1455938697318007, "step": 1960}, {"loss": 0.8749, "grad_norm": 0.4734154939651489, "learning_rate": 0.0002, "epoch": 2.1565407772304326, "step": 1970}, {"loss": 0.9172, "grad_norm": 0.9553168416023254, "learning_rate": 0.0002, "epoch": 2.167487684729064, "step": 1980}, {"loss": 0.8047, "grad_norm": 0.5895838141441345, "learning_rate": 0.0002, "epoch": 2.1784345922276955, "step": 1990}, {"loss": 0.7841, "grad_norm": 0.4488855302333832, "learning_rate": 0.0002, "epoch": 2.1893814997263275, "step": 2000}, {"loss": 0.8205, "grad_norm": 1.0760235786437988, "learning_rate": 0.0002, "epoch": 2.200328407224959, "step": 2010}, {"loss": 0.7923, "grad_norm": 0.5038785338401794, "learning_rate": 0.0002, "epoch": 2.2112753147235904, "step": 2020}, {"loss": 0.8973, "grad_norm": 0.59819495677948, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 2030}, {"loss": 0.8009, "grad_norm": 0.5012075304985046, "learning_rate": 0.0002, "epoch": 2.233169129720854, "step": 2040}, {"loss": 0.9087, "grad_norm": 0.44978439807891846, "learning_rate": 0.0002, "epoch": 2.2441160372194853, "step": 2050}, {"loss": 0.823, "grad_norm": 0.5350462198257446, "learning_rate": 0.0002, "epoch": 2.255062944718117, "step": 2060}, {"loss": 0.8335, "grad_norm": 0.6020669937133789, "learning_rate": 0.0002, "epoch": 2.2660098522167487, "step": 2070}, {"loss": 0.8023, "grad_norm": 0.5246821045875549, "learning_rate": 0.0002, "epoch": 2.2769567597153806, "step": 2080}, {"loss": 0.8984, "grad_norm": 0.5711268782615662, "learning_rate": 0.0002, "epoch": 2.287903667214012, "step": 2090}, {"loss": 0.9093, "grad_norm": 0.617317259311676, "learning_rate": 0.0002, "epoch": 2.2988505747126435, "step": 2100}, {"loss": 0.8311, "grad_norm": 0.8608947396278381, "learning_rate": 0.0002, "epoch": 2.3097974822112755, "step": 2110}, {"loss": 0.7839, "grad_norm": 0.4739076793193817, "learning_rate": 0.0002, "epoch": 2.320744389709907, "step": 2120}, {"loss": 0.84, "grad_norm": 0.5538856983184814, "learning_rate": 0.0002, "epoch": 2.3316912972085384, "step": 2130}, {"loss": 0.8994, "grad_norm": 0.6064935326576233, "learning_rate": 0.0002, "epoch": 2.3426382047071703, "step": 2140}, {"loss": 0.7765, "grad_norm": 0.5019068121910095, "learning_rate": 0.0002, "epoch": 2.353585112205802, "step": 2150}, {"loss": 0.9576, "grad_norm": 0.45340514183044434, "learning_rate": 0.0002, "epoch": 2.3645320197044333, "step": 2160}, {"loss": 0.9254, "grad_norm": 0.7347203493118286, "learning_rate": 0.0002, "epoch": 2.375478927203065, "step": 2170}, {"loss": 0.8507, "grad_norm": 0.46922534704208374, "learning_rate": 0.0002, "epoch": 2.3864258347016967, "step": 2180}, {"loss": 0.8325, "grad_norm": 0.5507845878601074, "learning_rate": 0.0002, "epoch": 2.3973727422003286, "step": 2190}, {"loss": 0.8178, "grad_norm": 0.5621911883354187, "learning_rate": 0.0002, "epoch": 2.40831964969896, "step": 2200}, {"loss": 0.8569, "grad_norm": 0.5023514032363892, "learning_rate": 0.0002, "epoch": 2.4192665571975915, "step": 2210}, {"loss": 0.8355, "grad_norm": 0.6124861240386963, "learning_rate": 0.0002, "epoch": 2.4302134646962235, "step": 2220}, {"loss": 0.885, "grad_norm": 0.49614205956459045, "learning_rate": 0.0002, "epoch": 2.441160372194855, "step": 2230}, {"loss": 0.8008, "grad_norm": 0.6477900743484497, "learning_rate": 0.0002, "epoch": 2.4521072796934864, "step": 2240}, {"loss": 0.8622, "grad_norm": 0.5868843793869019, "learning_rate": 0.0002, "epoch": 2.4630541871921183, "step": 2250}, {"loss": 0.8498, "grad_norm": 0.4364610016345978, "learning_rate": 0.0002, "epoch": 2.47400109469075, "step": 2260}, {"loss": 0.8378, "grad_norm": 0.5792964696884155, "learning_rate": 0.0002, "epoch": 2.4849480021893813, "step": 2270}, {"loss": 0.8743, "grad_norm": 0.5421269536018372, "learning_rate": 0.0002, "epoch": 2.495894909688013, "step": 2280}, {"loss": 0.9637, "grad_norm": 0.5525493025779724, "learning_rate": 0.0002, "epoch": 2.5068418171866447, "step": 2290}, {"loss": 0.8075, "grad_norm": 0.6463850140571594, "learning_rate": 0.0002, "epoch": 2.5177887246852766, "step": 2300}, {"loss": 0.8591, "grad_norm": 0.6861311793327332, "learning_rate": 0.0002, "epoch": 2.528735632183908, "step": 2310}, {"loss": 0.9287, "grad_norm": 0.5563555359840393, "learning_rate": 0.0002, "epoch": 2.5396825396825395, "step": 2320}, {"loss": 0.945, "grad_norm": 0.5721169114112854, "learning_rate": 0.0002, "epoch": 2.5506294471811715, "step": 2330}, {"loss": 0.8271, "grad_norm": 0.5258274674415588, "learning_rate": 0.0002, "epoch": 2.561576354679803, "step": 2340}, {"loss": 0.8515, "grad_norm": 0.7057380676269531, "learning_rate": 0.0002, "epoch": 2.572523262178435, "step": 2350}, {"loss": 0.8615, "grad_norm": 0.6869027614593506, "learning_rate": 0.0002, "epoch": 2.5834701696770663, "step": 2360}, {"loss": 0.8043, "grad_norm": 0.4960809648036957, "learning_rate": 0.0002, "epoch": 2.594417077175698, "step": 2370}, {"loss": 0.8476, "grad_norm": 0.9288380146026611, "learning_rate": 0.0002, "epoch": 2.6053639846743293, "step": 2380}, {"loss": 0.873, "grad_norm": 0.3765334188938141, "learning_rate": 0.0002, "epoch": 2.616310892172961, "step": 2390}, {"loss": 0.8764, "grad_norm": 0.7487865686416626, "learning_rate": 0.0002, "epoch": 2.6272577996715927, "step": 2400}, {"loss": 0.7577, "grad_norm": 0.6141156554222107, "learning_rate": 0.0002, "epoch": 2.6382047071702246, "step": 2410}, {"loss": 0.8534, "grad_norm": 0.8420507907867432, "learning_rate": 0.0002, "epoch": 2.649151614668856, "step": 2420}, {"loss": 0.8311, "grad_norm": 0.53386390209198, "learning_rate": 0.0002, "epoch": 2.6600985221674875, "step": 2430}, {"loss": 0.8486, "grad_norm": 0.5520607233047485, "learning_rate": 0.0002, "epoch": 2.6710454296661195, "step": 2440}, {"loss": 0.8686, "grad_norm": 0.5337599515914917, "learning_rate": 0.0002, "epoch": 2.681992337164751, "step": 2450}, {"loss": 0.792, "grad_norm": 0.48790836334228516, "learning_rate": 0.0002, "epoch": 2.692939244663383, "step": 2460}, {"loss": 0.7864, "grad_norm": 0.8287786245346069, "learning_rate": 0.0002, "epoch": 2.7038861521620143, "step": 2470}, {"loss": 0.8244, "grad_norm": 0.5876168608665466, "learning_rate": 0.0002, "epoch": 2.714833059660646, "step": 2480}, {"loss": 0.8773, "grad_norm": 0.5206760764122009, "learning_rate": 0.0002, "epoch": 2.7257799671592773, "step": 2490}, {"loss": 0.8097, "grad_norm": 0.5619136691093445, "learning_rate": 0.0002, "epoch": 2.736726874657909, "step": 2500}, {"loss": 0.8377, "grad_norm": 0.5614883899688721, "learning_rate": 0.0002, "epoch": 2.7476737821565407, "step": 2510}, {"loss": 0.8817, "grad_norm": 0.6157700419425964, "learning_rate": 0.0002, "epoch": 2.7586206896551726, "step": 2520}, {"loss": 0.846, "grad_norm": 0.5529953837394714, "learning_rate": 0.0002, "epoch": 2.769567597153804, "step": 2530}, {"loss": 0.8881, "grad_norm": 0.6731224060058594, "learning_rate": 0.0002, "epoch": 2.7805145046524355, "step": 2540}, {"loss": 0.9125, "grad_norm": 0.6960386633872986, "learning_rate": 0.0002, "epoch": 2.7914614121510675, "step": 2550}, {"loss": 0.7823, "grad_norm": 0.5203493237495422, "learning_rate": 0.0002, "epoch": 2.802408319649699, "step": 2560}, {"loss": 0.7951, "grad_norm": 1.036837100982666, "learning_rate": 0.0002, "epoch": 2.813355227148331, "step": 2570}, {"loss": 0.8427, "grad_norm": 0.6125805377960205, "learning_rate": 0.0002, "epoch": 2.8243021346469623, "step": 2580}, {"loss": 0.8939, "grad_norm": 0.6298092603683472, "learning_rate": 0.0002, "epoch": 2.835249042145594, "step": 2590}, {"loss": 0.8845, "grad_norm": 0.5882203578948975, "learning_rate": 0.0002, "epoch": 2.8461959496442253, "step": 2600}, {"loss": 0.8921, "grad_norm": 0.8619399666786194, "learning_rate": 0.0002, "epoch": 2.857142857142857, "step": 2610}, {"loss": 0.8213, "grad_norm": 0.4722687304019928, "learning_rate": 0.0002, "epoch": 2.8680897646414887, "step": 2620}, {"loss": 0.9147, "grad_norm": 0.47399574518203735, "learning_rate": 0.0002, "epoch": 2.8790366721401206, "step": 2630}, {"loss": 0.8325, "grad_norm": 0.5639172792434692, "learning_rate": 0.0002, "epoch": 2.889983579638752, "step": 2640}, {"loss": 0.8653, "grad_norm": 0.4676816761493683, "learning_rate": 0.0002, "epoch": 2.9009304871373836, "step": 2650}, {"loss": 0.8966, "grad_norm": 0.6906291246414185, "learning_rate": 0.0002, "epoch": 2.9118773946360155, "step": 2660}, {"loss": 0.8966, "grad_norm": 0.4369746148586273, "learning_rate": 0.0002, "epoch": 2.922824302134647, "step": 2670}, {"loss": 0.9173, "grad_norm": 0.46423083543777466, "learning_rate": 0.0002, "epoch": 2.933771209633279, "step": 2680}, {"loss": 0.7961, "grad_norm": 0.5700525045394897, "learning_rate": 0.0002, "epoch": 2.9447181171319103, "step": 2690}, {"loss": 0.8172, "grad_norm": 0.6221476793289185, "learning_rate": 0.0002, "epoch": 2.955665024630542, "step": 2700}, {"loss": 0.8538, "grad_norm": 0.6102682948112488, "learning_rate": 0.0002, "epoch": 2.9666119321291733, "step": 2710}, {"loss": 0.9779, "grad_norm": 0.5317878723144531, "learning_rate": 0.0002, "epoch": 2.977558839627805, "step": 2720}, {"loss": 0.9314, "grad_norm": 0.4438510835170746, "learning_rate": 0.0002, "epoch": 2.9885057471264367, "step": 2730}, {"loss": 0.8553, "grad_norm": 0.5022130012512207, "learning_rate": 0.0002, "epoch": 2.9994526546250686, "step": 2740}, {"eval_loss": 1.1722838878631592, "eval_runtime": 46.0829, "eval_samples_per_second": 9.461, "eval_steps_per_second": 1.194, "epoch": 2.9994526546250686, "step": 2740}, {"loss": 0.6443, "grad_norm": 0.6384502053260803, "learning_rate": 0.0002, "epoch": 3.0103995621237, "step": 2750}, {"loss": 0.7123, "grad_norm": 0.9928722381591797, "learning_rate": 0.0002, "epoch": 3.0213464696223316, "step": 2760}, {"loss": 0.6045, "grad_norm": 0.7813051342964172, "learning_rate": 0.0002, "epoch": 3.0322933771209635, "step": 2770}, {"loss": 0.6042, "grad_norm": 1.0202556848526, "learning_rate": 0.0002, "epoch": 3.043240284619595, "step": 2780}, {"loss": 0.6356, "grad_norm": 0.7581062316894531, "learning_rate": 0.0002, "epoch": 3.0541871921182264, "step": 2790}, {"loss": 0.6349, "grad_norm": 0.6252710223197937, "learning_rate": 0.0002, "epoch": 3.0651340996168583, "step": 2800}, {"loss": 0.645, "grad_norm": 0.7738662958145142, "learning_rate": 0.0002, "epoch": 3.07608100711549, "step": 2810}, {"loss": 0.627, "grad_norm": 0.7381885051727295, "learning_rate": 0.0002, "epoch": 3.0870279146141213, "step": 2820}, {"loss": 0.6371, "grad_norm": 0.9197564721107483, "learning_rate": 0.0002, "epoch": 3.097974822112753, "step": 2830}, {"loss": 0.723, "grad_norm": 1.000976800918579, "learning_rate": 0.0002, "epoch": 3.1089217296113847, "step": 2840}, {"loss": 0.6631, "grad_norm": 0.7559131383895874, "learning_rate": 0.0002, "epoch": 3.1198686371100166, "step": 2850}, {"loss": 0.6252, "grad_norm": 0.7213780879974365, "learning_rate": 0.0002, "epoch": 3.130815544608648, "step": 2860}, {"loss": 0.6501, "grad_norm": 0.945939838886261, "learning_rate": 0.0002, "epoch": 3.1417624521072796, "step": 2870}, {"loss": 0.6129, "grad_norm": 0.7277454137802124, "learning_rate": 0.0002, "epoch": 3.1527093596059115, "step": 2880}, {"loss": 0.6423, "grad_norm": 0.762026846408844, "learning_rate": 0.0002, "epoch": 3.163656267104543, "step": 2890}, {"loss": 0.5332, "grad_norm": 0.6471221446990967, "learning_rate": 0.0002, "epoch": 3.1746031746031744, "step": 2900}, {"loss": 0.7981, "grad_norm": 0.6018978357315063, "learning_rate": 0.0002, "epoch": 3.1855500821018063, "step": 2910}, {"loss": 0.7274, "grad_norm": 0.8607320785522461, "learning_rate": 0.0002, "epoch": 3.196496989600438, "step": 2920}, {"loss": 0.6139, "grad_norm": 0.8854126334190369, "learning_rate": 0.0002, "epoch": 3.2074438970990693, "step": 2930}, {"loss": 0.6485, "grad_norm": 0.6620870232582092, "learning_rate": 0.0002, "epoch": 3.218390804597701, "step": 2940}, {"loss": 0.6969, "grad_norm": 0.7377511858940125, "learning_rate": 0.0002, "epoch": 3.2293377120963327, "step": 2950}, {"loss": 0.6798, "grad_norm": 0.7803301811218262, "learning_rate": 0.0002, "epoch": 3.2402846195949646, "step": 2960}, {"loss": 0.6697, "grad_norm": 0.834061861038208, "learning_rate": 0.0002, "epoch": 3.251231527093596, "step": 2970}, {"loss": 0.6894, "grad_norm": 0.8496041893959045, "learning_rate": 0.0002, "epoch": 3.2621784345922276, "step": 2980}, {"loss": 0.6591, "grad_norm": 0.7967984676361084, "learning_rate": 0.0002, "epoch": 3.2731253420908595, "step": 2990}, {"loss": 0.7266, "grad_norm": 1.0207016468048096, "learning_rate": 0.0002, "epoch": 3.284072249589491, "step": 3000}, {"loss": 0.6586, "grad_norm": 0.7855771780014038, "learning_rate": 0.0002, "epoch": 3.2950191570881224, "step": 3010}, {"loss": 0.5711, "grad_norm": 0.9427546858787537, "learning_rate": 0.0002, "epoch": 3.3059660645867543, "step": 3020}, {"loss": 0.6277, "grad_norm": 0.823542594909668, "learning_rate": 0.0002, "epoch": 3.316912972085386, "step": 3030}, {"loss": 0.7109, "grad_norm": 0.9826635122299194, "learning_rate": 0.0002, "epoch": 3.3278598795840173, "step": 3040}, {"loss": 0.6564, "grad_norm": 0.7259827852249146, "learning_rate": 0.0002, "epoch": 3.338806787082649, "step": 3050}, {"loss": 0.653, "grad_norm": 0.7774739861488342, "learning_rate": 0.0002, "epoch": 3.3497536945812807, "step": 3060}, {"loss": 0.7529, "grad_norm": 0.7394293546676636, "learning_rate": 0.0002, "epoch": 3.3607006020799126, "step": 3070}, {"loss": 0.5987, "grad_norm": 0.9017578959465027, "learning_rate": 0.0002, "epoch": 3.371647509578544, "step": 3080}, {"loss": 0.6953, "grad_norm": 0.7451054453849792, "learning_rate": 0.0002, "epoch": 3.3825944170771756, "step": 3090}, {"loss": 0.6759, "grad_norm": 0.7321506142616272, "learning_rate": 0.0002, "epoch": 3.3935413245758075, "step": 3100}, {"loss": 0.6555, "grad_norm": 0.6721828579902649, "learning_rate": 0.0002, "epoch": 3.404488232074439, "step": 3110}, {"loss": 0.6559, "grad_norm": 0.774022102355957, "learning_rate": 0.0002, "epoch": 3.4154351395730704, "step": 3120}, {"loss": 0.7449, "grad_norm": 0.9143537282943726, "learning_rate": 0.0002, "epoch": 3.4263820470717024, "step": 3130}, {"loss": 0.6899, "grad_norm": 1.226087212562561, "learning_rate": 0.0002, "epoch": 3.437328954570334, "step": 3140}, {"loss": 0.6719, "grad_norm": 0.7545496225357056, "learning_rate": 0.0002, "epoch": 3.4482758620689653, "step": 3150}, {"loss": 0.6153, "grad_norm": 0.6515635848045349, "learning_rate": 0.0002, "epoch": 3.4592227695675972, "step": 3160}, {"loss": 0.6926, "grad_norm": 0.9297090172767639, "learning_rate": 0.0002, "epoch": 3.4701696770662287, "step": 3170}, {"loss": 0.6071, "grad_norm": 1.0130730867385864, "learning_rate": 0.0002, "epoch": 3.4811165845648606, "step": 3180}, {"loss": 0.5959, "grad_norm": 0.7654589414596558, "learning_rate": 0.0002, "epoch": 3.492063492063492, "step": 3190}, {"loss": 0.7401, "grad_norm": 0.9954977631568909, "learning_rate": 0.0002, "epoch": 3.5030103995621236, "step": 3200}, {"loss": 0.6661, "grad_norm": 0.6027487516403198, "learning_rate": 0.0002, "epoch": 3.5139573070607555, "step": 3210}, {"loss": 0.6963, "grad_norm": 0.741770327091217, "learning_rate": 0.0002, "epoch": 3.524904214559387, "step": 3220}, {"loss": 0.8112, "grad_norm": 1.0534909963607788, "learning_rate": 0.0002, "epoch": 3.535851122058019, "step": 3230}, {"loss": 0.6813, "grad_norm": 0.937772274017334, "learning_rate": 0.0002, "epoch": 3.5467980295566504, "step": 3240}, {"loss": 0.6681, "grad_norm": 0.8504213690757751, "learning_rate": 0.0002, "epoch": 3.557744937055282, "step": 3250}, {"loss": 0.6436, "grad_norm": 0.7755007147789001, "learning_rate": 0.0002, "epoch": 3.5686918445539133, "step": 3260}, {"loss": 0.6213, "grad_norm": 1.0193358659744263, "learning_rate": 0.0002, "epoch": 3.5796387520525452, "step": 3270}, {"loss": 0.671, "grad_norm": 0.8440536856651306, "learning_rate": 0.0002, "epoch": 3.5905856595511767, "step": 3280}, {"loss": 0.6859, "grad_norm": 0.6195939183235168, "learning_rate": 0.0002, "epoch": 3.6015325670498086, "step": 3290}, {"loss": 0.7446, "grad_norm": 0.8608590960502625, "learning_rate": 0.0002, "epoch": 3.61247947454844, "step": 3300}, {"loss": 0.7301, "grad_norm": 0.6772327423095703, "learning_rate": 0.0002, "epoch": 3.6234263820470716, "step": 3310}, {"loss": 0.6298, "grad_norm": 0.8031839728355408, "learning_rate": 0.0002, "epoch": 3.6343732895457035, "step": 3320}, {"loss": 0.7041, "grad_norm": 0.6080502271652222, "learning_rate": 0.0002, "epoch": 3.645320197044335, "step": 3330}, {"loss": 0.7431, "grad_norm": 0.8007240891456604, "learning_rate": 0.0002, "epoch": 3.656267104542967, "step": 3340}, {"loss": 0.7446, "grad_norm": 0.8060704469680786, "learning_rate": 0.0002, "epoch": 3.6672140120415984, "step": 3350}, {"loss": 0.6304, "grad_norm": 0.7547586560249329, "learning_rate": 0.0002, "epoch": 3.67816091954023, "step": 3360}, {"loss": 0.7066, "grad_norm": 0.686851978302002, "learning_rate": 0.0002, "epoch": 3.6891078270388613, "step": 3370}, {"loss": 0.6748, "grad_norm": 0.9429075717926025, "learning_rate": 0.0002, "epoch": 3.7000547345374932, "step": 3380}, {"loss": 0.6673, "grad_norm": 0.7283591032028198, "learning_rate": 0.0002, "epoch": 3.7110016420361247, "step": 3390}, {"loss": 0.7502, "grad_norm": 0.8323085904121399, "learning_rate": 0.0002, "epoch": 3.7219485495347566, "step": 3400}, {"loss": 0.7779, "grad_norm": 0.8529590964317322, "learning_rate": 0.0002, "epoch": 3.732895457033388, "step": 3410}, {"loss": 0.6555, "grad_norm": 0.731752872467041, "learning_rate": 0.0002, "epoch": 3.7438423645320196, "step": 3420}, {"loss": 0.6928, "grad_norm": 0.8572278618812561, "learning_rate": 0.0002, "epoch": 3.7547892720306515, "step": 3430}, {"loss": 0.6215, "grad_norm": 0.7408691048622131, "learning_rate": 0.0002, "epoch": 3.765736179529283, "step": 3440}, {"loss": 0.622, "grad_norm": 0.7470445036888123, "learning_rate": 0.0002, "epoch": 3.776683087027915, "step": 3450}, {"loss": 0.7241, "grad_norm": 0.6806244254112244, "learning_rate": 0.0002, "epoch": 3.7876299945265464, "step": 3460}, {"loss": 0.7739, "grad_norm": 0.9129069447517395, "learning_rate": 0.0002, "epoch": 3.798576902025178, "step": 3470}, {"loss": 0.6826, "grad_norm": 0.8717501759529114, "learning_rate": 0.0002, "epoch": 3.8095238095238093, "step": 3480}, {"loss": 0.6188, "grad_norm": 0.6761979460716248, "learning_rate": 0.0002, "epoch": 3.8204707170224412, "step": 3490}, {"loss": 0.7601, "grad_norm": 1.0054380893707275, "learning_rate": 0.0002, "epoch": 3.8314176245210727, "step": 3500}, {"loss": 0.623, "grad_norm": 1.1224009990692139, "learning_rate": 0.0002, "epoch": 3.8423645320197046, "step": 3510}, {"loss": 0.6918, "grad_norm": 0.8997692465782166, "learning_rate": 0.0002, "epoch": 3.853311439518336, "step": 3520}, {"loss": 0.6357, "grad_norm": 1.0086902379989624, "learning_rate": 0.0002, "epoch": 3.8642583470169676, "step": 3530}, {"loss": 0.6379, "grad_norm": 0.772739589214325, "learning_rate": 0.0002, "epoch": 3.8752052545155995, "step": 3540}, {"loss": 0.7423, "grad_norm": 1.211774230003357, "learning_rate": 0.0002, "epoch": 3.886152162014231, "step": 3550}, {"loss": 0.7321, "grad_norm": 0.9572356939315796, "learning_rate": 0.0002, "epoch": 3.897099069512863, "step": 3560}, {"loss": 0.6836, "grad_norm": 0.7887842655181885, "learning_rate": 0.0002, "epoch": 3.9080459770114944, "step": 3570}, {"loss": 0.7576, "grad_norm": 0.7308389544487, "learning_rate": 0.0002, "epoch": 3.918992884510126, "step": 3580}, {"loss": 0.6001, "grad_norm": 1.0182650089263916, "learning_rate": 0.0002, "epoch": 3.9299397920087573, "step": 3590}, {"loss": 0.6942, "grad_norm": 0.8000147342681885, "learning_rate": 0.0002, "epoch": 3.9408866995073892, "step": 3600}, {"loss": 0.6244, "grad_norm": 0.7385728359222412, "learning_rate": 0.0002, "epoch": 3.9518336070060207, "step": 3610}, {"loss": 0.6718, "grad_norm": 0.9233261942863464, "learning_rate": 0.0002, "epoch": 3.9627805145046526, "step": 3620}, {"loss": 0.6508, "grad_norm": 0.8486751914024353, "learning_rate": 0.0002, "epoch": 3.973727422003284, "step": 3630}, {"loss": 0.6928, "grad_norm": 0.7593663334846497, "learning_rate": 0.0002, "epoch": 3.9846743295019156, "step": 3640}, {"loss": 0.6847, "grad_norm": 0.7885415554046631, "learning_rate": 0.0002, "epoch": 3.9956212370005475, "step": 3650}, {"eval_loss": 1.250312328338623, "eval_runtime": 46.0842, "eval_samples_per_second": 9.461, "eval_steps_per_second": 1.193, "epoch": 4.0, "step": 3654}, {"loss": 0.5547, "grad_norm": 0.6591703295707703, "learning_rate": 0.0002, "epoch": 4.006568144499179, "step": 3660}, {"loss": 0.5301, "grad_norm": 1.36927330493927, "learning_rate": 0.0002, "epoch": 4.017515051997811, "step": 3670}, {"loss": 0.4466, "grad_norm": 0.8106328845024109, "learning_rate": 0.0002, "epoch": 4.028461959496442, "step": 3680}, {"loss": 0.4861, "grad_norm": 0.7592712044715881, "learning_rate": 0.0002, "epoch": 4.039408866995074, "step": 3690}, {"loss": 0.5103, "grad_norm": 0.9518909454345703, "learning_rate": 0.0002, "epoch": 4.050355774493705, "step": 3700}, {"loss": 0.4638, "grad_norm": 0.7805967330932617, "learning_rate": 0.0002, "epoch": 4.061302681992337, "step": 3710}, {"loss": 0.4556, "grad_norm": 1.3146334886550903, "learning_rate": 0.0002, "epoch": 4.072249589490969, "step": 3720}, {"loss": 0.5635, "grad_norm": 1.1611138582229614, "learning_rate": 0.0002, "epoch": 4.083196496989601, "step": 3730}, {"loss": 0.3845, "grad_norm": 0.8173232078552246, "learning_rate": 0.0002, "epoch": 4.094143404488232, "step": 3740}, {"loss": 0.4911, "grad_norm": 0.7848323583602905, "learning_rate": 0.0002, "epoch": 4.105090311986864, "step": 3750}, {"loss": 0.4519, "grad_norm": 1.3183201551437378, "learning_rate": 0.0002, "epoch": 4.116037219485495, "step": 3760}, {"loss": 0.5083, "grad_norm": 1.1936529874801636, "learning_rate": 0.0002, "epoch": 4.1269841269841265, "step": 3770}, {"loss": 0.5208, "grad_norm": 1.1078993082046509, "learning_rate": 0.0002, "epoch": 4.137931034482759, "step": 3780}, {"loss": 0.5928, "grad_norm": 1.107743263244629, "learning_rate": 0.0002, "epoch": 4.14887794198139, "step": 3790}, {"loss": 0.5112, "grad_norm": 0.7801875472068787, "learning_rate": 0.0002, "epoch": 4.159824849480022, "step": 3800}, {"loss": 0.4896, "grad_norm": 1.1328117847442627, "learning_rate": 0.0002, "epoch": 4.170771756978653, "step": 3810}, {"loss": 0.5645, "grad_norm": 1.4232193231582642, "learning_rate": 0.0002, "epoch": 4.181718664477285, "step": 3820}, {"loss": 0.5049, "grad_norm": 1.557416558265686, "learning_rate": 0.0002, "epoch": 4.192665571975917, "step": 3830}, {"loss": 0.4863, "grad_norm": 1.042923092842102, "learning_rate": 0.0002, "epoch": 4.203612479474549, "step": 3840}, {"loss": 0.3751, "grad_norm": 1.1801949739456177, "learning_rate": 0.0002, "epoch": 4.21455938697318, "step": 3850}, {"loss": 0.5063, "grad_norm": 0.9273753762245178, "learning_rate": 0.0002, "epoch": 4.225506294471812, "step": 3860}, {"loss": 0.5542, "grad_norm": 0.7681763768196106, "learning_rate": 0.0002, "epoch": 4.236453201970443, "step": 3870}, {"loss": 0.5971, "grad_norm": 0.9840841293334961, "learning_rate": 0.0002, "epoch": 4.2474001094690745, "step": 3880}, {"loss": 0.4648, "grad_norm": 1.0290725231170654, "learning_rate": 0.0002, "epoch": 4.258347016967707, "step": 3890}, {"loss": 0.4288, "grad_norm": 0.8059597611427307, "learning_rate": 0.0002, "epoch": 4.269293924466338, "step": 3900}, {"loss": 0.5103, "grad_norm": 0.9847467541694641, "learning_rate": 0.0002, "epoch": 4.28024083196497, "step": 3910}, {"loss": 0.4952, "grad_norm": 1.344044804573059, "learning_rate": 0.0002, "epoch": 4.291187739463601, "step": 3920}, {"loss": 0.4966, "grad_norm": 0.9174224138259888, "learning_rate": 0.0002, "epoch": 4.302134646962233, "step": 3930}, {"loss": 0.4944, "grad_norm": 1.1199711561203003, "learning_rate": 0.0002, "epoch": 4.313081554460865, "step": 3940}, {"loss": 0.4641, "grad_norm": 1.0120296478271484, "learning_rate": 0.0002, "epoch": 4.324028461959497, "step": 3950}, {"loss": 0.4723, "grad_norm": 1.091811180114746, "learning_rate": 0.0002, "epoch": 4.334975369458128, "step": 3960}, {"loss": 0.4627, "grad_norm": 1.0332133769989014, "learning_rate": 0.0002, "epoch": 4.34592227695676, "step": 3970}, {"loss": 0.4646, "grad_norm": 1.0785295963287354, "learning_rate": 0.0002, "epoch": 4.356869184455391, "step": 3980}, {"loss": 0.4909, "grad_norm": 1.0506969690322876, "learning_rate": 0.0002, "epoch": 4.3678160919540225, "step": 3990}, {"loss": 0.4776, "grad_norm": 1.047560691833496, "learning_rate": 0.0002, "epoch": 4.378762999452655, "step": 4000}, {"loss": 0.4549, "grad_norm": 0.9348800778388977, "learning_rate": 0.0002, "epoch": 4.389709906951286, "step": 4010}, {"loss": 0.5333, "grad_norm": 1.1563059091567993, "learning_rate": 0.0002, "epoch": 4.400656814449918, "step": 4020}, {"loss": 0.4952, "grad_norm": 1.001470923423767, "learning_rate": 0.0002, "epoch": 4.411603721948549, "step": 4030}, {"loss": 0.4972, "grad_norm": 1.309012532234192, "learning_rate": 0.0002, "epoch": 4.422550629447181, "step": 4040}, {"loss": 0.5078, "grad_norm": 0.7338925004005432, "learning_rate": 0.0002, "epoch": 4.433497536945813, "step": 4050}, {"loss": 0.4632, "grad_norm": 1.0398834943771362, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 4060}, {"loss": 0.6285, "grad_norm": 0.9728689193725586, "learning_rate": 0.0002, "epoch": 4.455391351943076, "step": 4070}, {"loss": 0.4778, "grad_norm": 1.247475028038025, "learning_rate": 0.0002, "epoch": 4.466338259441708, "step": 4080}, {"loss": 0.4813, "grad_norm": 1.1084578037261963, "learning_rate": 0.0002, "epoch": 4.477285166940339, "step": 4090}, {"loss": 0.5665, "grad_norm": 1.1619318723678589, "learning_rate": 0.0002, "epoch": 4.4882320744389705, "step": 4100}, {"loss": 0.5207, "grad_norm": 1.3456498384475708, "learning_rate": 0.0002, "epoch": 4.499178981937603, "step": 4110}, {"loss": 0.4876, "grad_norm": 0.9372991323471069, "learning_rate": 0.0002, "epoch": 4.510125889436234, "step": 4120}, {"loss": 0.5456, "grad_norm": 1.0071815252304077, "learning_rate": 0.0002, "epoch": 4.521072796934866, "step": 4130}, {"loss": 0.5589, "grad_norm": 1.190344214439392, "learning_rate": 0.0002, "epoch": 4.532019704433497, "step": 4140}, {"loss": 0.4852, "grad_norm": 0.9480887055397034, "learning_rate": 0.0002, "epoch": 4.542966611932129, "step": 4150}, {"loss": 0.5229, "grad_norm": 1.0252189636230469, "learning_rate": 0.0002, "epoch": 4.553913519430761, "step": 4160}, {"loss": 0.5253, "grad_norm": 0.7142013311386108, "learning_rate": 0.0002, "epoch": 4.564860426929393, "step": 4170}, {"loss": 0.4861, "grad_norm": 0.8937426805496216, "learning_rate": 0.0002, "epoch": 4.575807334428024, "step": 4180}, {"loss": 0.4773, "grad_norm": 0.8885005116462708, "learning_rate": 0.0002, "epoch": 4.586754241926656, "step": 4190}, {"loss": 0.4858, "grad_norm": 1.337663173675537, "learning_rate": 0.0002, "epoch": 4.597701149425287, "step": 4200}, {"loss": 0.5247, "grad_norm": 1.0475375652313232, "learning_rate": 0.0002, "epoch": 4.6086480569239185, "step": 4210}, {"loss": 0.5298, "grad_norm": 1.0081088542938232, "learning_rate": 0.0002, "epoch": 4.619594964422551, "step": 4220}, {"loss": 0.5042, "grad_norm": 0.7527595162391663, "learning_rate": 0.0002, "epoch": 4.630541871921182, "step": 4230}, {"loss": 0.5207, "grad_norm": 1.55559241771698, "learning_rate": 0.0002, "epoch": 4.641488779419814, "step": 4240}, {"loss": 0.5468, "grad_norm": 0.7967379689216614, "learning_rate": 0.0002, "epoch": 4.652435686918445, "step": 4250}, {"loss": 0.5328, "grad_norm": 0.898368775844574, "learning_rate": 0.0002, "epoch": 4.663382594417077, "step": 4260}, {"loss": 0.4706, "grad_norm": 1.1940776109695435, "learning_rate": 0.0002, "epoch": 4.674329501915709, "step": 4270}, {"loss": 0.5121, "grad_norm": 1.1817092895507812, "learning_rate": 0.0002, "epoch": 4.685276409414341, "step": 4280}, {"loss": 0.5758, "grad_norm": 0.9041520357131958, "learning_rate": 0.0002, "epoch": 4.696223316912972, "step": 4290}, {"loss": 0.5851, "grad_norm": 1.1280102729797363, "learning_rate": 0.0002, "epoch": 4.707170224411604, "step": 4300}, {"loss": 0.4891, "grad_norm": 1.357689619064331, "learning_rate": 0.0002, "epoch": 4.718117131910235, "step": 4310}, {"loss": 0.4704, "grad_norm": 1.056633472442627, "learning_rate": 0.0002, "epoch": 4.7290640394088665, "step": 4320}, {"loss": 0.5488, "grad_norm": 1.6520427465438843, "learning_rate": 0.0002, "epoch": 4.740010946907499, "step": 4330}, {"loss": 0.5131, "grad_norm": 1.153200626373291, "learning_rate": 0.0002, "epoch": 4.75095785440613, "step": 4340}, {"loss": 0.539, "grad_norm": 0.9346241354942322, "learning_rate": 0.0002, "epoch": 4.761904761904762, "step": 4350}, {"loss": 0.4941, "grad_norm": 0.8628455996513367, "learning_rate": 0.0002, "epoch": 4.772851669403393, "step": 4360}, {"loss": 0.5167, "grad_norm": 1.3843916654586792, "learning_rate": 0.0002, "epoch": 4.783798576902025, "step": 4370}, {"loss": 0.4683, "grad_norm": 1.035574197769165, "learning_rate": 0.0002, "epoch": 4.794745484400657, "step": 4380}, {"loss": 0.5162, "grad_norm": 1.1868361234664917, "learning_rate": 0.0002, "epoch": 4.805692391899289, "step": 4390}, {"loss": 0.534, "grad_norm": 1.1307647228240967, "learning_rate": 0.0002, "epoch": 4.81663929939792, "step": 4400}, {"loss": 0.5567, "grad_norm": 0.9787724614143372, "learning_rate": 0.0002, "epoch": 4.827586206896552, "step": 4410}, {"loss": 0.5185, "grad_norm": 1.0473824739456177, "learning_rate": 0.0002, "epoch": 4.838533114395183, "step": 4420}, {"loss": 0.6285, "grad_norm": 1.069069504737854, "learning_rate": 0.0002, "epoch": 4.8494800218938146, "step": 4430}, {"loss": 0.5267, "grad_norm": 1.4305680990219116, "learning_rate": 0.0002, "epoch": 4.860426929392447, "step": 4440}, {"loss": 0.5947, "grad_norm": 1.3679203987121582, "learning_rate": 0.0002, "epoch": 4.871373836891078, "step": 4450}, {"loss": 0.5135, "grad_norm": 0.8997844457626343, "learning_rate": 0.0002, "epoch": 4.88232074438971, "step": 4460}, {"loss": 0.5312, "grad_norm": 1.2758110761642456, "learning_rate": 0.0002, "epoch": 4.893267651888341, "step": 4470}, {"loss": 0.4914, "grad_norm": 0.8819465637207031, "learning_rate": 0.0002, "epoch": 4.904214559386973, "step": 4480}, {"loss": 0.5147, "grad_norm": 1.08329439163208, "learning_rate": 0.0002, "epoch": 4.915161466885605, "step": 4490}, {"loss": 0.5404, "grad_norm": 1.083461046218872, "learning_rate": 0.0002, "epoch": 4.926108374384237, "step": 4500}, {"loss": 0.5433, "grad_norm": 1.2387723922729492, "learning_rate": 0.0002, "epoch": 4.937055281882868, "step": 4510}, {"loss": 0.5624, "grad_norm": 0.8262293934822083, "learning_rate": 0.0002, "epoch": 4.9480021893815, "step": 4520}, {"loss": 0.504, "grad_norm": 1.2325191497802734, "learning_rate": 0.0002, "epoch": 4.958949096880131, "step": 4530}, {"loss": 0.5452, "grad_norm": 1.024614930152893, "learning_rate": 0.0002, "epoch": 4.9698960043787626, "step": 4540}, {"loss": 0.4752, "grad_norm": 1.3007521629333496, "learning_rate": 0.0002, "epoch": 4.980842911877395, "step": 4550}, {"loss": 0.4943, "grad_norm": 0.9823828339576721, "learning_rate": 0.0002, "epoch": 4.991789819376026, "step": 4560}]} +{"epoch": 6.0, "step": 5481, "epoch_duration": 1359.439982175827, "total_accumulated_duration": 8170.820874452591, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28746.0}, "peak_memory_reserved": {"GPU_0": 28746.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4039, "grad_norm": 0.7611560821533203, "learning_rate": 0.0002, "epoch": 0.010946907498631636, "step": 10}, {"loss": 1.8818, "grad_norm": 0.4633193612098694, "learning_rate": 0.0002, "epoch": 0.021893814997263273, "step": 20}, {"loss": 1.5927, "grad_norm": 0.49326154589653015, "learning_rate": 0.0002, "epoch": 0.03284072249589491, "step": 30}, {"loss": 1.3859, "grad_norm": 0.3943138122558594, "learning_rate": 0.0002, "epoch": 0.043787629994526546, "step": 40}, {"loss": 1.3036, "grad_norm": 0.43292930722236633, "learning_rate": 0.0002, "epoch": 0.05473453749315818, "step": 50}, {"loss": 1.2427, "grad_norm": 0.3431817591190338, "learning_rate": 0.0002, "epoch": 0.06568144499178982, "step": 60}, {"loss": 1.3076, "grad_norm": 0.38011446595191956, "learning_rate": 0.0002, "epoch": 0.07662835249042145, "step": 70}, {"loss": 1.1641, "grad_norm": 0.7441071271896362, "learning_rate": 0.0002, "epoch": 0.08757525998905309, "step": 80}, {"loss": 1.1721, "grad_norm": 0.3359833061695099, "learning_rate": 0.0002, "epoch": 0.09852216748768473, "step": 90}, {"loss": 1.2774, "grad_norm": 0.3724392354488373, "learning_rate": 0.0002, "epoch": 0.10946907498631636, "step": 100}, {"loss": 1.216, "grad_norm": 0.40673762559890747, "learning_rate": 0.0002, "epoch": 0.120415982484948, "step": 110}, {"loss": 1.2611, "grad_norm": 0.40036800503730774, "learning_rate": 0.0002, "epoch": 0.13136288998357964, "step": 120}, {"loss": 1.2436, "grad_norm": 2.844191312789917, "learning_rate": 0.0002, "epoch": 0.1423097974822113, "step": 130}, {"loss": 1.2254, "grad_norm": 0.3104734420776367, "learning_rate": 0.0002, "epoch": 0.1532567049808429, "step": 140}, {"loss": 1.0498, "grad_norm": 0.3266797959804535, "learning_rate": 0.0002, "epoch": 0.16420361247947454, "step": 150}, {"loss": 1.1644, "grad_norm": 0.3079199194908142, "learning_rate": 0.0002, "epoch": 0.17515051997810618, "step": 160}, {"loss": 1.2923, "grad_norm": 0.3872479498386383, "learning_rate": 0.0002, "epoch": 0.18609742747673783, "step": 170}, {"loss": 1.1809, "grad_norm": 0.38654500246047974, "learning_rate": 0.0002, "epoch": 0.19704433497536947, "step": 180}, {"loss": 1.066, "grad_norm": 0.2913552522659302, "learning_rate": 0.0002, "epoch": 0.20799124247400108, "step": 190}, {"loss": 1.0868, "grad_norm": 0.2960572838783264, "learning_rate": 0.0002, "epoch": 0.21893814997263272, "step": 200}, {"loss": 1.136, "grad_norm": 0.5175501108169556, "learning_rate": 0.0002, "epoch": 0.22988505747126436, "step": 210}, {"loss": 1.1556, "grad_norm": 1.2921574115753174, "learning_rate": 0.0002, "epoch": 0.240831964969896, "step": 220}, {"loss": 1.1488, "grad_norm": 0.2675512135028839, "learning_rate": 0.0002, "epoch": 0.25177887246852765, "step": 230}, {"loss": 1.2764, "grad_norm": 0.3956190049648285, "learning_rate": 0.0002, "epoch": 0.2627257799671593, "step": 240}, {"loss": 1.1889, "grad_norm": 0.6022581458091736, "learning_rate": 0.0002, "epoch": 0.27367268746579093, "step": 250}, {"loss": 1.1981, "grad_norm": 1.1949563026428223, "learning_rate": 0.0002, "epoch": 0.2846195949644226, "step": 260}, {"loss": 1.1877, "grad_norm": 0.31173548102378845, "learning_rate": 0.0002, "epoch": 0.2955665024630542, "step": 270}, {"loss": 1.06, "grad_norm": 0.2808472812175751, "learning_rate": 0.0002, "epoch": 0.3065134099616858, "step": 280}, {"loss": 1.1752, "grad_norm": 0.3042023777961731, "learning_rate": 0.0002, "epoch": 0.31746031746031744, "step": 290}, {"loss": 1.3147, "grad_norm": 0.39915043115615845, "learning_rate": 0.0002, "epoch": 0.3284072249589491, "step": 300}, {"loss": 1.2425, "grad_norm": 0.39118197560310364, "learning_rate": 0.0002, "epoch": 0.3393541324575807, "step": 310}, {"loss": 1.1363, "grad_norm": 0.355010986328125, "learning_rate": 0.0002, "epoch": 0.35030103995621237, "step": 320}, {"loss": 1.1925, "grad_norm": 0.29734086990356445, "learning_rate": 0.0002, "epoch": 0.361247947454844, "step": 330}, {"loss": 1.1974, "grad_norm": 0.346096009016037, "learning_rate": 0.0002, "epoch": 0.37219485495347565, "step": 340}, {"loss": 1.1641, "grad_norm": 0.4829643666744232, "learning_rate": 0.0002, "epoch": 0.3831417624521073, "step": 350}, {"loss": 1.2808, "grad_norm": 0.4726872742176056, "learning_rate": 0.0002, "epoch": 0.39408866995073893, "step": 360}, {"loss": 1.1532, "grad_norm": 0.3130153715610504, "learning_rate": 0.0002, "epoch": 0.4050355774493706, "step": 370}, {"loss": 1.1842, "grad_norm": 0.5123590230941772, "learning_rate": 0.0002, "epoch": 0.41598248494800216, "step": 380}, {"loss": 1.1539, "grad_norm": 0.3444574773311615, "learning_rate": 0.0002, "epoch": 0.4269293924466338, "step": 390}, {"loss": 1.1756, "grad_norm": 0.5302175283432007, "learning_rate": 0.0002, "epoch": 0.43787629994526545, "step": 400}, {"loss": 1.1138, "grad_norm": 0.2713572680950165, "learning_rate": 0.0002, "epoch": 0.4488232074438971, "step": 410}, {"loss": 1.1281, "grad_norm": 0.29530611634254456, "learning_rate": 0.0002, "epoch": 0.45977011494252873, "step": 420}, {"loss": 1.1721, "grad_norm": 0.27282455563545227, "learning_rate": 0.0002, "epoch": 0.47071702244116037, "step": 430}, {"loss": 1.1213, "grad_norm": 0.2647949755191803, "learning_rate": 0.0002, "epoch": 0.481663929939792, "step": 440}, {"loss": 1.1656, "grad_norm": 0.35509347915649414, "learning_rate": 0.0002, "epoch": 0.49261083743842365, "step": 450}, {"loss": 1.1251, "grad_norm": 0.1959609091281891, "learning_rate": 0.0002, "epoch": 0.5035577449370553, "step": 460}, {"loss": 1.1889, "grad_norm": 0.40090155601501465, "learning_rate": 0.0002, "epoch": 0.5145046524356869, "step": 470}, {"loss": 1.156, "grad_norm": 0.3354604244232178, "learning_rate": 0.0002, "epoch": 0.5254515599343186, "step": 480}, {"loss": 1.1024, "grad_norm": 0.2758506238460541, "learning_rate": 0.0002, "epoch": 0.5363984674329502, "step": 490}, {"loss": 1.1108, "grad_norm": 0.3629051744937897, "learning_rate": 0.0002, "epoch": 0.5473453749315819, "step": 500}, {"loss": 1.2236, "grad_norm": 0.30802229046821594, "learning_rate": 0.0002, "epoch": 0.5582922824302134, "step": 510}, {"loss": 1.0424, "grad_norm": 0.3099463880062103, "learning_rate": 0.0002, "epoch": 0.5692391899288451, "step": 520}, {"loss": 1.255, "grad_norm": 0.42299067974090576, "learning_rate": 0.0002, "epoch": 0.5801860974274767, "step": 530}, {"loss": 1.1698, "grad_norm": 0.5392252802848816, "learning_rate": 0.0002, "epoch": 0.5911330049261084, "step": 540}, {"loss": 1.1171, "grad_norm": 0.34768250584602356, "learning_rate": 0.0002, "epoch": 0.60207991242474, "step": 550}, {"loss": 1.102, "grad_norm": 0.28490015864372253, "learning_rate": 0.0002, "epoch": 0.6130268199233716, "step": 560}, {"loss": 1.1111, "grad_norm": 0.34787994623184204, "learning_rate": 0.0002, "epoch": 0.6239737274220033, "step": 570}, {"loss": 1.0759, "grad_norm": 0.29058772325515747, "learning_rate": 0.0002, "epoch": 0.6349206349206349, "step": 580}, {"loss": 1.1157, "grad_norm": 0.4063778817653656, "learning_rate": 0.0002, "epoch": 0.6458675424192666, "step": 590}, {"loss": 1.1432, "grad_norm": 0.9244267344474792, "learning_rate": 0.0002, "epoch": 0.6568144499178982, "step": 600}, {"loss": 1.0591, "grad_norm": 0.27605190873146057, "learning_rate": 0.0002, "epoch": 0.6677613574165299, "step": 610}, {"loss": 1.2123, "grad_norm": 0.34346821904182434, "learning_rate": 0.0002, "epoch": 0.6787082649151615, "step": 620}, {"loss": 1.2195, "grad_norm": 0.3093279302120209, "learning_rate": 0.0002, "epoch": 0.6896551724137931, "step": 630}, {"loss": 1.2461, "grad_norm": 1.0069009065628052, "learning_rate": 0.0002, "epoch": 0.7006020799124247, "step": 640}, {"loss": 1.0892, "grad_norm": 0.5049130916595459, "learning_rate": 0.0002, "epoch": 0.7115489874110563, "step": 650}, {"loss": 1.1404, "grad_norm": 0.3748924732208252, "learning_rate": 0.0002, "epoch": 0.722495894909688, "step": 660}, {"loss": 1.1062, "grad_norm": 0.2964959144592285, "learning_rate": 0.0002, "epoch": 0.7334428024083196, "step": 670}, {"loss": 1.2617, "grad_norm": 0.4599986970424652, "learning_rate": 0.0002, "epoch": 0.7443897099069513, "step": 680}, {"loss": 1.088, "grad_norm": 0.27292951941490173, "learning_rate": 0.0002, "epoch": 0.7553366174055829, "step": 690}, {"loss": 1.2047, "grad_norm": 0.3123566806316376, "learning_rate": 0.0002, "epoch": 0.7662835249042146, "step": 700}, {"loss": 1.0021, "grad_norm": 0.28310710191726685, "learning_rate": 0.0002, "epoch": 0.7772304324028462, "step": 710}, {"loss": 1.2281, "grad_norm": 0.3279992341995239, "learning_rate": 0.0002, "epoch": 0.7881773399014779, "step": 720}, {"loss": 1.25, "grad_norm": 0.28179168701171875, "learning_rate": 0.0002, "epoch": 0.7991242474001095, "step": 730}, {"loss": 1.0602, "grad_norm": 0.31492987275123596, "learning_rate": 0.0002, "epoch": 0.8100711548987412, "step": 740}, {"loss": 1.2518, "grad_norm": 0.41821011900901794, "learning_rate": 0.0002, "epoch": 0.8210180623973727, "step": 750}, {"loss": 1.1612, "grad_norm": 0.325235515832901, "learning_rate": 0.0002, "epoch": 0.8319649698960043, "step": 760}, {"loss": 1.27, "grad_norm": 0.5366070866584778, "learning_rate": 0.0002, "epoch": 0.842911877394636, "step": 770}, {"loss": 1.0921, "grad_norm": 0.32570579648017883, "learning_rate": 0.0002, "epoch": 0.8538587848932676, "step": 780}, {"loss": 1.1032, "grad_norm": 0.3642968237400055, "learning_rate": 0.0002, "epoch": 0.8648056923918993, "step": 790}, {"loss": 1.1234, "grad_norm": 0.29713448882102966, "learning_rate": 0.0002, "epoch": 0.8757525998905309, "step": 800}, {"loss": 1.0978, "grad_norm": 0.23599444329738617, "learning_rate": 0.0002, "epoch": 0.8866995073891626, "step": 810}, {"loss": 1.1867, "grad_norm": 0.31522464752197266, "learning_rate": 0.0002, "epoch": 0.8976464148877942, "step": 820}, {"loss": 1.0208, "grad_norm": 0.32754790782928467, "learning_rate": 0.0002, "epoch": 0.9085933223864259, "step": 830}, {"loss": 0.9786, "grad_norm": 0.22741089761257172, "learning_rate": 0.0002, "epoch": 0.9195402298850575, "step": 840}, {"loss": 1.0689, "grad_norm": 0.3089679777622223, "learning_rate": 0.0002, "epoch": 0.9304871373836892, "step": 850}, {"loss": 1.0354, "grad_norm": 0.27440521121025085, "learning_rate": 0.0002, "epoch": 0.9414340448823207, "step": 860}, {"loss": 1.0417, "grad_norm": 0.3498363792896271, "learning_rate": 0.0002, "epoch": 0.9523809523809523, "step": 870}, {"loss": 1.269, "grad_norm": 0.47151854634284973, "learning_rate": 0.0002, "epoch": 0.963327859879584, "step": 880}, {"loss": 1.1174, "grad_norm": 0.24756591022014618, "learning_rate": 0.0002, "epoch": 0.9742747673782156, "step": 890}, {"loss": 1.0622, "grad_norm": 0.2600938677787781, "learning_rate": 0.0002, "epoch": 0.9852216748768473, "step": 900}, {"loss": 1.0968, "grad_norm": 0.2934586703777313, "learning_rate": 0.0002, "epoch": 0.9961685823754789, "step": 910}, {"eval_loss": 1.158464789390564, "eval_runtime": 46.0774, "eval_samples_per_second": 9.462, "eval_steps_per_second": 1.194, "epoch": 0.9994526546250684, "step": 913}, {"loss": 1.0322, "grad_norm": 0.37776654958724976, "learning_rate": 0.0002, "epoch": 1.0071154898741106, "step": 920}, {"loss": 1.1346, "grad_norm": 0.31784629821777344, "learning_rate": 0.0002, "epoch": 1.0180623973727423, "step": 930}, {"loss": 0.8976, "grad_norm": 0.24244336783885956, "learning_rate": 0.0002, "epoch": 1.0290093048713738, "step": 940}, {"loss": 1.0172, "grad_norm": 0.3185454308986664, "learning_rate": 0.0002, "epoch": 1.0399562123700055, "step": 950}, {"loss": 1.1645, "grad_norm": 0.3589441478252411, "learning_rate": 0.0002, "epoch": 1.0509031198686372, "step": 960}, {"loss": 1.0317, "grad_norm": 0.38593578338623047, "learning_rate": 0.0002, "epoch": 1.0618500273672686, "step": 970}, {"loss": 0.9913, "grad_norm": 0.39694955945014954, "learning_rate": 0.0002, "epoch": 1.0727969348659003, "step": 980}, {"loss": 1.0893, "grad_norm": 0.469817191362381, "learning_rate": 0.0002, "epoch": 1.083743842364532, "step": 990}, {"loss": 0.983, "grad_norm": 0.2634755074977875, "learning_rate": 0.0002, "epoch": 1.0946907498631637, "step": 1000}, {"loss": 1.0144, "grad_norm": 0.43189436197280884, "learning_rate": 0.0002, "epoch": 1.1056376573617952, "step": 1010}, {"loss": 0.9663, "grad_norm": 0.5559977889060974, "learning_rate": 0.0002, "epoch": 1.116584564860427, "step": 1020}, {"loss": 1.0481, "grad_norm": 0.32100191712379456, "learning_rate": 0.0002, "epoch": 1.1275314723590586, "step": 1030}, {"loss": 1.1012, "grad_norm": 0.40179768204689026, "learning_rate": 0.0002, "epoch": 1.1384783798576903, "step": 1040}, {"loss": 1.0029, "grad_norm": 0.3659493029117584, "learning_rate": 0.0002, "epoch": 1.1494252873563218, "step": 1050}, {"loss": 0.9597, "grad_norm": 0.701704204082489, "learning_rate": 0.0002, "epoch": 1.1603721948549535, "step": 1060}, {"loss": 1.0204, "grad_norm": 0.3650563359260559, "learning_rate": 0.0002, "epoch": 1.1713191023535852, "step": 1070}, {"loss": 0.907, "grad_norm": 0.3191976249217987, "learning_rate": 0.0002, "epoch": 1.1822660098522166, "step": 1080}, {"loss": 1.0648, "grad_norm": 0.3615441918373108, "learning_rate": 0.0002, "epoch": 1.1932129173508483, "step": 1090}, {"loss": 1.0067, "grad_norm": 0.39474231004714966, "learning_rate": 0.0002, "epoch": 1.20415982484948, "step": 1100}, {"loss": 0.9852, "grad_norm": 0.3752822279930115, "learning_rate": 0.0002, "epoch": 1.2151067323481117, "step": 1110}, {"loss": 1.0067, "grad_norm": 0.4165991246700287, "learning_rate": 0.0002, "epoch": 1.2260536398467432, "step": 1120}, {"loss": 1.0244, "grad_norm": 0.5326506495475769, "learning_rate": 0.0002, "epoch": 1.237000547345375, "step": 1130}, {"loss": 1.0542, "grad_norm": 0.48845794796943665, "learning_rate": 0.0002, "epoch": 1.2479474548440066, "step": 1140}, {"loss": 1.0885, "grad_norm": 0.29910150170326233, "learning_rate": 0.0002, "epoch": 1.2588943623426383, "step": 1150}, {"loss": 1.2233, "grad_norm": 0.5069725513458252, "learning_rate": 0.0002, "epoch": 1.2698412698412698, "step": 1160}, {"loss": 1.0992, "grad_norm": 0.29500406980514526, "learning_rate": 0.0002, "epoch": 1.2807881773399015, "step": 1170}, {"loss": 1.0291, "grad_norm": 0.4711538851261139, "learning_rate": 0.0002, "epoch": 1.2917350848385332, "step": 1180}, {"loss": 1.0513, "grad_norm": 0.4203340709209442, "learning_rate": 0.0002, "epoch": 1.3026819923371646, "step": 1190}, {"loss": 1.1274, "grad_norm": 0.36101874709129333, "learning_rate": 0.0002, "epoch": 1.3136288998357963, "step": 1200}, {"loss": 1.085, "grad_norm": 0.4608800411224365, "learning_rate": 0.0002, "epoch": 1.324575807334428, "step": 1210}, {"loss": 1.0695, "grad_norm": 0.6570906639099121, "learning_rate": 0.0002, "epoch": 1.3355227148330597, "step": 1220}, {"loss": 0.9786, "grad_norm": 0.5352164506912231, "learning_rate": 0.0002, "epoch": 1.3464696223316914, "step": 1230}, {"loss": 1.0513, "grad_norm": 0.3885001242160797, "learning_rate": 0.0002, "epoch": 1.357416529830323, "step": 1240}, {"loss": 1.0611, "grad_norm": 0.2987913489341736, "learning_rate": 0.0002, "epoch": 1.3683634373289546, "step": 1250}, {"loss": 1.109, "grad_norm": 0.42070427536964417, "learning_rate": 0.0002, "epoch": 1.3793103448275863, "step": 1260}, {"loss": 1.1092, "grad_norm": 0.5957782864570618, "learning_rate": 0.0002, "epoch": 1.3902572523262178, "step": 1270}, {"loss": 1.0554, "grad_norm": 0.32898882031440735, "learning_rate": 0.0002, "epoch": 1.4012041598248495, "step": 1280}, {"loss": 0.909, "grad_norm": 0.27624452114105225, "learning_rate": 0.0002, "epoch": 1.4121510673234812, "step": 1290}, {"loss": 1.024, "grad_norm": 0.49570828676223755, "learning_rate": 0.0002, "epoch": 1.4230979748221126, "step": 1300}, {"loss": 1.0471, "grad_norm": 0.26191383600234985, "learning_rate": 0.0002, "epoch": 1.4340448823207443, "step": 1310}, {"loss": 1.0788, "grad_norm": 0.35664042830467224, "learning_rate": 0.0002, "epoch": 1.444991789819376, "step": 1320}, {"loss": 1.0108, "grad_norm": 0.45126354694366455, "learning_rate": 0.0002, "epoch": 1.4559386973180077, "step": 1330}, {"loss": 0.9571, "grad_norm": 0.37318357825279236, "learning_rate": 0.0002, "epoch": 1.4668856048166394, "step": 1340}, {"loss": 1.0507, "grad_norm": 0.6428970098495483, "learning_rate": 0.0002, "epoch": 1.477832512315271, "step": 1350}, {"loss": 1.0367, "grad_norm": 0.43256187438964844, "learning_rate": 0.0002, "epoch": 1.4887794198139026, "step": 1360}, {"loss": 1.1321, "grad_norm": 0.5343793630599976, "learning_rate": 0.0002, "epoch": 1.4997263273125343, "step": 1370}, {"loss": 1.1054, "grad_norm": 0.315437376499176, "learning_rate": 0.0002, "epoch": 1.5106732348111658, "step": 1380}, {"loss": 0.8916, "grad_norm": 0.41561153531074524, "learning_rate": 0.0002, "epoch": 1.5216201423097975, "step": 1390}, {"loss": 1.0391, "grad_norm": 0.3201070725917816, "learning_rate": 0.0002, "epoch": 1.5325670498084292, "step": 1400}, {"loss": 1.14, "grad_norm": 0.505537211894989, "learning_rate": 0.0002, "epoch": 1.5435139573070606, "step": 1410}, {"loss": 1.0775, "grad_norm": 0.3747410178184509, "learning_rate": 0.0002, "epoch": 1.5544608648056923, "step": 1420}, {"loss": 1.1171, "grad_norm": 0.49385908246040344, "learning_rate": 0.0002, "epoch": 1.565407772304324, "step": 1430}, {"loss": 1.1182, "grad_norm": 0.49831628799438477, "learning_rate": 0.0002, "epoch": 1.5763546798029555, "step": 1440}, {"loss": 1.0079, "grad_norm": 0.372127890586853, "learning_rate": 0.0002, "epoch": 1.5873015873015874, "step": 1450}, {"loss": 0.9931, "grad_norm": 0.40070840716362, "learning_rate": 0.0002, "epoch": 1.598248494800219, "step": 1460}, {"loss": 0.8954, "grad_norm": 0.34907400608062744, "learning_rate": 0.0002, "epoch": 1.6091954022988506, "step": 1470}, {"loss": 0.9743, "grad_norm": 0.4632418751716614, "learning_rate": 0.0002, "epoch": 1.6201423097974823, "step": 1480}, {"loss": 1.0103, "grad_norm": 0.40164515376091003, "learning_rate": 0.0002, "epoch": 1.6310892172961138, "step": 1490}, {"loss": 0.9523, "grad_norm": 0.3214994966983795, "learning_rate": 0.0002, "epoch": 1.6420361247947455, "step": 1500}, {"loss": 1.0161, "grad_norm": 0.3727897107601166, "learning_rate": 0.0002, "epoch": 1.6529830322933772, "step": 1510}, {"loss": 1.0443, "grad_norm": 0.3817640542984009, "learning_rate": 0.0002, "epoch": 1.6639299397920086, "step": 1520}, {"loss": 1.0511, "grad_norm": 0.5592136979103088, "learning_rate": 0.0002, "epoch": 1.6748768472906403, "step": 1530}, {"loss": 0.9682, "grad_norm": 0.44636598229408264, "learning_rate": 0.0002, "epoch": 1.685823754789272, "step": 1540}, {"loss": 1.033, "grad_norm": 0.40441709756851196, "learning_rate": 0.0002, "epoch": 1.6967706622879035, "step": 1550}, {"loss": 1.0857, "grad_norm": 0.3243522644042969, "learning_rate": 0.0002, "epoch": 1.7077175697865354, "step": 1560}, {"loss": 0.9482, "grad_norm": 0.34277570247650146, "learning_rate": 0.0002, "epoch": 1.718664477285167, "step": 1570}, {"loss": 0.974, "grad_norm": 0.3279995024204254, "learning_rate": 0.0002, "epoch": 1.7296113847837986, "step": 1580}, {"loss": 0.9414, "grad_norm": 0.41968777775764465, "learning_rate": 0.0002, "epoch": 1.7405582922824303, "step": 1590}, {"loss": 0.9768, "grad_norm": 0.39464613795280457, "learning_rate": 0.0002, "epoch": 1.7515051997810618, "step": 1600}, {"loss": 1.0347, "grad_norm": 0.3839009404182434, "learning_rate": 0.0002, "epoch": 1.7624521072796935, "step": 1610}, {"loss": 0.9195, "grad_norm": 0.3250715434551239, "learning_rate": 0.0002, "epoch": 1.7733990147783252, "step": 1620}, {"loss": 1.0049, "grad_norm": 0.5166561007499695, "learning_rate": 0.0002, "epoch": 1.7843459222769567, "step": 1630}, {"loss": 1.0364, "grad_norm": 0.4115183353424072, "learning_rate": 0.0002, "epoch": 1.7952928297755884, "step": 1640}, {"loss": 1.0248, "grad_norm": 0.373780220746994, "learning_rate": 0.0002, "epoch": 1.80623973727422, "step": 1650}, {"loss": 1.0984, "grad_norm": 0.49697014689445496, "learning_rate": 0.0002, "epoch": 1.8171866447728515, "step": 1660}, {"loss": 1.0089, "grad_norm": 1.0308938026428223, "learning_rate": 0.0002, "epoch": 1.8281335522714834, "step": 1670}, {"loss": 1.0853, "grad_norm": 0.4851366877555847, "learning_rate": 0.0002, "epoch": 1.839080459770115, "step": 1680}, {"loss": 0.9533, "grad_norm": 0.3262481391429901, "learning_rate": 0.0002, "epoch": 1.8500273672687466, "step": 1690}, {"loss": 1.048, "grad_norm": 0.6904496550559998, "learning_rate": 0.0002, "epoch": 1.8609742747673783, "step": 1700}, {"loss": 1.0577, "grad_norm": 0.49789851903915405, "learning_rate": 0.0002, "epoch": 1.8719211822660098, "step": 1710}, {"loss": 1.0258, "grad_norm": 0.3035794198513031, "learning_rate": 0.0002, "epoch": 1.8828680897646415, "step": 1720}, {"loss": 0.9916, "grad_norm": 0.4588414430618286, "learning_rate": 0.0002, "epoch": 1.8938149972632732, "step": 1730}, {"loss": 0.9526, "grad_norm": 0.4313034117221832, "learning_rate": 0.0002, "epoch": 1.9047619047619047, "step": 1740}, {"loss": 1.0857, "grad_norm": 0.38562044501304626, "learning_rate": 0.0002, "epoch": 1.9157088122605364, "step": 1750}, {"loss": 0.9763, "grad_norm": 0.46947410702705383, "learning_rate": 0.0002, "epoch": 1.926655719759168, "step": 1760}, {"loss": 0.9967, "grad_norm": 0.3848404884338379, "learning_rate": 0.0002, "epoch": 1.9376026272577995, "step": 1770}, {"loss": 1.0474, "grad_norm": 0.30422744154930115, "learning_rate": 0.0002, "epoch": 1.9485495347564314, "step": 1780}, {"loss": 1.022, "grad_norm": 0.41100990772247314, "learning_rate": 0.0002, "epoch": 1.959496442255063, "step": 1790}, {"loss": 1.003, "grad_norm": 0.3492335081100464, "learning_rate": 0.0002, "epoch": 1.9704433497536946, "step": 1800}, {"loss": 0.9891, "grad_norm": 0.364577978849411, "learning_rate": 0.0002, "epoch": 1.9813902572523263, "step": 1810}, {"loss": 1.0699, "grad_norm": 0.4312075674533844, "learning_rate": 0.0002, "epoch": 1.9923371647509578, "step": 1820}, {"eval_loss": 1.14472496509552, "eval_runtime": 46.0786, "eval_samples_per_second": 9.462, "eval_steps_per_second": 1.194, "epoch": 2.0, "step": 1827}, {"loss": 1.0218, "grad_norm": 0.5989689230918884, "learning_rate": 0.0002, "epoch": 2.0032840722495897, "step": 1830}, {"loss": 0.9384, "grad_norm": 0.49720922112464905, "learning_rate": 0.0002, "epoch": 2.014230979748221, "step": 1840}, {"loss": 0.8482, "grad_norm": 0.42675456404685974, "learning_rate": 0.0002, "epoch": 2.0251778872468527, "step": 1850}, {"loss": 0.8487, "grad_norm": 0.4637208580970764, "learning_rate": 0.0002, "epoch": 2.0361247947454846, "step": 1860}, {"loss": 0.8212, "grad_norm": 0.8329976797103882, "learning_rate": 0.0002, "epoch": 2.047071702244116, "step": 1870}, {"loss": 0.9547, "grad_norm": 0.7869427800178528, "learning_rate": 0.0002, "epoch": 2.0580186097427475, "step": 1880}, {"loss": 0.9351, "grad_norm": 0.4927455186843872, "learning_rate": 0.0002, "epoch": 2.0689655172413794, "step": 1890}, {"loss": 0.8652, "grad_norm": 0.6264246702194214, "learning_rate": 0.0002, "epoch": 2.079912424740011, "step": 1900}, {"loss": 0.9095, "grad_norm": 1.1164122819900513, "learning_rate": 0.0002, "epoch": 2.0908593322386424, "step": 1910}, {"loss": 0.7823, "grad_norm": 0.5283981561660767, "learning_rate": 0.0002, "epoch": 2.1018062397372743, "step": 1920}, {"loss": 0.8065, "grad_norm": 0.45621731877326965, "learning_rate": 0.0002, "epoch": 2.112753147235906, "step": 1930}, {"loss": 0.9184, "grad_norm": 1.381791591644287, "learning_rate": 0.0002, "epoch": 2.1237000547345373, "step": 1940}, {"loss": 0.9006, "grad_norm": 0.5151259899139404, "learning_rate": 0.0002, "epoch": 2.134646962233169, "step": 1950}, {"loss": 0.8436, "grad_norm": 0.9806339740753174, "learning_rate": 0.0002, "epoch": 2.1455938697318007, "step": 1960}, {"loss": 0.8749, "grad_norm": 0.4734154939651489, "learning_rate": 0.0002, "epoch": 2.1565407772304326, "step": 1970}, {"loss": 0.9172, "grad_norm": 0.9553168416023254, "learning_rate": 0.0002, "epoch": 2.167487684729064, "step": 1980}, {"loss": 0.8047, "grad_norm": 0.5895838141441345, "learning_rate": 0.0002, "epoch": 2.1784345922276955, "step": 1990}, {"loss": 0.7841, "grad_norm": 0.4488855302333832, "learning_rate": 0.0002, "epoch": 2.1893814997263275, "step": 2000}, {"loss": 0.8205, "grad_norm": 1.0760235786437988, "learning_rate": 0.0002, "epoch": 2.200328407224959, "step": 2010}, {"loss": 0.7923, "grad_norm": 0.5038785338401794, "learning_rate": 0.0002, "epoch": 2.2112753147235904, "step": 2020}, {"loss": 0.8973, "grad_norm": 0.59819495677948, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 2030}, {"loss": 0.8009, "grad_norm": 0.5012075304985046, "learning_rate": 0.0002, "epoch": 2.233169129720854, "step": 2040}, {"loss": 0.9087, "grad_norm": 0.44978439807891846, "learning_rate": 0.0002, "epoch": 2.2441160372194853, "step": 2050}, {"loss": 0.823, "grad_norm": 0.5350462198257446, "learning_rate": 0.0002, "epoch": 2.255062944718117, "step": 2060}, {"loss": 0.8335, "grad_norm": 0.6020669937133789, "learning_rate": 0.0002, "epoch": 2.2660098522167487, "step": 2070}, {"loss": 0.8023, "grad_norm": 0.5246821045875549, "learning_rate": 0.0002, "epoch": 2.2769567597153806, "step": 2080}, {"loss": 0.8984, "grad_norm": 0.5711268782615662, "learning_rate": 0.0002, "epoch": 2.287903667214012, "step": 2090}, {"loss": 0.9093, "grad_norm": 0.617317259311676, "learning_rate": 0.0002, "epoch": 2.2988505747126435, "step": 2100}, {"loss": 0.8311, "grad_norm": 0.8608947396278381, "learning_rate": 0.0002, "epoch": 2.3097974822112755, "step": 2110}, {"loss": 0.7839, "grad_norm": 0.4739076793193817, "learning_rate": 0.0002, "epoch": 2.320744389709907, "step": 2120}, {"loss": 0.84, "grad_norm": 0.5538856983184814, "learning_rate": 0.0002, "epoch": 2.3316912972085384, "step": 2130}, {"loss": 0.8994, "grad_norm": 0.6064935326576233, "learning_rate": 0.0002, "epoch": 2.3426382047071703, "step": 2140}, {"loss": 0.7765, "grad_norm": 0.5019068121910095, "learning_rate": 0.0002, "epoch": 2.353585112205802, "step": 2150}, {"loss": 0.9576, "grad_norm": 0.45340514183044434, "learning_rate": 0.0002, "epoch": 2.3645320197044333, "step": 2160}, {"loss": 0.9254, "grad_norm": 0.7347203493118286, "learning_rate": 0.0002, "epoch": 2.375478927203065, "step": 2170}, {"loss": 0.8507, "grad_norm": 0.46922534704208374, "learning_rate": 0.0002, "epoch": 2.3864258347016967, "step": 2180}, {"loss": 0.8325, "grad_norm": 0.5507845878601074, "learning_rate": 0.0002, "epoch": 2.3973727422003286, "step": 2190}, {"loss": 0.8178, "grad_norm": 0.5621911883354187, "learning_rate": 0.0002, "epoch": 2.40831964969896, "step": 2200}, {"loss": 0.8569, "grad_norm": 0.5023514032363892, "learning_rate": 0.0002, "epoch": 2.4192665571975915, "step": 2210}, {"loss": 0.8355, "grad_norm": 0.6124861240386963, "learning_rate": 0.0002, "epoch": 2.4302134646962235, "step": 2220}, {"loss": 0.885, "grad_norm": 0.49614205956459045, "learning_rate": 0.0002, "epoch": 2.441160372194855, "step": 2230}, {"loss": 0.8008, "grad_norm": 0.6477900743484497, "learning_rate": 0.0002, "epoch": 2.4521072796934864, "step": 2240}, {"loss": 0.8622, "grad_norm": 0.5868843793869019, "learning_rate": 0.0002, "epoch": 2.4630541871921183, "step": 2250}, {"loss": 0.8498, "grad_norm": 0.4364610016345978, "learning_rate": 0.0002, "epoch": 2.47400109469075, "step": 2260}, {"loss": 0.8378, "grad_norm": 0.5792964696884155, "learning_rate": 0.0002, "epoch": 2.4849480021893813, "step": 2270}, {"loss": 0.8743, "grad_norm": 0.5421269536018372, "learning_rate": 0.0002, "epoch": 2.495894909688013, "step": 2280}, {"loss": 0.9637, "grad_norm": 0.5525493025779724, "learning_rate": 0.0002, "epoch": 2.5068418171866447, "step": 2290}, {"loss": 0.8075, "grad_norm": 0.6463850140571594, "learning_rate": 0.0002, "epoch": 2.5177887246852766, "step": 2300}, {"loss": 0.8591, "grad_norm": 0.6861311793327332, "learning_rate": 0.0002, "epoch": 2.528735632183908, "step": 2310}, {"loss": 0.9287, "grad_norm": 0.5563555359840393, "learning_rate": 0.0002, "epoch": 2.5396825396825395, "step": 2320}, {"loss": 0.945, "grad_norm": 0.5721169114112854, "learning_rate": 0.0002, "epoch": 2.5506294471811715, "step": 2330}, {"loss": 0.8271, "grad_norm": 0.5258274674415588, "learning_rate": 0.0002, "epoch": 2.561576354679803, "step": 2340}, {"loss": 0.8515, "grad_norm": 0.7057380676269531, "learning_rate": 0.0002, "epoch": 2.572523262178435, "step": 2350}, {"loss": 0.8615, "grad_norm": 0.6869027614593506, "learning_rate": 0.0002, "epoch": 2.5834701696770663, "step": 2360}, {"loss": 0.8043, "grad_norm": 0.4960809648036957, "learning_rate": 0.0002, "epoch": 2.594417077175698, "step": 2370}, {"loss": 0.8476, "grad_norm": 0.9288380146026611, "learning_rate": 0.0002, "epoch": 2.6053639846743293, "step": 2380}, {"loss": 0.873, "grad_norm": 0.3765334188938141, "learning_rate": 0.0002, "epoch": 2.616310892172961, "step": 2390}, {"loss": 0.8764, "grad_norm": 0.7487865686416626, "learning_rate": 0.0002, "epoch": 2.6272577996715927, "step": 2400}, {"loss": 0.7577, "grad_norm": 0.6141156554222107, "learning_rate": 0.0002, "epoch": 2.6382047071702246, "step": 2410}, {"loss": 0.8534, "grad_norm": 0.8420507907867432, "learning_rate": 0.0002, "epoch": 2.649151614668856, "step": 2420}, {"loss": 0.8311, "grad_norm": 0.53386390209198, "learning_rate": 0.0002, "epoch": 2.6600985221674875, "step": 2430}, {"loss": 0.8486, "grad_norm": 0.5520607233047485, "learning_rate": 0.0002, "epoch": 2.6710454296661195, "step": 2440}, {"loss": 0.8686, "grad_norm": 0.5337599515914917, "learning_rate": 0.0002, "epoch": 2.681992337164751, "step": 2450}, {"loss": 0.792, "grad_norm": 0.48790836334228516, "learning_rate": 0.0002, "epoch": 2.692939244663383, "step": 2460}, {"loss": 0.7864, "grad_norm": 0.8287786245346069, "learning_rate": 0.0002, "epoch": 2.7038861521620143, "step": 2470}, {"loss": 0.8244, "grad_norm": 0.5876168608665466, "learning_rate": 0.0002, "epoch": 2.714833059660646, "step": 2480}, {"loss": 0.8773, "grad_norm": 0.5206760764122009, "learning_rate": 0.0002, "epoch": 2.7257799671592773, "step": 2490}, {"loss": 0.8097, "grad_norm": 0.5619136691093445, "learning_rate": 0.0002, "epoch": 2.736726874657909, "step": 2500}, {"loss": 0.8377, "grad_norm": 0.5614883899688721, "learning_rate": 0.0002, "epoch": 2.7476737821565407, "step": 2510}, {"loss": 0.8817, "grad_norm": 0.6157700419425964, "learning_rate": 0.0002, "epoch": 2.7586206896551726, "step": 2520}, {"loss": 0.846, "grad_norm": 0.5529953837394714, "learning_rate": 0.0002, "epoch": 2.769567597153804, "step": 2530}, {"loss": 0.8881, "grad_norm": 0.6731224060058594, "learning_rate": 0.0002, "epoch": 2.7805145046524355, "step": 2540}, {"loss": 0.9125, "grad_norm": 0.6960386633872986, "learning_rate": 0.0002, "epoch": 2.7914614121510675, "step": 2550}, {"loss": 0.7823, "grad_norm": 0.5203493237495422, "learning_rate": 0.0002, "epoch": 2.802408319649699, "step": 2560}, {"loss": 0.7951, "grad_norm": 1.036837100982666, "learning_rate": 0.0002, "epoch": 2.813355227148331, "step": 2570}, {"loss": 0.8427, "grad_norm": 0.6125805377960205, "learning_rate": 0.0002, "epoch": 2.8243021346469623, "step": 2580}, {"loss": 0.8939, "grad_norm": 0.6298092603683472, "learning_rate": 0.0002, "epoch": 2.835249042145594, "step": 2590}, {"loss": 0.8845, "grad_norm": 0.5882203578948975, "learning_rate": 0.0002, "epoch": 2.8461959496442253, "step": 2600}, {"loss": 0.8921, "grad_norm": 0.8619399666786194, "learning_rate": 0.0002, "epoch": 2.857142857142857, "step": 2610}, {"loss": 0.8213, "grad_norm": 0.4722687304019928, "learning_rate": 0.0002, "epoch": 2.8680897646414887, "step": 2620}, {"loss": 0.9147, "grad_norm": 0.47399574518203735, "learning_rate": 0.0002, "epoch": 2.8790366721401206, "step": 2630}, {"loss": 0.8325, "grad_norm": 0.5639172792434692, "learning_rate": 0.0002, "epoch": 2.889983579638752, "step": 2640}, {"loss": 0.8653, "grad_norm": 0.4676816761493683, "learning_rate": 0.0002, "epoch": 2.9009304871373836, "step": 2650}, {"loss": 0.8966, "grad_norm": 0.6906291246414185, "learning_rate": 0.0002, "epoch": 2.9118773946360155, "step": 2660}, {"loss": 0.8966, "grad_norm": 0.4369746148586273, "learning_rate": 0.0002, "epoch": 2.922824302134647, "step": 2670}, {"loss": 0.9173, "grad_norm": 0.46423083543777466, "learning_rate": 0.0002, "epoch": 2.933771209633279, "step": 2680}, {"loss": 0.7961, "grad_norm": 0.5700525045394897, "learning_rate": 0.0002, "epoch": 2.9447181171319103, "step": 2690}, {"loss": 0.8172, "grad_norm": 0.6221476793289185, "learning_rate": 0.0002, "epoch": 2.955665024630542, "step": 2700}, {"loss": 0.8538, "grad_norm": 0.6102682948112488, "learning_rate": 0.0002, "epoch": 2.9666119321291733, "step": 2710}, {"loss": 0.9779, "grad_norm": 0.5317878723144531, "learning_rate": 0.0002, "epoch": 2.977558839627805, "step": 2720}, {"loss": 0.9314, "grad_norm": 0.4438510835170746, "learning_rate": 0.0002, "epoch": 2.9885057471264367, "step": 2730}, {"loss": 0.8553, "grad_norm": 0.5022130012512207, "learning_rate": 0.0002, "epoch": 2.9994526546250686, "step": 2740}, {"eval_loss": 1.1722838878631592, "eval_runtime": 46.0829, "eval_samples_per_second": 9.461, "eval_steps_per_second": 1.194, "epoch": 2.9994526546250686, "step": 2740}, {"loss": 0.6443, "grad_norm": 0.6384502053260803, "learning_rate": 0.0002, "epoch": 3.0103995621237, "step": 2750}, {"loss": 0.7123, "grad_norm": 0.9928722381591797, "learning_rate": 0.0002, "epoch": 3.0213464696223316, "step": 2760}, {"loss": 0.6045, "grad_norm": 0.7813051342964172, "learning_rate": 0.0002, "epoch": 3.0322933771209635, "step": 2770}, {"loss": 0.6042, "grad_norm": 1.0202556848526, "learning_rate": 0.0002, "epoch": 3.043240284619595, "step": 2780}, {"loss": 0.6356, "grad_norm": 0.7581062316894531, "learning_rate": 0.0002, "epoch": 3.0541871921182264, "step": 2790}, {"loss": 0.6349, "grad_norm": 0.6252710223197937, "learning_rate": 0.0002, "epoch": 3.0651340996168583, "step": 2800}, {"loss": 0.645, "grad_norm": 0.7738662958145142, "learning_rate": 0.0002, "epoch": 3.07608100711549, "step": 2810}, {"loss": 0.627, "grad_norm": 0.7381885051727295, "learning_rate": 0.0002, "epoch": 3.0870279146141213, "step": 2820}, {"loss": 0.6371, "grad_norm": 0.9197564721107483, "learning_rate": 0.0002, "epoch": 3.097974822112753, "step": 2830}, {"loss": 0.723, "grad_norm": 1.000976800918579, "learning_rate": 0.0002, "epoch": 3.1089217296113847, "step": 2840}, {"loss": 0.6631, "grad_norm": 0.7559131383895874, "learning_rate": 0.0002, "epoch": 3.1198686371100166, "step": 2850}, {"loss": 0.6252, "grad_norm": 0.7213780879974365, "learning_rate": 0.0002, "epoch": 3.130815544608648, "step": 2860}, {"loss": 0.6501, "grad_norm": 0.945939838886261, "learning_rate": 0.0002, "epoch": 3.1417624521072796, "step": 2870}, {"loss": 0.6129, "grad_norm": 0.7277454137802124, "learning_rate": 0.0002, "epoch": 3.1527093596059115, "step": 2880}, {"loss": 0.6423, "grad_norm": 0.762026846408844, "learning_rate": 0.0002, "epoch": 3.163656267104543, "step": 2890}, {"loss": 0.5332, "grad_norm": 0.6471221446990967, "learning_rate": 0.0002, "epoch": 3.1746031746031744, "step": 2900}, {"loss": 0.7981, "grad_norm": 0.6018978357315063, "learning_rate": 0.0002, "epoch": 3.1855500821018063, "step": 2910}, {"loss": 0.7274, "grad_norm": 0.8607320785522461, "learning_rate": 0.0002, "epoch": 3.196496989600438, "step": 2920}, {"loss": 0.6139, "grad_norm": 0.8854126334190369, "learning_rate": 0.0002, "epoch": 3.2074438970990693, "step": 2930}, {"loss": 0.6485, "grad_norm": 0.6620870232582092, "learning_rate": 0.0002, "epoch": 3.218390804597701, "step": 2940}, {"loss": 0.6969, "grad_norm": 0.7377511858940125, "learning_rate": 0.0002, "epoch": 3.2293377120963327, "step": 2950}, {"loss": 0.6798, "grad_norm": 0.7803301811218262, "learning_rate": 0.0002, "epoch": 3.2402846195949646, "step": 2960}, {"loss": 0.6697, "grad_norm": 0.834061861038208, "learning_rate": 0.0002, "epoch": 3.251231527093596, "step": 2970}, {"loss": 0.6894, "grad_norm": 0.8496041893959045, "learning_rate": 0.0002, "epoch": 3.2621784345922276, "step": 2980}, {"loss": 0.6591, "grad_norm": 0.7967984676361084, "learning_rate": 0.0002, "epoch": 3.2731253420908595, "step": 2990}, {"loss": 0.7266, "grad_norm": 1.0207016468048096, "learning_rate": 0.0002, "epoch": 3.284072249589491, "step": 3000}, {"loss": 0.6586, "grad_norm": 0.7855771780014038, "learning_rate": 0.0002, "epoch": 3.2950191570881224, "step": 3010}, {"loss": 0.5711, "grad_norm": 0.9427546858787537, "learning_rate": 0.0002, "epoch": 3.3059660645867543, "step": 3020}, {"loss": 0.6277, "grad_norm": 0.823542594909668, "learning_rate": 0.0002, "epoch": 3.316912972085386, "step": 3030}, {"loss": 0.7109, "grad_norm": 0.9826635122299194, "learning_rate": 0.0002, "epoch": 3.3278598795840173, "step": 3040}, {"loss": 0.6564, "grad_norm": 0.7259827852249146, "learning_rate": 0.0002, "epoch": 3.338806787082649, "step": 3050}, {"loss": 0.653, "grad_norm": 0.7774739861488342, "learning_rate": 0.0002, "epoch": 3.3497536945812807, "step": 3060}, {"loss": 0.7529, "grad_norm": 0.7394293546676636, "learning_rate": 0.0002, "epoch": 3.3607006020799126, "step": 3070}, {"loss": 0.5987, "grad_norm": 0.9017578959465027, "learning_rate": 0.0002, "epoch": 3.371647509578544, "step": 3080}, {"loss": 0.6953, "grad_norm": 0.7451054453849792, "learning_rate": 0.0002, "epoch": 3.3825944170771756, "step": 3090}, {"loss": 0.6759, "grad_norm": 0.7321506142616272, "learning_rate": 0.0002, "epoch": 3.3935413245758075, "step": 3100}, {"loss": 0.6555, "grad_norm": 0.6721828579902649, "learning_rate": 0.0002, "epoch": 3.404488232074439, "step": 3110}, {"loss": 0.6559, "grad_norm": 0.774022102355957, "learning_rate": 0.0002, "epoch": 3.4154351395730704, "step": 3120}, {"loss": 0.7449, "grad_norm": 0.9143537282943726, "learning_rate": 0.0002, "epoch": 3.4263820470717024, "step": 3130}, {"loss": 0.6899, "grad_norm": 1.226087212562561, "learning_rate": 0.0002, "epoch": 3.437328954570334, "step": 3140}, {"loss": 0.6719, "grad_norm": 0.7545496225357056, "learning_rate": 0.0002, "epoch": 3.4482758620689653, "step": 3150}, {"loss": 0.6153, "grad_norm": 0.6515635848045349, "learning_rate": 0.0002, "epoch": 3.4592227695675972, "step": 3160}, {"loss": 0.6926, "grad_norm": 0.9297090172767639, "learning_rate": 0.0002, "epoch": 3.4701696770662287, "step": 3170}, {"loss": 0.6071, "grad_norm": 1.0130730867385864, "learning_rate": 0.0002, "epoch": 3.4811165845648606, "step": 3180}, {"loss": 0.5959, "grad_norm": 0.7654589414596558, "learning_rate": 0.0002, "epoch": 3.492063492063492, "step": 3190}, {"loss": 0.7401, "grad_norm": 0.9954977631568909, "learning_rate": 0.0002, "epoch": 3.5030103995621236, "step": 3200}, {"loss": 0.6661, "grad_norm": 0.6027487516403198, "learning_rate": 0.0002, "epoch": 3.5139573070607555, "step": 3210}, {"loss": 0.6963, "grad_norm": 0.741770327091217, "learning_rate": 0.0002, "epoch": 3.524904214559387, "step": 3220}, {"loss": 0.8112, "grad_norm": 1.0534909963607788, "learning_rate": 0.0002, "epoch": 3.535851122058019, "step": 3230}, {"loss": 0.6813, "grad_norm": 0.937772274017334, "learning_rate": 0.0002, "epoch": 3.5467980295566504, "step": 3240}, {"loss": 0.6681, "grad_norm": 0.8504213690757751, "learning_rate": 0.0002, "epoch": 3.557744937055282, "step": 3250}, {"loss": 0.6436, "grad_norm": 0.7755007147789001, "learning_rate": 0.0002, "epoch": 3.5686918445539133, "step": 3260}, {"loss": 0.6213, "grad_norm": 1.0193358659744263, "learning_rate": 0.0002, "epoch": 3.5796387520525452, "step": 3270}, {"loss": 0.671, "grad_norm": 0.8440536856651306, "learning_rate": 0.0002, "epoch": 3.5905856595511767, "step": 3280}, {"loss": 0.6859, "grad_norm": 0.6195939183235168, "learning_rate": 0.0002, "epoch": 3.6015325670498086, "step": 3290}, {"loss": 0.7446, "grad_norm": 0.8608590960502625, "learning_rate": 0.0002, "epoch": 3.61247947454844, "step": 3300}, {"loss": 0.7301, "grad_norm": 0.6772327423095703, "learning_rate": 0.0002, "epoch": 3.6234263820470716, "step": 3310}, {"loss": 0.6298, "grad_norm": 0.8031839728355408, "learning_rate": 0.0002, "epoch": 3.6343732895457035, "step": 3320}, {"loss": 0.7041, "grad_norm": 0.6080502271652222, "learning_rate": 0.0002, "epoch": 3.645320197044335, "step": 3330}, {"loss": 0.7431, "grad_norm": 0.8007240891456604, "learning_rate": 0.0002, "epoch": 3.656267104542967, "step": 3340}, {"loss": 0.7446, "grad_norm": 0.8060704469680786, "learning_rate": 0.0002, "epoch": 3.6672140120415984, "step": 3350}, {"loss": 0.6304, "grad_norm": 0.7547586560249329, "learning_rate": 0.0002, "epoch": 3.67816091954023, "step": 3360}, {"loss": 0.7066, "grad_norm": 0.686851978302002, "learning_rate": 0.0002, "epoch": 3.6891078270388613, "step": 3370}, {"loss": 0.6748, "grad_norm": 0.9429075717926025, "learning_rate": 0.0002, "epoch": 3.7000547345374932, "step": 3380}, {"loss": 0.6673, "grad_norm": 0.7283591032028198, "learning_rate": 0.0002, "epoch": 3.7110016420361247, "step": 3390}, {"loss": 0.7502, "grad_norm": 0.8323085904121399, "learning_rate": 0.0002, "epoch": 3.7219485495347566, "step": 3400}, {"loss": 0.7779, "grad_norm": 0.8529590964317322, "learning_rate": 0.0002, "epoch": 3.732895457033388, "step": 3410}, {"loss": 0.6555, "grad_norm": 0.731752872467041, "learning_rate": 0.0002, "epoch": 3.7438423645320196, "step": 3420}, {"loss": 0.6928, "grad_norm": 0.8572278618812561, "learning_rate": 0.0002, "epoch": 3.7547892720306515, "step": 3430}, {"loss": 0.6215, "grad_norm": 0.7408691048622131, "learning_rate": 0.0002, "epoch": 3.765736179529283, "step": 3440}, {"loss": 0.622, "grad_norm": 0.7470445036888123, "learning_rate": 0.0002, "epoch": 3.776683087027915, "step": 3450}, {"loss": 0.7241, "grad_norm": 0.6806244254112244, "learning_rate": 0.0002, "epoch": 3.7876299945265464, "step": 3460}, {"loss": 0.7739, "grad_norm": 0.9129069447517395, "learning_rate": 0.0002, "epoch": 3.798576902025178, "step": 3470}, {"loss": 0.6826, "grad_norm": 0.8717501759529114, "learning_rate": 0.0002, "epoch": 3.8095238095238093, "step": 3480}, {"loss": 0.6188, "grad_norm": 0.6761979460716248, "learning_rate": 0.0002, "epoch": 3.8204707170224412, "step": 3490}, {"loss": 0.7601, "grad_norm": 1.0054380893707275, "learning_rate": 0.0002, "epoch": 3.8314176245210727, "step": 3500}, {"loss": 0.623, "grad_norm": 1.1224009990692139, "learning_rate": 0.0002, "epoch": 3.8423645320197046, "step": 3510}, {"loss": 0.6918, "grad_norm": 0.8997692465782166, "learning_rate": 0.0002, "epoch": 3.853311439518336, "step": 3520}, {"loss": 0.6357, "grad_norm": 1.0086902379989624, "learning_rate": 0.0002, "epoch": 3.8642583470169676, "step": 3530}, {"loss": 0.6379, "grad_norm": 0.772739589214325, "learning_rate": 0.0002, "epoch": 3.8752052545155995, "step": 3540}, {"loss": 0.7423, "grad_norm": 1.211774230003357, "learning_rate": 0.0002, "epoch": 3.886152162014231, "step": 3550}, {"loss": 0.7321, "grad_norm": 0.9572356939315796, "learning_rate": 0.0002, "epoch": 3.897099069512863, "step": 3560}, {"loss": 0.6836, "grad_norm": 0.7887842655181885, "learning_rate": 0.0002, "epoch": 3.9080459770114944, "step": 3570}, {"loss": 0.7576, "grad_norm": 0.7308389544487, "learning_rate": 0.0002, "epoch": 3.918992884510126, "step": 3580}, {"loss": 0.6001, "grad_norm": 1.0182650089263916, "learning_rate": 0.0002, "epoch": 3.9299397920087573, "step": 3590}, {"loss": 0.6942, "grad_norm": 0.8000147342681885, "learning_rate": 0.0002, "epoch": 3.9408866995073892, "step": 3600}, {"loss": 0.6244, "grad_norm": 0.7385728359222412, "learning_rate": 0.0002, "epoch": 3.9518336070060207, "step": 3610}, {"loss": 0.6718, "grad_norm": 0.9233261942863464, "learning_rate": 0.0002, "epoch": 3.9627805145046526, "step": 3620}, {"loss": 0.6508, "grad_norm": 0.8486751914024353, "learning_rate": 0.0002, "epoch": 3.973727422003284, "step": 3630}, {"loss": 0.6928, "grad_norm": 0.7593663334846497, "learning_rate": 0.0002, "epoch": 3.9846743295019156, "step": 3640}, {"loss": 0.6847, "grad_norm": 0.7885415554046631, "learning_rate": 0.0002, "epoch": 3.9956212370005475, "step": 3650}, {"eval_loss": 1.250312328338623, "eval_runtime": 46.0842, "eval_samples_per_second": 9.461, "eval_steps_per_second": 1.193, "epoch": 4.0, "step": 3654}, {"loss": 0.5547, "grad_norm": 0.6591703295707703, "learning_rate": 0.0002, "epoch": 4.006568144499179, "step": 3660}, {"loss": 0.5301, "grad_norm": 1.36927330493927, "learning_rate": 0.0002, "epoch": 4.017515051997811, "step": 3670}, {"loss": 0.4466, "grad_norm": 0.8106328845024109, "learning_rate": 0.0002, "epoch": 4.028461959496442, "step": 3680}, {"loss": 0.4861, "grad_norm": 0.7592712044715881, "learning_rate": 0.0002, "epoch": 4.039408866995074, "step": 3690}, {"loss": 0.5103, "grad_norm": 0.9518909454345703, "learning_rate": 0.0002, "epoch": 4.050355774493705, "step": 3700}, {"loss": 0.4638, "grad_norm": 0.7805967330932617, "learning_rate": 0.0002, "epoch": 4.061302681992337, "step": 3710}, {"loss": 0.4556, "grad_norm": 1.3146334886550903, "learning_rate": 0.0002, "epoch": 4.072249589490969, "step": 3720}, {"loss": 0.5635, "grad_norm": 1.1611138582229614, "learning_rate": 0.0002, "epoch": 4.083196496989601, "step": 3730}, {"loss": 0.3845, "grad_norm": 0.8173232078552246, "learning_rate": 0.0002, "epoch": 4.094143404488232, "step": 3740}, {"loss": 0.4911, "grad_norm": 0.7848323583602905, "learning_rate": 0.0002, "epoch": 4.105090311986864, "step": 3750}, {"loss": 0.4519, "grad_norm": 1.3183201551437378, "learning_rate": 0.0002, "epoch": 4.116037219485495, "step": 3760}, {"loss": 0.5083, "grad_norm": 1.1936529874801636, "learning_rate": 0.0002, "epoch": 4.1269841269841265, "step": 3770}, {"loss": 0.5208, "grad_norm": 1.1078993082046509, "learning_rate": 0.0002, "epoch": 4.137931034482759, "step": 3780}, {"loss": 0.5928, "grad_norm": 1.107743263244629, "learning_rate": 0.0002, "epoch": 4.14887794198139, "step": 3790}, {"loss": 0.5112, "grad_norm": 0.7801875472068787, "learning_rate": 0.0002, "epoch": 4.159824849480022, "step": 3800}, {"loss": 0.4896, "grad_norm": 1.1328117847442627, "learning_rate": 0.0002, "epoch": 4.170771756978653, "step": 3810}, {"loss": 0.5645, "grad_norm": 1.4232193231582642, "learning_rate": 0.0002, "epoch": 4.181718664477285, "step": 3820}, {"loss": 0.5049, "grad_norm": 1.557416558265686, "learning_rate": 0.0002, "epoch": 4.192665571975917, "step": 3830}, {"loss": 0.4863, "grad_norm": 1.042923092842102, "learning_rate": 0.0002, "epoch": 4.203612479474549, "step": 3840}, {"loss": 0.3751, "grad_norm": 1.1801949739456177, "learning_rate": 0.0002, "epoch": 4.21455938697318, "step": 3850}, {"loss": 0.5063, "grad_norm": 0.9273753762245178, "learning_rate": 0.0002, "epoch": 4.225506294471812, "step": 3860}, {"loss": 0.5542, "grad_norm": 0.7681763768196106, "learning_rate": 0.0002, "epoch": 4.236453201970443, "step": 3870}, {"loss": 0.5971, "grad_norm": 0.9840841293334961, "learning_rate": 0.0002, "epoch": 4.2474001094690745, "step": 3880}, {"loss": 0.4648, "grad_norm": 1.0290725231170654, "learning_rate": 0.0002, "epoch": 4.258347016967707, "step": 3890}, {"loss": 0.4288, "grad_norm": 0.8059597611427307, "learning_rate": 0.0002, "epoch": 4.269293924466338, "step": 3900}, {"loss": 0.5103, "grad_norm": 0.9847467541694641, "learning_rate": 0.0002, "epoch": 4.28024083196497, "step": 3910}, {"loss": 0.4952, "grad_norm": 1.344044804573059, "learning_rate": 0.0002, "epoch": 4.291187739463601, "step": 3920}, {"loss": 0.4966, "grad_norm": 0.9174224138259888, "learning_rate": 0.0002, "epoch": 4.302134646962233, "step": 3930}, {"loss": 0.4944, "grad_norm": 1.1199711561203003, "learning_rate": 0.0002, "epoch": 4.313081554460865, "step": 3940}, {"loss": 0.4641, "grad_norm": 1.0120296478271484, "learning_rate": 0.0002, "epoch": 4.324028461959497, "step": 3950}, {"loss": 0.4723, "grad_norm": 1.091811180114746, "learning_rate": 0.0002, "epoch": 4.334975369458128, "step": 3960}, {"loss": 0.4627, "grad_norm": 1.0332133769989014, "learning_rate": 0.0002, "epoch": 4.34592227695676, "step": 3970}, {"loss": 0.4646, "grad_norm": 1.0785295963287354, "learning_rate": 0.0002, "epoch": 4.356869184455391, "step": 3980}, {"loss": 0.4909, "grad_norm": 1.0506969690322876, "learning_rate": 0.0002, "epoch": 4.3678160919540225, "step": 3990}, {"loss": 0.4776, "grad_norm": 1.047560691833496, "learning_rate": 0.0002, "epoch": 4.378762999452655, "step": 4000}, {"loss": 0.4549, "grad_norm": 0.9348800778388977, "learning_rate": 0.0002, "epoch": 4.389709906951286, "step": 4010}, {"loss": 0.5333, "grad_norm": 1.1563059091567993, "learning_rate": 0.0002, "epoch": 4.400656814449918, "step": 4020}, {"loss": 0.4952, "grad_norm": 1.001470923423767, "learning_rate": 0.0002, "epoch": 4.411603721948549, "step": 4030}, {"loss": 0.4972, "grad_norm": 1.309012532234192, "learning_rate": 0.0002, "epoch": 4.422550629447181, "step": 4040}, {"loss": 0.5078, "grad_norm": 0.7338925004005432, "learning_rate": 0.0002, "epoch": 4.433497536945813, "step": 4050}, {"loss": 0.4632, "grad_norm": 1.0398834943771362, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 4060}, {"loss": 0.6285, "grad_norm": 0.9728689193725586, "learning_rate": 0.0002, "epoch": 4.455391351943076, "step": 4070}, {"loss": 0.4778, "grad_norm": 1.247475028038025, "learning_rate": 0.0002, "epoch": 4.466338259441708, "step": 4080}, {"loss": 0.4813, "grad_norm": 1.1084578037261963, "learning_rate": 0.0002, "epoch": 4.477285166940339, "step": 4090}, {"loss": 0.5665, "grad_norm": 1.1619318723678589, "learning_rate": 0.0002, "epoch": 4.4882320744389705, "step": 4100}, {"loss": 0.5207, "grad_norm": 1.3456498384475708, "learning_rate": 0.0002, "epoch": 4.499178981937603, "step": 4110}, {"loss": 0.4876, "grad_norm": 0.9372991323471069, "learning_rate": 0.0002, "epoch": 4.510125889436234, "step": 4120}, {"loss": 0.5456, "grad_norm": 1.0071815252304077, "learning_rate": 0.0002, "epoch": 4.521072796934866, "step": 4130}, {"loss": 0.5589, "grad_norm": 1.190344214439392, "learning_rate": 0.0002, "epoch": 4.532019704433497, "step": 4140}, {"loss": 0.4852, "grad_norm": 0.9480887055397034, "learning_rate": 0.0002, "epoch": 4.542966611932129, "step": 4150}, {"loss": 0.5229, "grad_norm": 1.0252189636230469, "learning_rate": 0.0002, "epoch": 4.553913519430761, "step": 4160}, {"loss": 0.5253, "grad_norm": 0.7142013311386108, "learning_rate": 0.0002, "epoch": 4.564860426929393, "step": 4170}, {"loss": 0.4861, "grad_norm": 0.8937426805496216, "learning_rate": 0.0002, "epoch": 4.575807334428024, "step": 4180}, {"loss": 0.4773, "grad_norm": 0.8885005116462708, "learning_rate": 0.0002, "epoch": 4.586754241926656, "step": 4190}, {"loss": 0.4858, "grad_norm": 1.337663173675537, "learning_rate": 0.0002, "epoch": 4.597701149425287, "step": 4200}, {"loss": 0.5247, "grad_norm": 1.0475375652313232, "learning_rate": 0.0002, "epoch": 4.6086480569239185, "step": 4210}, {"loss": 0.5298, "grad_norm": 1.0081088542938232, "learning_rate": 0.0002, "epoch": 4.619594964422551, "step": 4220}, {"loss": 0.5042, "grad_norm": 0.7527595162391663, "learning_rate": 0.0002, "epoch": 4.630541871921182, "step": 4230}, {"loss": 0.5207, "grad_norm": 1.55559241771698, "learning_rate": 0.0002, "epoch": 4.641488779419814, "step": 4240}, {"loss": 0.5468, "grad_norm": 0.7967379689216614, "learning_rate": 0.0002, "epoch": 4.652435686918445, "step": 4250}, {"loss": 0.5328, "grad_norm": 0.898368775844574, "learning_rate": 0.0002, "epoch": 4.663382594417077, "step": 4260}, {"loss": 0.4706, "grad_norm": 1.1940776109695435, "learning_rate": 0.0002, "epoch": 4.674329501915709, "step": 4270}, {"loss": 0.5121, "grad_norm": 1.1817092895507812, "learning_rate": 0.0002, "epoch": 4.685276409414341, "step": 4280}, {"loss": 0.5758, "grad_norm": 0.9041520357131958, "learning_rate": 0.0002, "epoch": 4.696223316912972, "step": 4290}, {"loss": 0.5851, "grad_norm": 1.1280102729797363, "learning_rate": 0.0002, "epoch": 4.707170224411604, "step": 4300}, {"loss": 0.4891, "grad_norm": 1.357689619064331, "learning_rate": 0.0002, "epoch": 4.718117131910235, "step": 4310}, {"loss": 0.4704, "grad_norm": 1.056633472442627, "learning_rate": 0.0002, "epoch": 4.7290640394088665, "step": 4320}, {"loss": 0.5488, "grad_norm": 1.6520427465438843, "learning_rate": 0.0002, "epoch": 4.740010946907499, "step": 4330}, {"loss": 0.5131, "grad_norm": 1.153200626373291, "learning_rate": 0.0002, "epoch": 4.75095785440613, "step": 4340}, {"loss": 0.539, "grad_norm": 0.9346241354942322, "learning_rate": 0.0002, "epoch": 4.761904761904762, "step": 4350}, {"loss": 0.4941, "grad_norm": 0.8628455996513367, "learning_rate": 0.0002, "epoch": 4.772851669403393, "step": 4360}, {"loss": 0.5167, "grad_norm": 1.3843916654586792, "learning_rate": 0.0002, "epoch": 4.783798576902025, "step": 4370}, {"loss": 0.4683, "grad_norm": 1.035574197769165, "learning_rate": 0.0002, "epoch": 4.794745484400657, "step": 4380}, {"loss": 0.5162, "grad_norm": 1.1868361234664917, "learning_rate": 0.0002, "epoch": 4.805692391899289, "step": 4390}, {"loss": 0.534, "grad_norm": 1.1307647228240967, "learning_rate": 0.0002, "epoch": 4.81663929939792, "step": 4400}, {"loss": 0.5567, "grad_norm": 0.9787724614143372, "learning_rate": 0.0002, "epoch": 4.827586206896552, "step": 4410}, {"loss": 0.5185, "grad_norm": 1.0473824739456177, "learning_rate": 0.0002, "epoch": 4.838533114395183, "step": 4420}, {"loss": 0.6285, "grad_norm": 1.069069504737854, "learning_rate": 0.0002, "epoch": 4.8494800218938146, "step": 4430}, {"loss": 0.5267, "grad_norm": 1.4305680990219116, "learning_rate": 0.0002, "epoch": 4.860426929392447, "step": 4440}, {"loss": 0.5947, "grad_norm": 1.3679203987121582, "learning_rate": 0.0002, "epoch": 4.871373836891078, "step": 4450}, {"loss": 0.5135, "grad_norm": 0.8997844457626343, "learning_rate": 0.0002, "epoch": 4.88232074438971, "step": 4460}, {"loss": 0.5312, "grad_norm": 1.2758110761642456, "learning_rate": 0.0002, "epoch": 4.893267651888341, "step": 4470}, {"loss": 0.4914, "grad_norm": 0.8819465637207031, "learning_rate": 0.0002, "epoch": 4.904214559386973, "step": 4480}, {"loss": 0.5147, "grad_norm": 1.08329439163208, "learning_rate": 0.0002, "epoch": 4.915161466885605, "step": 4490}, {"loss": 0.5404, "grad_norm": 1.083461046218872, "learning_rate": 0.0002, "epoch": 4.926108374384237, "step": 4500}, {"loss": 0.5433, "grad_norm": 1.2387723922729492, "learning_rate": 0.0002, "epoch": 4.937055281882868, "step": 4510}, {"loss": 0.5624, "grad_norm": 0.8262293934822083, "learning_rate": 0.0002, "epoch": 4.9480021893815, "step": 4520}, {"loss": 0.504, "grad_norm": 1.2325191497802734, "learning_rate": 0.0002, "epoch": 4.958949096880131, "step": 4530}, {"loss": 0.5452, "grad_norm": 1.024614930152893, "learning_rate": 0.0002, "epoch": 4.9698960043787626, "step": 4540}, {"loss": 0.4752, "grad_norm": 1.3007521629333496, "learning_rate": 0.0002, "epoch": 4.980842911877395, "step": 4550}, {"loss": 0.4943, "grad_norm": 0.9823828339576721, "learning_rate": 0.0002, "epoch": 4.991789819376026, "step": 4560}, {"eval_loss": 1.3920727968215942, "eval_runtime": 46.0764, "eval_samples_per_second": 9.463, "eval_steps_per_second": 1.194, "epoch": 4.999452654625069, "step": 4567}, {"loss": 0.545, "grad_norm": 1.1478906869888306, "learning_rate": 0.0002, "epoch": 5.002736726874658, "step": 4570}, {"loss": 0.372, "grad_norm": 1.0533705949783325, "learning_rate": 0.0002, "epoch": 5.013683634373289, "step": 4580}, {"loss": 0.3313, "grad_norm": 1.268900752067566, "learning_rate": 0.0002, "epoch": 5.024630541871921, "step": 4590}, {"loss": 0.3482, "grad_norm": 1.222652554512024, "learning_rate": 0.0002, "epoch": 5.035577449370553, "step": 4600}, {"loss": 0.3195, "grad_norm": 1.5093127489089966, "learning_rate": 0.0002, "epoch": 5.046524356869185, "step": 4610}, {"loss": 0.3569, "grad_norm": 1.2372499704360962, "learning_rate": 0.0002, "epoch": 5.057471264367816, "step": 4620}, {"loss": 0.3206, "grad_norm": 0.8422666192054749, "learning_rate": 0.0002, "epoch": 5.068418171866448, "step": 4630}, {"loss": 0.3115, "grad_norm": 1.1451770067214966, "learning_rate": 0.0002, "epoch": 5.079365079365079, "step": 4640}, {"loss": 0.3305, "grad_norm": 1.2074557542800903, "learning_rate": 0.0002, "epoch": 5.090311986863711, "step": 4650}, {"loss": 0.3012, "grad_norm": 1.429150104522705, "learning_rate": 0.0002, "epoch": 5.101258894362343, "step": 4660}, {"loss": 0.3229, "grad_norm": 1.0353610515594482, "learning_rate": 0.0002, "epoch": 5.112205801860974, "step": 4670}, {"loss": 0.402, "grad_norm": 1.2845979928970337, "learning_rate": 0.0002, "epoch": 5.123152709359606, "step": 4680}, {"loss": 0.383, "grad_norm": 1.3790186643600464, "learning_rate": 0.0002, "epoch": 5.134099616858237, "step": 4690}, {"loss": 0.2951, "grad_norm": 1.3182239532470703, "learning_rate": 0.0002, "epoch": 5.145046524356869, "step": 4700}, {"loss": 0.4074, "grad_norm": 1.5249626636505127, "learning_rate": 0.0002, "epoch": 5.155993431855501, "step": 4710}, {"loss": 0.3703, "grad_norm": 1.2492733001708984, "learning_rate": 0.0002, "epoch": 5.166940339354133, "step": 4720}, {"loss": 0.3411, "grad_norm": 1.4455480575561523, "learning_rate": 0.0002, "epoch": 5.177887246852764, "step": 4730}, {"loss": 0.3996, "grad_norm": 1.2191482782363892, "learning_rate": 0.0002, "epoch": 5.188834154351396, "step": 4740}, {"loss": 0.3785, "grad_norm": 1.4707951545715332, "learning_rate": 0.0002, "epoch": 5.199781061850027, "step": 4750}, {"loss": 0.3516, "grad_norm": 1.3473678827285767, "learning_rate": 0.0002, "epoch": 5.210727969348659, "step": 4760}, {"loss": 0.3266, "grad_norm": 1.0479670763015747, "learning_rate": 0.0002, "epoch": 5.221674876847291, "step": 4770}, {"loss": 0.3976, "grad_norm": 1.299096703529358, "learning_rate": 0.0002, "epoch": 5.232621784345922, "step": 4780}, {"loss": 0.3266, "grad_norm": 1.2820168733596802, "learning_rate": 0.0002, "epoch": 5.243568691844554, "step": 4790}, {"loss": 0.3347, "grad_norm": 1.3818004131317139, "learning_rate": 0.0002, "epoch": 5.254515599343185, "step": 4800}, {"loss": 0.3761, "grad_norm": 1.2898736000061035, "learning_rate": 0.0002, "epoch": 5.265462506841817, "step": 4810}, {"loss": 0.3694, "grad_norm": 1.1761468648910522, "learning_rate": 0.0002, "epoch": 5.276409414340449, "step": 4820}, {"loss": 0.3806, "grad_norm": 1.7155952453613281, "learning_rate": 0.0002, "epoch": 5.287356321839081, "step": 4830}, {"loss": 0.322, "grad_norm": 0.9103642106056213, "learning_rate": 0.0002, "epoch": 5.298303229337712, "step": 4840}, {"loss": 0.3516, "grad_norm": 1.013015627861023, "learning_rate": 0.0002, "epoch": 5.309250136836344, "step": 4850}, {"loss": 0.4297, "grad_norm": 1.390471339225769, "learning_rate": 0.0002, "epoch": 5.320197044334975, "step": 4860}, {"loss": 0.4098, "grad_norm": 1.129770278930664, "learning_rate": 0.0002, "epoch": 5.331143951833607, "step": 4870}, {"loss": 0.4227, "grad_norm": 1.1461067199707031, "learning_rate": 0.0002, "epoch": 5.342090859332239, "step": 4880}, {"loss": 0.288, "grad_norm": 1.3587424755096436, "learning_rate": 0.0002, "epoch": 5.35303776683087, "step": 4890}, {"loss": 0.3604, "grad_norm": 1.6897879838943481, "learning_rate": 0.0002, "epoch": 5.363984674329502, "step": 4900}, {"loss": 0.3887, "grad_norm": 0.9298055768013, "learning_rate": 0.0002, "epoch": 5.374931581828133, "step": 4910}, {"loss": 0.3371, "grad_norm": 1.0006917715072632, "learning_rate": 0.0002, "epoch": 5.385878489326765, "step": 4920}, {"loss": 0.3992, "grad_norm": 1.232581377029419, "learning_rate": 0.0002, "epoch": 5.396825396825397, "step": 4930}, {"loss": 0.3456, "grad_norm": 1.0822620391845703, "learning_rate": 0.0002, "epoch": 5.407772304324029, "step": 4940}, {"loss": 0.3806, "grad_norm": 1.3648720979690552, "learning_rate": 0.0002, "epoch": 5.41871921182266, "step": 4950}, {"loss": 0.3959, "grad_norm": 1.3220354318618774, "learning_rate": 0.0002, "epoch": 5.429666119321292, "step": 4960}, {"loss": 0.3278, "grad_norm": 1.1106271743774414, "learning_rate": 0.0002, "epoch": 5.440613026819923, "step": 4970}, {"loss": 0.3812, "grad_norm": 1.6058908700942993, "learning_rate": 0.0002, "epoch": 5.451559934318555, "step": 4980}, {"loss": 0.3905, "grad_norm": 1.1065930128097534, "learning_rate": 0.0002, "epoch": 5.462506841817187, "step": 4990}, {"loss": 0.4058, "grad_norm": 1.3896466493606567, "learning_rate": 0.0002, "epoch": 5.473453749315818, "step": 5000}, {"loss": 0.4122, "grad_norm": 1.0437148809432983, "learning_rate": 0.0002, "epoch": 5.48440065681445, "step": 5010}, {"loss": 0.4065, "grad_norm": 1.2347718477249146, "learning_rate": 0.0002, "epoch": 5.495347564313081, "step": 5020}, {"loss": 0.3586, "grad_norm": 1.1174284219741821, "learning_rate": 0.0002, "epoch": 5.506294471811713, "step": 5030}, {"loss": 0.3576, "grad_norm": 1.2580941915512085, "learning_rate": 0.0002, "epoch": 5.517241379310345, "step": 5040}, {"loss": 0.3809, "grad_norm": 1.451090931892395, "learning_rate": 0.0002, "epoch": 5.528188286808977, "step": 5050}, {"loss": 0.3645, "grad_norm": 1.4688365459442139, "learning_rate": 0.0002, "epoch": 5.539135194307608, "step": 5060}, {"loss": 0.4431, "grad_norm": 1.1625734567642212, "learning_rate": 0.0002, "epoch": 5.55008210180624, "step": 5070}, {"loss": 0.3972, "grad_norm": 0.9332265257835388, "learning_rate": 0.0002, "epoch": 5.561029009304871, "step": 5080}, {"loss": 0.4, "grad_norm": 1.5635273456573486, "learning_rate": 0.0002, "epoch": 5.571975916803503, "step": 5090}, {"loss": 0.3651, "grad_norm": 1.3420509099960327, "learning_rate": 0.0002, "epoch": 5.582922824302135, "step": 5100}, {"loss": 0.3717, "grad_norm": 1.5826557874679565, "learning_rate": 0.0002, "epoch": 5.593869731800766, "step": 5110}, {"loss": 0.4256, "grad_norm": 1.5737065076828003, "learning_rate": 0.0002, "epoch": 5.604816639299398, "step": 5120}, {"loss": 0.39, "grad_norm": 1.3812499046325684, "learning_rate": 0.0002, "epoch": 5.615763546798029, "step": 5130}, {"loss": 0.3891, "grad_norm": 1.362833023071289, "learning_rate": 0.0002, "epoch": 5.626710454296661, "step": 5140}, {"loss": 0.455, "grad_norm": 1.7667874097824097, "learning_rate": 0.0002, "epoch": 5.637657361795293, "step": 5150}, {"loss": 0.4264, "grad_norm": 1.2661789655685425, "learning_rate": 0.0002, "epoch": 5.648604269293925, "step": 5160}, {"loss": 0.3261, "grad_norm": 1.2076870203018188, "learning_rate": 0.0002, "epoch": 5.659551176792556, "step": 5170}, {"loss": 0.372, "grad_norm": 1.2431524991989136, "learning_rate": 0.0002, "epoch": 5.670498084291188, "step": 5180}, {"loss": 0.4092, "grad_norm": 1.2216639518737793, "learning_rate": 0.0002, "epoch": 5.681444991789819, "step": 5190}, {"loss": 0.4171, "grad_norm": 0.9259352684020996, "learning_rate": 0.0002, "epoch": 5.692391899288451, "step": 5200}, {"loss": 0.3875, "grad_norm": 1.7929338216781616, "learning_rate": 0.0002, "epoch": 5.703338806787083, "step": 5210}, {"loss": 0.4424, "grad_norm": 1.4048460721969604, "learning_rate": 0.0002, "epoch": 5.714285714285714, "step": 5220}, {"loss": 0.3758, "grad_norm": 1.306874394416809, "learning_rate": 0.0002, "epoch": 5.725232621784346, "step": 5230}, {"loss": 0.3889, "grad_norm": 1.3137940168380737, "learning_rate": 0.0002, "epoch": 5.736179529282977, "step": 5240}, {"loss": 0.4804, "grad_norm": 1.1376476287841797, "learning_rate": 0.0002, "epoch": 5.747126436781609, "step": 5250}, {"loss": 0.377, "grad_norm": 1.450939416885376, "learning_rate": 0.0002, "epoch": 5.758073344280241, "step": 5260}, {"loss": 0.4732, "grad_norm": 0.983195960521698, "learning_rate": 0.0002, "epoch": 5.769020251778873, "step": 5270}, {"loss": 0.4041, "grad_norm": 1.66558837890625, "learning_rate": 0.0002, "epoch": 5.779967159277504, "step": 5280}, {"loss": 0.3643, "grad_norm": 0.9789204597473145, "learning_rate": 0.0002, "epoch": 5.790914066776136, "step": 5290}, {"loss": 0.3776, "grad_norm": 1.2110556364059448, "learning_rate": 0.0002, "epoch": 5.801860974274767, "step": 5300}, {"loss": 0.4049, "grad_norm": 1.3799304962158203, "learning_rate": 0.0002, "epoch": 5.812807881773399, "step": 5310}, {"loss": 0.4362, "grad_norm": 1.0570626258850098, "learning_rate": 0.0002, "epoch": 5.823754789272031, "step": 5320}, {"loss": 0.4716, "grad_norm": 1.4654436111450195, "learning_rate": 0.0002, "epoch": 5.834701696770662, "step": 5330}, {"loss": 0.4048, "grad_norm": 1.5216940641403198, "learning_rate": 0.0002, "epoch": 5.845648604269294, "step": 5340}, {"loss": 0.3848, "grad_norm": 1.018646001815796, "learning_rate": 0.0002, "epoch": 5.856595511767925, "step": 5350}, {"loss": 0.3705, "grad_norm": 1.028951644897461, "learning_rate": 0.0002, "epoch": 5.867542419266557, "step": 5360}, {"loss": 0.4213, "grad_norm": 2.571263313293457, "learning_rate": 0.0002, "epoch": 5.878489326765189, "step": 5370}, {"loss": 0.3647, "grad_norm": 1.3323984146118164, "learning_rate": 0.0002, "epoch": 5.889436234263821, "step": 5380}, {"loss": 0.4085, "grad_norm": 1.4317777156829834, "learning_rate": 0.0002, "epoch": 5.900383141762452, "step": 5390}, {"loss": 0.4254, "grad_norm": 1.4289140701293945, "learning_rate": 0.0002, "epoch": 5.911330049261084, "step": 5400}, {"loss": 0.3993, "grad_norm": 1.3130780458450317, "learning_rate": 0.0002, "epoch": 5.922276956759715, "step": 5410}, {"loss": 0.4025, "grad_norm": 1.3979902267456055, "learning_rate": 0.0002, "epoch": 5.933223864258347, "step": 5420}, {"loss": 0.3997, "grad_norm": 1.1827352046966553, "learning_rate": 0.0002, "epoch": 5.944170771756979, "step": 5430}, {"loss": 0.4163, "grad_norm": 1.1672080755233765, "learning_rate": 0.0002, "epoch": 5.95511767925561, "step": 5440}, {"loss": 0.4425, "grad_norm": 1.0949620008468628, "learning_rate": 0.0002, "epoch": 5.966064586754242, "step": 5450}, {"loss": 0.4219, "grad_norm": 1.3183925151824951, "learning_rate": 0.0002, "epoch": 5.977011494252873, "step": 5460}, {"loss": 0.4171, "grad_norm": 1.096198320388794, "learning_rate": 0.0002, "epoch": 5.987958401751505, "step": 5470}, {"loss": 0.3886, "grad_norm": 1.2601423263549805, "learning_rate": 0.0002, "epoch": 5.998905309250137, "step": 5480}]} +{"epoch": 6.999452654625069, "step": 6394, "epoch_duration": 1359.7705018520355, "total_accumulated_duration": 9530.591376304626, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28746.0}, "peak_memory_reserved": {"GPU_0": 28746.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4039, "grad_norm": 0.7611560821533203, "learning_rate": 0.0002, "epoch": 0.010946907498631636, "step": 10}, {"loss": 1.8818, "grad_norm": 0.4633193612098694, "learning_rate": 0.0002, "epoch": 0.021893814997263273, "step": 20}, {"loss": 1.5927, "grad_norm": 0.49326154589653015, "learning_rate": 0.0002, "epoch": 0.03284072249589491, "step": 30}, {"loss": 1.3859, "grad_norm": 0.3943138122558594, "learning_rate": 0.0002, "epoch": 0.043787629994526546, "step": 40}, {"loss": 1.3036, "grad_norm": 0.43292930722236633, "learning_rate": 0.0002, "epoch": 0.05473453749315818, "step": 50}, {"loss": 1.2427, "grad_norm": 0.3431817591190338, "learning_rate": 0.0002, "epoch": 0.06568144499178982, "step": 60}, {"loss": 1.3076, "grad_norm": 0.38011446595191956, "learning_rate": 0.0002, "epoch": 0.07662835249042145, "step": 70}, {"loss": 1.1641, "grad_norm": 0.7441071271896362, "learning_rate": 0.0002, "epoch": 0.08757525998905309, "step": 80}, {"loss": 1.1721, "grad_norm": 0.3359833061695099, "learning_rate": 0.0002, "epoch": 0.09852216748768473, "step": 90}, {"loss": 1.2774, "grad_norm": 0.3724392354488373, "learning_rate": 0.0002, "epoch": 0.10946907498631636, "step": 100}, {"loss": 1.216, "grad_norm": 0.40673762559890747, "learning_rate": 0.0002, "epoch": 0.120415982484948, "step": 110}, {"loss": 1.2611, "grad_norm": 0.40036800503730774, "learning_rate": 0.0002, "epoch": 0.13136288998357964, "step": 120}, {"loss": 1.2436, "grad_norm": 2.844191312789917, "learning_rate": 0.0002, "epoch": 0.1423097974822113, "step": 130}, {"loss": 1.2254, "grad_norm": 0.3104734420776367, "learning_rate": 0.0002, "epoch": 0.1532567049808429, "step": 140}, {"loss": 1.0498, "grad_norm": 0.3266797959804535, "learning_rate": 0.0002, "epoch": 0.16420361247947454, "step": 150}, {"loss": 1.1644, "grad_norm": 0.3079199194908142, "learning_rate": 0.0002, "epoch": 0.17515051997810618, "step": 160}, {"loss": 1.2923, "grad_norm": 0.3872479498386383, "learning_rate": 0.0002, "epoch": 0.18609742747673783, "step": 170}, {"loss": 1.1809, "grad_norm": 0.38654500246047974, "learning_rate": 0.0002, "epoch": 0.19704433497536947, "step": 180}, {"loss": 1.066, "grad_norm": 0.2913552522659302, "learning_rate": 0.0002, "epoch": 0.20799124247400108, "step": 190}, {"loss": 1.0868, "grad_norm": 0.2960572838783264, "learning_rate": 0.0002, "epoch": 0.21893814997263272, "step": 200}, {"loss": 1.136, "grad_norm": 0.5175501108169556, "learning_rate": 0.0002, "epoch": 0.22988505747126436, "step": 210}, {"loss": 1.1556, "grad_norm": 1.2921574115753174, "learning_rate": 0.0002, "epoch": 0.240831964969896, "step": 220}, {"loss": 1.1488, "grad_norm": 0.2675512135028839, "learning_rate": 0.0002, "epoch": 0.25177887246852765, "step": 230}, {"loss": 1.2764, "grad_norm": 0.3956190049648285, "learning_rate": 0.0002, "epoch": 0.2627257799671593, "step": 240}, {"loss": 1.1889, "grad_norm": 0.6022581458091736, "learning_rate": 0.0002, "epoch": 0.27367268746579093, "step": 250}, {"loss": 1.1981, "grad_norm": 1.1949563026428223, "learning_rate": 0.0002, "epoch": 0.2846195949644226, "step": 260}, {"loss": 1.1877, "grad_norm": 0.31173548102378845, "learning_rate": 0.0002, "epoch": 0.2955665024630542, "step": 270}, {"loss": 1.06, "grad_norm": 0.2808472812175751, "learning_rate": 0.0002, "epoch": 0.3065134099616858, "step": 280}, {"loss": 1.1752, "grad_norm": 0.3042023777961731, "learning_rate": 0.0002, "epoch": 0.31746031746031744, "step": 290}, {"loss": 1.3147, "grad_norm": 0.39915043115615845, "learning_rate": 0.0002, "epoch": 0.3284072249589491, "step": 300}, {"loss": 1.2425, "grad_norm": 0.39118197560310364, "learning_rate": 0.0002, "epoch": 0.3393541324575807, "step": 310}, {"loss": 1.1363, "grad_norm": 0.355010986328125, "learning_rate": 0.0002, "epoch": 0.35030103995621237, "step": 320}, {"loss": 1.1925, "grad_norm": 0.29734086990356445, "learning_rate": 0.0002, "epoch": 0.361247947454844, "step": 330}, {"loss": 1.1974, "grad_norm": 0.346096009016037, "learning_rate": 0.0002, "epoch": 0.37219485495347565, "step": 340}, {"loss": 1.1641, "grad_norm": 0.4829643666744232, "learning_rate": 0.0002, "epoch": 0.3831417624521073, "step": 350}, {"loss": 1.2808, "grad_norm": 0.4726872742176056, "learning_rate": 0.0002, "epoch": 0.39408866995073893, "step": 360}, {"loss": 1.1532, "grad_norm": 0.3130153715610504, "learning_rate": 0.0002, "epoch": 0.4050355774493706, "step": 370}, {"loss": 1.1842, "grad_norm": 0.5123590230941772, "learning_rate": 0.0002, "epoch": 0.41598248494800216, "step": 380}, {"loss": 1.1539, "grad_norm": 0.3444574773311615, "learning_rate": 0.0002, "epoch": 0.4269293924466338, "step": 390}, {"loss": 1.1756, "grad_norm": 0.5302175283432007, "learning_rate": 0.0002, "epoch": 0.43787629994526545, "step": 400}, {"loss": 1.1138, "grad_norm": 0.2713572680950165, "learning_rate": 0.0002, "epoch": 0.4488232074438971, "step": 410}, {"loss": 1.1281, "grad_norm": 0.29530611634254456, "learning_rate": 0.0002, "epoch": 0.45977011494252873, "step": 420}, {"loss": 1.1721, "grad_norm": 0.27282455563545227, "learning_rate": 0.0002, "epoch": 0.47071702244116037, "step": 430}, {"loss": 1.1213, "grad_norm": 0.2647949755191803, "learning_rate": 0.0002, "epoch": 0.481663929939792, "step": 440}, {"loss": 1.1656, "grad_norm": 0.35509347915649414, "learning_rate": 0.0002, "epoch": 0.49261083743842365, "step": 450}, {"loss": 1.1251, "grad_norm": 0.1959609091281891, "learning_rate": 0.0002, "epoch": 0.5035577449370553, "step": 460}, {"loss": 1.1889, "grad_norm": 0.40090155601501465, "learning_rate": 0.0002, "epoch": 0.5145046524356869, "step": 470}, {"loss": 1.156, "grad_norm": 0.3354604244232178, "learning_rate": 0.0002, "epoch": 0.5254515599343186, "step": 480}, {"loss": 1.1024, "grad_norm": 0.2758506238460541, "learning_rate": 0.0002, "epoch": 0.5363984674329502, "step": 490}, {"loss": 1.1108, "grad_norm": 0.3629051744937897, "learning_rate": 0.0002, "epoch": 0.5473453749315819, "step": 500}, {"loss": 1.2236, "grad_norm": 0.30802229046821594, "learning_rate": 0.0002, "epoch": 0.5582922824302134, "step": 510}, {"loss": 1.0424, "grad_norm": 0.3099463880062103, "learning_rate": 0.0002, "epoch": 0.5692391899288451, "step": 520}, {"loss": 1.255, "grad_norm": 0.42299067974090576, "learning_rate": 0.0002, "epoch": 0.5801860974274767, "step": 530}, {"loss": 1.1698, "grad_norm": 0.5392252802848816, "learning_rate": 0.0002, "epoch": 0.5911330049261084, "step": 540}, {"loss": 1.1171, "grad_norm": 0.34768250584602356, "learning_rate": 0.0002, "epoch": 0.60207991242474, "step": 550}, {"loss": 1.102, "grad_norm": 0.28490015864372253, "learning_rate": 0.0002, "epoch": 0.6130268199233716, "step": 560}, {"loss": 1.1111, "grad_norm": 0.34787994623184204, "learning_rate": 0.0002, "epoch": 0.6239737274220033, "step": 570}, {"loss": 1.0759, "grad_norm": 0.29058772325515747, "learning_rate": 0.0002, "epoch": 0.6349206349206349, "step": 580}, {"loss": 1.1157, "grad_norm": 0.4063778817653656, "learning_rate": 0.0002, "epoch": 0.6458675424192666, "step": 590}, {"loss": 1.1432, "grad_norm": 0.9244267344474792, "learning_rate": 0.0002, "epoch": 0.6568144499178982, "step": 600}, {"loss": 1.0591, "grad_norm": 0.27605190873146057, "learning_rate": 0.0002, "epoch": 0.6677613574165299, "step": 610}, {"loss": 1.2123, "grad_norm": 0.34346821904182434, "learning_rate": 0.0002, "epoch": 0.6787082649151615, "step": 620}, {"loss": 1.2195, "grad_norm": 0.3093279302120209, "learning_rate": 0.0002, "epoch": 0.6896551724137931, "step": 630}, {"loss": 1.2461, "grad_norm": 1.0069009065628052, "learning_rate": 0.0002, "epoch": 0.7006020799124247, "step": 640}, {"loss": 1.0892, "grad_norm": 0.5049130916595459, "learning_rate": 0.0002, "epoch": 0.7115489874110563, "step": 650}, {"loss": 1.1404, "grad_norm": 0.3748924732208252, "learning_rate": 0.0002, "epoch": 0.722495894909688, "step": 660}, {"loss": 1.1062, "grad_norm": 0.2964959144592285, "learning_rate": 0.0002, "epoch": 0.7334428024083196, "step": 670}, {"loss": 1.2617, "grad_norm": 0.4599986970424652, "learning_rate": 0.0002, "epoch": 0.7443897099069513, "step": 680}, {"loss": 1.088, "grad_norm": 0.27292951941490173, "learning_rate": 0.0002, "epoch": 0.7553366174055829, "step": 690}, {"loss": 1.2047, "grad_norm": 0.3123566806316376, "learning_rate": 0.0002, "epoch": 0.7662835249042146, "step": 700}, {"loss": 1.0021, "grad_norm": 0.28310710191726685, "learning_rate": 0.0002, "epoch": 0.7772304324028462, "step": 710}, {"loss": 1.2281, "grad_norm": 0.3279992341995239, "learning_rate": 0.0002, "epoch": 0.7881773399014779, "step": 720}, {"loss": 1.25, "grad_norm": 0.28179168701171875, "learning_rate": 0.0002, "epoch": 0.7991242474001095, "step": 730}, {"loss": 1.0602, "grad_norm": 0.31492987275123596, "learning_rate": 0.0002, "epoch": 0.8100711548987412, "step": 740}, {"loss": 1.2518, "grad_norm": 0.41821011900901794, "learning_rate": 0.0002, "epoch": 0.8210180623973727, "step": 750}, {"loss": 1.1612, "grad_norm": 0.325235515832901, "learning_rate": 0.0002, "epoch": 0.8319649698960043, "step": 760}, {"loss": 1.27, "grad_norm": 0.5366070866584778, "learning_rate": 0.0002, "epoch": 0.842911877394636, "step": 770}, {"loss": 1.0921, "grad_norm": 0.32570579648017883, "learning_rate": 0.0002, "epoch": 0.8538587848932676, "step": 780}, {"loss": 1.1032, "grad_norm": 0.3642968237400055, "learning_rate": 0.0002, "epoch": 0.8648056923918993, "step": 790}, {"loss": 1.1234, "grad_norm": 0.29713448882102966, "learning_rate": 0.0002, "epoch": 0.8757525998905309, "step": 800}, {"loss": 1.0978, "grad_norm": 0.23599444329738617, "learning_rate": 0.0002, "epoch": 0.8866995073891626, "step": 810}, {"loss": 1.1867, "grad_norm": 0.31522464752197266, "learning_rate": 0.0002, "epoch": 0.8976464148877942, "step": 820}, {"loss": 1.0208, "grad_norm": 0.32754790782928467, "learning_rate": 0.0002, "epoch": 0.9085933223864259, "step": 830}, {"loss": 0.9786, "grad_norm": 0.22741089761257172, "learning_rate": 0.0002, "epoch": 0.9195402298850575, "step": 840}, {"loss": 1.0689, "grad_norm": 0.3089679777622223, "learning_rate": 0.0002, "epoch": 0.9304871373836892, "step": 850}, {"loss": 1.0354, "grad_norm": 0.27440521121025085, "learning_rate": 0.0002, "epoch": 0.9414340448823207, "step": 860}, {"loss": 1.0417, "grad_norm": 0.3498363792896271, "learning_rate": 0.0002, "epoch": 0.9523809523809523, "step": 870}, {"loss": 1.269, "grad_norm": 0.47151854634284973, "learning_rate": 0.0002, "epoch": 0.963327859879584, "step": 880}, {"loss": 1.1174, "grad_norm": 0.24756591022014618, "learning_rate": 0.0002, "epoch": 0.9742747673782156, "step": 890}, {"loss": 1.0622, "grad_norm": 0.2600938677787781, "learning_rate": 0.0002, "epoch": 0.9852216748768473, "step": 900}, {"loss": 1.0968, "grad_norm": 0.2934586703777313, "learning_rate": 0.0002, "epoch": 0.9961685823754789, "step": 910}, {"eval_loss": 1.158464789390564, "eval_runtime": 46.0774, "eval_samples_per_second": 9.462, "eval_steps_per_second": 1.194, "epoch": 0.9994526546250684, "step": 913}, {"loss": 1.0322, "grad_norm": 0.37776654958724976, "learning_rate": 0.0002, "epoch": 1.0071154898741106, "step": 920}, {"loss": 1.1346, "grad_norm": 0.31784629821777344, "learning_rate": 0.0002, "epoch": 1.0180623973727423, "step": 930}, {"loss": 0.8976, "grad_norm": 0.24244336783885956, "learning_rate": 0.0002, "epoch": 1.0290093048713738, "step": 940}, {"loss": 1.0172, "grad_norm": 0.3185454308986664, "learning_rate": 0.0002, "epoch": 1.0399562123700055, "step": 950}, {"loss": 1.1645, "grad_norm": 0.3589441478252411, "learning_rate": 0.0002, "epoch": 1.0509031198686372, "step": 960}, {"loss": 1.0317, "grad_norm": 0.38593578338623047, "learning_rate": 0.0002, "epoch": 1.0618500273672686, "step": 970}, {"loss": 0.9913, "grad_norm": 0.39694955945014954, "learning_rate": 0.0002, "epoch": 1.0727969348659003, "step": 980}, {"loss": 1.0893, "grad_norm": 0.469817191362381, "learning_rate": 0.0002, "epoch": 1.083743842364532, "step": 990}, {"loss": 0.983, "grad_norm": 0.2634755074977875, "learning_rate": 0.0002, "epoch": 1.0946907498631637, "step": 1000}, {"loss": 1.0144, "grad_norm": 0.43189436197280884, "learning_rate": 0.0002, "epoch": 1.1056376573617952, "step": 1010}, {"loss": 0.9663, "grad_norm": 0.5559977889060974, "learning_rate": 0.0002, "epoch": 1.116584564860427, "step": 1020}, {"loss": 1.0481, "grad_norm": 0.32100191712379456, "learning_rate": 0.0002, "epoch": 1.1275314723590586, "step": 1030}, {"loss": 1.1012, "grad_norm": 0.40179768204689026, "learning_rate": 0.0002, "epoch": 1.1384783798576903, "step": 1040}, {"loss": 1.0029, "grad_norm": 0.3659493029117584, "learning_rate": 0.0002, "epoch": 1.1494252873563218, "step": 1050}, {"loss": 0.9597, "grad_norm": 0.701704204082489, "learning_rate": 0.0002, "epoch": 1.1603721948549535, "step": 1060}, {"loss": 1.0204, "grad_norm": 0.3650563359260559, "learning_rate": 0.0002, "epoch": 1.1713191023535852, "step": 1070}, {"loss": 0.907, "grad_norm": 0.3191976249217987, "learning_rate": 0.0002, "epoch": 1.1822660098522166, "step": 1080}, {"loss": 1.0648, "grad_norm": 0.3615441918373108, "learning_rate": 0.0002, "epoch": 1.1932129173508483, "step": 1090}, {"loss": 1.0067, "grad_norm": 0.39474231004714966, "learning_rate": 0.0002, "epoch": 1.20415982484948, "step": 1100}, {"loss": 0.9852, "grad_norm": 0.3752822279930115, "learning_rate": 0.0002, "epoch": 1.2151067323481117, "step": 1110}, {"loss": 1.0067, "grad_norm": 0.4165991246700287, "learning_rate": 0.0002, "epoch": 1.2260536398467432, "step": 1120}, {"loss": 1.0244, "grad_norm": 0.5326506495475769, "learning_rate": 0.0002, "epoch": 1.237000547345375, "step": 1130}, {"loss": 1.0542, "grad_norm": 0.48845794796943665, "learning_rate": 0.0002, "epoch": 1.2479474548440066, "step": 1140}, {"loss": 1.0885, "grad_norm": 0.29910150170326233, "learning_rate": 0.0002, "epoch": 1.2588943623426383, "step": 1150}, {"loss": 1.2233, "grad_norm": 0.5069725513458252, "learning_rate": 0.0002, "epoch": 1.2698412698412698, "step": 1160}, {"loss": 1.0992, "grad_norm": 0.29500406980514526, "learning_rate": 0.0002, "epoch": 1.2807881773399015, "step": 1170}, {"loss": 1.0291, "grad_norm": 0.4711538851261139, "learning_rate": 0.0002, "epoch": 1.2917350848385332, "step": 1180}, {"loss": 1.0513, "grad_norm": 0.4203340709209442, "learning_rate": 0.0002, "epoch": 1.3026819923371646, "step": 1190}, {"loss": 1.1274, "grad_norm": 0.36101874709129333, "learning_rate": 0.0002, "epoch": 1.3136288998357963, "step": 1200}, {"loss": 1.085, "grad_norm": 0.4608800411224365, "learning_rate": 0.0002, "epoch": 1.324575807334428, "step": 1210}, {"loss": 1.0695, "grad_norm": 0.6570906639099121, "learning_rate": 0.0002, "epoch": 1.3355227148330597, "step": 1220}, {"loss": 0.9786, "grad_norm": 0.5352164506912231, "learning_rate": 0.0002, "epoch": 1.3464696223316914, "step": 1230}, {"loss": 1.0513, "grad_norm": 0.3885001242160797, "learning_rate": 0.0002, "epoch": 1.357416529830323, "step": 1240}, {"loss": 1.0611, "grad_norm": 0.2987913489341736, "learning_rate": 0.0002, "epoch": 1.3683634373289546, "step": 1250}, {"loss": 1.109, "grad_norm": 0.42070427536964417, "learning_rate": 0.0002, "epoch": 1.3793103448275863, "step": 1260}, {"loss": 1.1092, "grad_norm": 0.5957782864570618, "learning_rate": 0.0002, "epoch": 1.3902572523262178, "step": 1270}, {"loss": 1.0554, "grad_norm": 0.32898882031440735, "learning_rate": 0.0002, "epoch": 1.4012041598248495, "step": 1280}, {"loss": 0.909, "grad_norm": 0.27624452114105225, "learning_rate": 0.0002, "epoch": 1.4121510673234812, "step": 1290}, {"loss": 1.024, "grad_norm": 0.49570828676223755, "learning_rate": 0.0002, "epoch": 1.4230979748221126, "step": 1300}, {"loss": 1.0471, "grad_norm": 0.26191383600234985, "learning_rate": 0.0002, "epoch": 1.4340448823207443, "step": 1310}, {"loss": 1.0788, "grad_norm": 0.35664042830467224, "learning_rate": 0.0002, "epoch": 1.444991789819376, "step": 1320}, {"loss": 1.0108, "grad_norm": 0.45126354694366455, "learning_rate": 0.0002, "epoch": 1.4559386973180077, "step": 1330}, {"loss": 0.9571, "grad_norm": 0.37318357825279236, "learning_rate": 0.0002, "epoch": 1.4668856048166394, "step": 1340}, {"loss": 1.0507, "grad_norm": 0.6428970098495483, "learning_rate": 0.0002, "epoch": 1.477832512315271, "step": 1350}, {"loss": 1.0367, "grad_norm": 0.43256187438964844, "learning_rate": 0.0002, "epoch": 1.4887794198139026, "step": 1360}, {"loss": 1.1321, "grad_norm": 0.5343793630599976, "learning_rate": 0.0002, "epoch": 1.4997263273125343, "step": 1370}, {"loss": 1.1054, "grad_norm": 0.315437376499176, "learning_rate": 0.0002, "epoch": 1.5106732348111658, "step": 1380}, {"loss": 0.8916, "grad_norm": 0.41561153531074524, "learning_rate": 0.0002, "epoch": 1.5216201423097975, "step": 1390}, {"loss": 1.0391, "grad_norm": 0.3201070725917816, "learning_rate": 0.0002, "epoch": 1.5325670498084292, "step": 1400}, {"loss": 1.14, "grad_norm": 0.505537211894989, "learning_rate": 0.0002, "epoch": 1.5435139573070606, "step": 1410}, {"loss": 1.0775, "grad_norm": 0.3747410178184509, "learning_rate": 0.0002, "epoch": 1.5544608648056923, "step": 1420}, {"loss": 1.1171, "grad_norm": 0.49385908246040344, "learning_rate": 0.0002, "epoch": 1.565407772304324, "step": 1430}, {"loss": 1.1182, "grad_norm": 0.49831628799438477, "learning_rate": 0.0002, "epoch": 1.5763546798029555, "step": 1440}, {"loss": 1.0079, "grad_norm": 0.372127890586853, "learning_rate": 0.0002, "epoch": 1.5873015873015874, "step": 1450}, {"loss": 0.9931, "grad_norm": 0.40070840716362, "learning_rate": 0.0002, "epoch": 1.598248494800219, "step": 1460}, {"loss": 0.8954, "grad_norm": 0.34907400608062744, "learning_rate": 0.0002, "epoch": 1.6091954022988506, "step": 1470}, {"loss": 0.9743, "grad_norm": 0.4632418751716614, "learning_rate": 0.0002, "epoch": 1.6201423097974823, "step": 1480}, {"loss": 1.0103, "grad_norm": 0.40164515376091003, "learning_rate": 0.0002, "epoch": 1.6310892172961138, "step": 1490}, {"loss": 0.9523, "grad_norm": 0.3214994966983795, "learning_rate": 0.0002, "epoch": 1.6420361247947455, "step": 1500}, {"loss": 1.0161, "grad_norm": 0.3727897107601166, "learning_rate": 0.0002, "epoch": 1.6529830322933772, "step": 1510}, {"loss": 1.0443, "grad_norm": 0.3817640542984009, "learning_rate": 0.0002, "epoch": 1.6639299397920086, "step": 1520}, {"loss": 1.0511, "grad_norm": 0.5592136979103088, "learning_rate": 0.0002, "epoch": 1.6748768472906403, "step": 1530}, {"loss": 0.9682, "grad_norm": 0.44636598229408264, "learning_rate": 0.0002, "epoch": 1.685823754789272, "step": 1540}, {"loss": 1.033, "grad_norm": 0.40441709756851196, "learning_rate": 0.0002, "epoch": 1.6967706622879035, "step": 1550}, {"loss": 1.0857, "grad_norm": 0.3243522644042969, "learning_rate": 0.0002, "epoch": 1.7077175697865354, "step": 1560}, {"loss": 0.9482, "grad_norm": 0.34277570247650146, "learning_rate": 0.0002, "epoch": 1.718664477285167, "step": 1570}, {"loss": 0.974, "grad_norm": 0.3279995024204254, "learning_rate": 0.0002, "epoch": 1.7296113847837986, "step": 1580}, {"loss": 0.9414, "grad_norm": 0.41968777775764465, "learning_rate": 0.0002, "epoch": 1.7405582922824303, "step": 1590}, {"loss": 0.9768, "grad_norm": 0.39464613795280457, "learning_rate": 0.0002, "epoch": 1.7515051997810618, "step": 1600}, {"loss": 1.0347, "grad_norm": 0.3839009404182434, "learning_rate": 0.0002, "epoch": 1.7624521072796935, "step": 1610}, {"loss": 0.9195, "grad_norm": 0.3250715434551239, "learning_rate": 0.0002, "epoch": 1.7733990147783252, "step": 1620}, {"loss": 1.0049, "grad_norm": 0.5166561007499695, "learning_rate": 0.0002, "epoch": 1.7843459222769567, "step": 1630}, {"loss": 1.0364, "grad_norm": 0.4115183353424072, "learning_rate": 0.0002, "epoch": 1.7952928297755884, "step": 1640}, {"loss": 1.0248, "grad_norm": 0.373780220746994, "learning_rate": 0.0002, "epoch": 1.80623973727422, "step": 1650}, {"loss": 1.0984, "grad_norm": 0.49697014689445496, "learning_rate": 0.0002, "epoch": 1.8171866447728515, "step": 1660}, {"loss": 1.0089, "grad_norm": 1.0308938026428223, "learning_rate": 0.0002, "epoch": 1.8281335522714834, "step": 1670}, {"loss": 1.0853, "grad_norm": 0.4851366877555847, "learning_rate": 0.0002, "epoch": 1.839080459770115, "step": 1680}, {"loss": 0.9533, "grad_norm": 0.3262481391429901, "learning_rate": 0.0002, "epoch": 1.8500273672687466, "step": 1690}, {"loss": 1.048, "grad_norm": 0.6904496550559998, "learning_rate": 0.0002, "epoch": 1.8609742747673783, "step": 1700}, {"loss": 1.0577, "grad_norm": 0.49789851903915405, "learning_rate": 0.0002, "epoch": 1.8719211822660098, "step": 1710}, {"loss": 1.0258, "grad_norm": 0.3035794198513031, "learning_rate": 0.0002, "epoch": 1.8828680897646415, "step": 1720}, {"loss": 0.9916, "grad_norm": 0.4588414430618286, "learning_rate": 0.0002, "epoch": 1.8938149972632732, "step": 1730}, {"loss": 0.9526, "grad_norm": 0.4313034117221832, "learning_rate": 0.0002, "epoch": 1.9047619047619047, "step": 1740}, {"loss": 1.0857, "grad_norm": 0.38562044501304626, "learning_rate": 0.0002, "epoch": 1.9157088122605364, "step": 1750}, {"loss": 0.9763, "grad_norm": 0.46947410702705383, "learning_rate": 0.0002, "epoch": 1.926655719759168, "step": 1760}, {"loss": 0.9967, "grad_norm": 0.3848404884338379, "learning_rate": 0.0002, "epoch": 1.9376026272577995, "step": 1770}, {"loss": 1.0474, "grad_norm": 0.30422744154930115, "learning_rate": 0.0002, "epoch": 1.9485495347564314, "step": 1780}, {"loss": 1.022, "grad_norm": 0.41100990772247314, "learning_rate": 0.0002, "epoch": 1.959496442255063, "step": 1790}, {"loss": 1.003, "grad_norm": 0.3492335081100464, "learning_rate": 0.0002, "epoch": 1.9704433497536946, "step": 1800}, {"loss": 0.9891, "grad_norm": 0.364577978849411, "learning_rate": 0.0002, "epoch": 1.9813902572523263, "step": 1810}, {"loss": 1.0699, "grad_norm": 0.4312075674533844, "learning_rate": 0.0002, "epoch": 1.9923371647509578, "step": 1820}, {"eval_loss": 1.14472496509552, "eval_runtime": 46.0786, "eval_samples_per_second": 9.462, "eval_steps_per_second": 1.194, "epoch": 2.0, "step": 1827}, {"loss": 1.0218, "grad_norm": 0.5989689230918884, "learning_rate": 0.0002, "epoch": 2.0032840722495897, "step": 1830}, {"loss": 0.9384, "grad_norm": 0.49720922112464905, "learning_rate": 0.0002, "epoch": 2.014230979748221, "step": 1840}, {"loss": 0.8482, "grad_norm": 0.42675456404685974, "learning_rate": 0.0002, "epoch": 2.0251778872468527, "step": 1850}, {"loss": 0.8487, "grad_norm": 0.4637208580970764, "learning_rate": 0.0002, "epoch": 2.0361247947454846, "step": 1860}, {"loss": 0.8212, "grad_norm": 0.8329976797103882, "learning_rate": 0.0002, "epoch": 2.047071702244116, "step": 1870}, {"loss": 0.9547, "grad_norm": 0.7869427800178528, "learning_rate": 0.0002, "epoch": 2.0580186097427475, "step": 1880}, {"loss": 0.9351, "grad_norm": 0.4927455186843872, "learning_rate": 0.0002, "epoch": 2.0689655172413794, "step": 1890}, {"loss": 0.8652, "grad_norm": 0.6264246702194214, "learning_rate": 0.0002, "epoch": 2.079912424740011, "step": 1900}, {"loss": 0.9095, "grad_norm": 1.1164122819900513, "learning_rate": 0.0002, "epoch": 2.0908593322386424, "step": 1910}, {"loss": 0.7823, "grad_norm": 0.5283981561660767, "learning_rate": 0.0002, "epoch": 2.1018062397372743, "step": 1920}, {"loss": 0.8065, "grad_norm": 0.45621731877326965, "learning_rate": 0.0002, "epoch": 2.112753147235906, "step": 1930}, {"loss": 0.9184, "grad_norm": 1.381791591644287, "learning_rate": 0.0002, "epoch": 2.1237000547345373, "step": 1940}, {"loss": 0.9006, "grad_norm": 0.5151259899139404, "learning_rate": 0.0002, "epoch": 2.134646962233169, "step": 1950}, {"loss": 0.8436, "grad_norm": 0.9806339740753174, "learning_rate": 0.0002, "epoch": 2.1455938697318007, "step": 1960}, {"loss": 0.8749, "grad_norm": 0.4734154939651489, "learning_rate": 0.0002, "epoch": 2.1565407772304326, "step": 1970}, {"loss": 0.9172, "grad_norm": 0.9553168416023254, "learning_rate": 0.0002, "epoch": 2.167487684729064, "step": 1980}, {"loss": 0.8047, "grad_norm": 0.5895838141441345, "learning_rate": 0.0002, "epoch": 2.1784345922276955, "step": 1990}, {"loss": 0.7841, "grad_norm": 0.4488855302333832, "learning_rate": 0.0002, "epoch": 2.1893814997263275, "step": 2000}, {"loss": 0.8205, "grad_norm": 1.0760235786437988, "learning_rate": 0.0002, "epoch": 2.200328407224959, "step": 2010}, {"loss": 0.7923, "grad_norm": 0.5038785338401794, "learning_rate": 0.0002, "epoch": 2.2112753147235904, "step": 2020}, {"loss": 0.8973, "grad_norm": 0.59819495677948, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 2030}, {"loss": 0.8009, "grad_norm": 0.5012075304985046, "learning_rate": 0.0002, "epoch": 2.233169129720854, "step": 2040}, {"loss": 0.9087, "grad_norm": 0.44978439807891846, "learning_rate": 0.0002, "epoch": 2.2441160372194853, "step": 2050}, {"loss": 0.823, "grad_norm": 0.5350462198257446, "learning_rate": 0.0002, "epoch": 2.255062944718117, "step": 2060}, {"loss": 0.8335, "grad_norm": 0.6020669937133789, "learning_rate": 0.0002, "epoch": 2.2660098522167487, "step": 2070}, {"loss": 0.8023, "grad_norm": 0.5246821045875549, "learning_rate": 0.0002, "epoch": 2.2769567597153806, "step": 2080}, {"loss": 0.8984, "grad_norm": 0.5711268782615662, "learning_rate": 0.0002, "epoch": 2.287903667214012, "step": 2090}, {"loss": 0.9093, "grad_norm": 0.617317259311676, "learning_rate": 0.0002, "epoch": 2.2988505747126435, "step": 2100}, {"loss": 0.8311, "grad_norm": 0.8608947396278381, "learning_rate": 0.0002, "epoch": 2.3097974822112755, "step": 2110}, {"loss": 0.7839, "grad_norm": 0.4739076793193817, "learning_rate": 0.0002, "epoch": 2.320744389709907, "step": 2120}, {"loss": 0.84, "grad_norm": 0.5538856983184814, "learning_rate": 0.0002, "epoch": 2.3316912972085384, "step": 2130}, {"loss": 0.8994, "grad_norm": 0.6064935326576233, "learning_rate": 0.0002, "epoch": 2.3426382047071703, "step": 2140}, {"loss": 0.7765, "grad_norm": 0.5019068121910095, "learning_rate": 0.0002, "epoch": 2.353585112205802, "step": 2150}, {"loss": 0.9576, "grad_norm": 0.45340514183044434, "learning_rate": 0.0002, "epoch": 2.3645320197044333, "step": 2160}, {"loss": 0.9254, "grad_norm": 0.7347203493118286, "learning_rate": 0.0002, "epoch": 2.375478927203065, "step": 2170}, {"loss": 0.8507, "grad_norm": 0.46922534704208374, "learning_rate": 0.0002, "epoch": 2.3864258347016967, "step": 2180}, {"loss": 0.8325, "grad_norm": 0.5507845878601074, "learning_rate": 0.0002, "epoch": 2.3973727422003286, "step": 2190}, {"loss": 0.8178, "grad_norm": 0.5621911883354187, "learning_rate": 0.0002, "epoch": 2.40831964969896, "step": 2200}, {"loss": 0.8569, "grad_norm": 0.5023514032363892, "learning_rate": 0.0002, "epoch": 2.4192665571975915, "step": 2210}, {"loss": 0.8355, "grad_norm": 0.6124861240386963, "learning_rate": 0.0002, "epoch": 2.4302134646962235, "step": 2220}, {"loss": 0.885, "grad_norm": 0.49614205956459045, "learning_rate": 0.0002, "epoch": 2.441160372194855, "step": 2230}, {"loss": 0.8008, "grad_norm": 0.6477900743484497, "learning_rate": 0.0002, "epoch": 2.4521072796934864, "step": 2240}, {"loss": 0.8622, "grad_norm": 0.5868843793869019, "learning_rate": 0.0002, "epoch": 2.4630541871921183, "step": 2250}, {"loss": 0.8498, "grad_norm": 0.4364610016345978, "learning_rate": 0.0002, "epoch": 2.47400109469075, "step": 2260}, {"loss": 0.8378, "grad_norm": 0.5792964696884155, "learning_rate": 0.0002, "epoch": 2.4849480021893813, "step": 2270}, {"loss": 0.8743, "grad_norm": 0.5421269536018372, "learning_rate": 0.0002, "epoch": 2.495894909688013, "step": 2280}, {"loss": 0.9637, "grad_norm": 0.5525493025779724, "learning_rate": 0.0002, "epoch": 2.5068418171866447, "step": 2290}, {"loss": 0.8075, "grad_norm": 0.6463850140571594, "learning_rate": 0.0002, "epoch": 2.5177887246852766, "step": 2300}, {"loss": 0.8591, "grad_norm": 0.6861311793327332, "learning_rate": 0.0002, "epoch": 2.528735632183908, "step": 2310}, {"loss": 0.9287, "grad_norm": 0.5563555359840393, "learning_rate": 0.0002, "epoch": 2.5396825396825395, "step": 2320}, {"loss": 0.945, "grad_norm": 0.5721169114112854, "learning_rate": 0.0002, "epoch": 2.5506294471811715, "step": 2330}, {"loss": 0.8271, "grad_norm": 0.5258274674415588, "learning_rate": 0.0002, "epoch": 2.561576354679803, "step": 2340}, {"loss": 0.8515, "grad_norm": 0.7057380676269531, "learning_rate": 0.0002, "epoch": 2.572523262178435, "step": 2350}, {"loss": 0.8615, "grad_norm": 0.6869027614593506, "learning_rate": 0.0002, "epoch": 2.5834701696770663, "step": 2360}, {"loss": 0.8043, "grad_norm": 0.4960809648036957, "learning_rate": 0.0002, "epoch": 2.594417077175698, "step": 2370}, {"loss": 0.8476, "grad_norm": 0.9288380146026611, "learning_rate": 0.0002, "epoch": 2.6053639846743293, "step": 2380}, {"loss": 0.873, "grad_norm": 0.3765334188938141, "learning_rate": 0.0002, "epoch": 2.616310892172961, "step": 2390}, {"loss": 0.8764, "grad_norm": 0.7487865686416626, "learning_rate": 0.0002, "epoch": 2.6272577996715927, "step": 2400}, {"loss": 0.7577, "grad_norm": 0.6141156554222107, "learning_rate": 0.0002, "epoch": 2.6382047071702246, "step": 2410}, {"loss": 0.8534, "grad_norm": 0.8420507907867432, "learning_rate": 0.0002, "epoch": 2.649151614668856, "step": 2420}, {"loss": 0.8311, "grad_norm": 0.53386390209198, "learning_rate": 0.0002, "epoch": 2.6600985221674875, "step": 2430}, {"loss": 0.8486, "grad_norm": 0.5520607233047485, "learning_rate": 0.0002, "epoch": 2.6710454296661195, "step": 2440}, {"loss": 0.8686, "grad_norm": 0.5337599515914917, "learning_rate": 0.0002, "epoch": 2.681992337164751, "step": 2450}, {"loss": 0.792, "grad_norm": 0.48790836334228516, "learning_rate": 0.0002, "epoch": 2.692939244663383, "step": 2460}, {"loss": 0.7864, "grad_norm": 0.8287786245346069, "learning_rate": 0.0002, "epoch": 2.7038861521620143, "step": 2470}, {"loss": 0.8244, "grad_norm": 0.5876168608665466, "learning_rate": 0.0002, "epoch": 2.714833059660646, "step": 2480}, {"loss": 0.8773, "grad_norm": 0.5206760764122009, "learning_rate": 0.0002, "epoch": 2.7257799671592773, "step": 2490}, {"loss": 0.8097, "grad_norm": 0.5619136691093445, "learning_rate": 0.0002, "epoch": 2.736726874657909, "step": 2500}, {"loss": 0.8377, "grad_norm": 0.5614883899688721, "learning_rate": 0.0002, "epoch": 2.7476737821565407, "step": 2510}, {"loss": 0.8817, "grad_norm": 0.6157700419425964, "learning_rate": 0.0002, "epoch": 2.7586206896551726, "step": 2520}, {"loss": 0.846, "grad_norm": 0.5529953837394714, "learning_rate": 0.0002, "epoch": 2.769567597153804, "step": 2530}, {"loss": 0.8881, "grad_norm": 0.6731224060058594, "learning_rate": 0.0002, "epoch": 2.7805145046524355, "step": 2540}, {"loss": 0.9125, "grad_norm": 0.6960386633872986, "learning_rate": 0.0002, "epoch": 2.7914614121510675, "step": 2550}, {"loss": 0.7823, "grad_norm": 0.5203493237495422, "learning_rate": 0.0002, "epoch": 2.802408319649699, "step": 2560}, {"loss": 0.7951, "grad_norm": 1.036837100982666, "learning_rate": 0.0002, "epoch": 2.813355227148331, "step": 2570}, {"loss": 0.8427, "grad_norm": 0.6125805377960205, "learning_rate": 0.0002, "epoch": 2.8243021346469623, "step": 2580}, {"loss": 0.8939, "grad_norm": 0.6298092603683472, "learning_rate": 0.0002, "epoch": 2.835249042145594, "step": 2590}, {"loss": 0.8845, "grad_norm": 0.5882203578948975, "learning_rate": 0.0002, "epoch": 2.8461959496442253, "step": 2600}, {"loss": 0.8921, "grad_norm": 0.8619399666786194, "learning_rate": 0.0002, "epoch": 2.857142857142857, "step": 2610}, {"loss": 0.8213, "grad_norm": 0.4722687304019928, "learning_rate": 0.0002, "epoch": 2.8680897646414887, "step": 2620}, {"loss": 0.9147, "grad_norm": 0.47399574518203735, "learning_rate": 0.0002, "epoch": 2.8790366721401206, "step": 2630}, {"loss": 0.8325, "grad_norm": 0.5639172792434692, "learning_rate": 0.0002, "epoch": 2.889983579638752, "step": 2640}, {"loss": 0.8653, "grad_norm": 0.4676816761493683, "learning_rate": 0.0002, "epoch": 2.9009304871373836, "step": 2650}, {"loss": 0.8966, "grad_norm": 0.6906291246414185, "learning_rate": 0.0002, "epoch": 2.9118773946360155, "step": 2660}, {"loss": 0.8966, "grad_norm": 0.4369746148586273, "learning_rate": 0.0002, "epoch": 2.922824302134647, "step": 2670}, {"loss": 0.9173, "grad_norm": 0.46423083543777466, "learning_rate": 0.0002, "epoch": 2.933771209633279, "step": 2680}, {"loss": 0.7961, "grad_norm": 0.5700525045394897, "learning_rate": 0.0002, "epoch": 2.9447181171319103, "step": 2690}, {"loss": 0.8172, "grad_norm": 0.6221476793289185, "learning_rate": 0.0002, "epoch": 2.955665024630542, "step": 2700}, {"loss": 0.8538, "grad_norm": 0.6102682948112488, "learning_rate": 0.0002, "epoch": 2.9666119321291733, "step": 2710}, {"loss": 0.9779, "grad_norm": 0.5317878723144531, "learning_rate": 0.0002, "epoch": 2.977558839627805, "step": 2720}, {"loss": 0.9314, "grad_norm": 0.4438510835170746, "learning_rate": 0.0002, "epoch": 2.9885057471264367, "step": 2730}, {"loss": 0.8553, "grad_norm": 0.5022130012512207, "learning_rate": 0.0002, "epoch": 2.9994526546250686, "step": 2740}, {"eval_loss": 1.1722838878631592, "eval_runtime": 46.0829, "eval_samples_per_second": 9.461, "eval_steps_per_second": 1.194, "epoch": 2.9994526546250686, "step": 2740}, {"loss": 0.6443, "grad_norm": 0.6384502053260803, "learning_rate": 0.0002, "epoch": 3.0103995621237, "step": 2750}, {"loss": 0.7123, "grad_norm": 0.9928722381591797, "learning_rate": 0.0002, "epoch": 3.0213464696223316, "step": 2760}, {"loss": 0.6045, "grad_norm": 0.7813051342964172, "learning_rate": 0.0002, "epoch": 3.0322933771209635, "step": 2770}, {"loss": 0.6042, "grad_norm": 1.0202556848526, "learning_rate": 0.0002, "epoch": 3.043240284619595, "step": 2780}, {"loss": 0.6356, "grad_norm": 0.7581062316894531, "learning_rate": 0.0002, "epoch": 3.0541871921182264, "step": 2790}, {"loss": 0.6349, "grad_norm": 0.6252710223197937, "learning_rate": 0.0002, "epoch": 3.0651340996168583, "step": 2800}, {"loss": 0.645, "grad_norm": 0.7738662958145142, "learning_rate": 0.0002, "epoch": 3.07608100711549, "step": 2810}, {"loss": 0.627, "grad_norm": 0.7381885051727295, "learning_rate": 0.0002, "epoch": 3.0870279146141213, "step": 2820}, {"loss": 0.6371, "grad_norm": 0.9197564721107483, "learning_rate": 0.0002, "epoch": 3.097974822112753, "step": 2830}, {"loss": 0.723, "grad_norm": 1.000976800918579, "learning_rate": 0.0002, "epoch": 3.1089217296113847, "step": 2840}, {"loss": 0.6631, "grad_norm": 0.7559131383895874, "learning_rate": 0.0002, "epoch": 3.1198686371100166, "step": 2850}, {"loss": 0.6252, "grad_norm": 0.7213780879974365, "learning_rate": 0.0002, "epoch": 3.130815544608648, "step": 2860}, {"loss": 0.6501, "grad_norm": 0.945939838886261, "learning_rate": 0.0002, "epoch": 3.1417624521072796, "step": 2870}, {"loss": 0.6129, "grad_norm": 0.7277454137802124, "learning_rate": 0.0002, "epoch": 3.1527093596059115, "step": 2880}, {"loss": 0.6423, "grad_norm": 0.762026846408844, "learning_rate": 0.0002, "epoch": 3.163656267104543, "step": 2890}, {"loss": 0.5332, "grad_norm": 0.6471221446990967, "learning_rate": 0.0002, "epoch": 3.1746031746031744, "step": 2900}, {"loss": 0.7981, "grad_norm": 0.6018978357315063, "learning_rate": 0.0002, "epoch": 3.1855500821018063, "step": 2910}, {"loss": 0.7274, "grad_norm": 0.8607320785522461, "learning_rate": 0.0002, "epoch": 3.196496989600438, "step": 2920}, {"loss": 0.6139, "grad_norm": 0.8854126334190369, "learning_rate": 0.0002, "epoch": 3.2074438970990693, "step": 2930}, {"loss": 0.6485, "grad_norm": 0.6620870232582092, "learning_rate": 0.0002, "epoch": 3.218390804597701, "step": 2940}, {"loss": 0.6969, "grad_norm": 0.7377511858940125, "learning_rate": 0.0002, "epoch": 3.2293377120963327, "step": 2950}, {"loss": 0.6798, "grad_norm": 0.7803301811218262, "learning_rate": 0.0002, "epoch": 3.2402846195949646, "step": 2960}, {"loss": 0.6697, "grad_norm": 0.834061861038208, "learning_rate": 0.0002, "epoch": 3.251231527093596, "step": 2970}, {"loss": 0.6894, "grad_norm": 0.8496041893959045, "learning_rate": 0.0002, "epoch": 3.2621784345922276, "step": 2980}, {"loss": 0.6591, "grad_norm": 0.7967984676361084, "learning_rate": 0.0002, "epoch": 3.2731253420908595, "step": 2990}, {"loss": 0.7266, "grad_norm": 1.0207016468048096, "learning_rate": 0.0002, "epoch": 3.284072249589491, "step": 3000}, {"loss": 0.6586, "grad_norm": 0.7855771780014038, "learning_rate": 0.0002, "epoch": 3.2950191570881224, "step": 3010}, {"loss": 0.5711, "grad_norm": 0.9427546858787537, "learning_rate": 0.0002, "epoch": 3.3059660645867543, "step": 3020}, {"loss": 0.6277, "grad_norm": 0.823542594909668, "learning_rate": 0.0002, "epoch": 3.316912972085386, "step": 3030}, {"loss": 0.7109, "grad_norm": 0.9826635122299194, "learning_rate": 0.0002, "epoch": 3.3278598795840173, "step": 3040}, {"loss": 0.6564, "grad_norm": 0.7259827852249146, "learning_rate": 0.0002, "epoch": 3.338806787082649, "step": 3050}, {"loss": 0.653, "grad_norm": 0.7774739861488342, "learning_rate": 0.0002, "epoch": 3.3497536945812807, "step": 3060}, {"loss": 0.7529, "grad_norm": 0.7394293546676636, "learning_rate": 0.0002, "epoch": 3.3607006020799126, "step": 3070}, {"loss": 0.5987, "grad_norm": 0.9017578959465027, "learning_rate": 0.0002, "epoch": 3.371647509578544, "step": 3080}, {"loss": 0.6953, "grad_norm": 0.7451054453849792, "learning_rate": 0.0002, "epoch": 3.3825944170771756, "step": 3090}, {"loss": 0.6759, "grad_norm": 0.7321506142616272, "learning_rate": 0.0002, "epoch": 3.3935413245758075, "step": 3100}, {"loss": 0.6555, "grad_norm": 0.6721828579902649, "learning_rate": 0.0002, "epoch": 3.404488232074439, "step": 3110}, {"loss": 0.6559, "grad_norm": 0.774022102355957, "learning_rate": 0.0002, "epoch": 3.4154351395730704, "step": 3120}, {"loss": 0.7449, "grad_norm": 0.9143537282943726, "learning_rate": 0.0002, "epoch": 3.4263820470717024, "step": 3130}, {"loss": 0.6899, "grad_norm": 1.226087212562561, "learning_rate": 0.0002, "epoch": 3.437328954570334, "step": 3140}, {"loss": 0.6719, "grad_norm": 0.7545496225357056, "learning_rate": 0.0002, "epoch": 3.4482758620689653, "step": 3150}, {"loss": 0.6153, "grad_norm": 0.6515635848045349, "learning_rate": 0.0002, "epoch": 3.4592227695675972, "step": 3160}, {"loss": 0.6926, "grad_norm": 0.9297090172767639, "learning_rate": 0.0002, "epoch": 3.4701696770662287, "step": 3170}, {"loss": 0.6071, "grad_norm": 1.0130730867385864, "learning_rate": 0.0002, "epoch": 3.4811165845648606, "step": 3180}, {"loss": 0.5959, "grad_norm": 0.7654589414596558, "learning_rate": 0.0002, "epoch": 3.492063492063492, "step": 3190}, {"loss": 0.7401, "grad_norm": 0.9954977631568909, "learning_rate": 0.0002, "epoch": 3.5030103995621236, "step": 3200}, {"loss": 0.6661, "grad_norm": 0.6027487516403198, "learning_rate": 0.0002, "epoch": 3.5139573070607555, "step": 3210}, {"loss": 0.6963, "grad_norm": 0.741770327091217, "learning_rate": 0.0002, "epoch": 3.524904214559387, "step": 3220}, {"loss": 0.8112, "grad_norm": 1.0534909963607788, "learning_rate": 0.0002, "epoch": 3.535851122058019, "step": 3230}, {"loss": 0.6813, "grad_norm": 0.937772274017334, "learning_rate": 0.0002, "epoch": 3.5467980295566504, "step": 3240}, {"loss": 0.6681, "grad_norm": 0.8504213690757751, "learning_rate": 0.0002, "epoch": 3.557744937055282, "step": 3250}, {"loss": 0.6436, "grad_norm": 0.7755007147789001, "learning_rate": 0.0002, "epoch": 3.5686918445539133, "step": 3260}, {"loss": 0.6213, "grad_norm": 1.0193358659744263, "learning_rate": 0.0002, "epoch": 3.5796387520525452, "step": 3270}, {"loss": 0.671, "grad_norm": 0.8440536856651306, "learning_rate": 0.0002, "epoch": 3.5905856595511767, "step": 3280}, {"loss": 0.6859, "grad_norm": 0.6195939183235168, "learning_rate": 0.0002, "epoch": 3.6015325670498086, "step": 3290}, {"loss": 0.7446, "grad_norm": 0.8608590960502625, "learning_rate": 0.0002, "epoch": 3.61247947454844, "step": 3300}, {"loss": 0.7301, "grad_norm": 0.6772327423095703, "learning_rate": 0.0002, "epoch": 3.6234263820470716, "step": 3310}, {"loss": 0.6298, "grad_norm": 0.8031839728355408, "learning_rate": 0.0002, "epoch": 3.6343732895457035, "step": 3320}, {"loss": 0.7041, "grad_norm": 0.6080502271652222, "learning_rate": 0.0002, "epoch": 3.645320197044335, "step": 3330}, {"loss": 0.7431, "grad_norm": 0.8007240891456604, "learning_rate": 0.0002, "epoch": 3.656267104542967, "step": 3340}, {"loss": 0.7446, "grad_norm": 0.8060704469680786, "learning_rate": 0.0002, "epoch": 3.6672140120415984, "step": 3350}, {"loss": 0.6304, "grad_norm": 0.7547586560249329, "learning_rate": 0.0002, "epoch": 3.67816091954023, "step": 3360}, {"loss": 0.7066, "grad_norm": 0.686851978302002, "learning_rate": 0.0002, "epoch": 3.6891078270388613, "step": 3370}, {"loss": 0.6748, "grad_norm": 0.9429075717926025, "learning_rate": 0.0002, "epoch": 3.7000547345374932, "step": 3380}, {"loss": 0.6673, "grad_norm": 0.7283591032028198, "learning_rate": 0.0002, "epoch": 3.7110016420361247, "step": 3390}, {"loss": 0.7502, "grad_norm": 0.8323085904121399, "learning_rate": 0.0002, "epoch": 3.7219485495347566, "step": 3400}, {"loss": 0.7779, "grad_norm": 0.8529590964317322, "learning_rate": 0.0002, "epoch": 3.732895457033388, "step": 3410}, {"loss": 0.6555, "grad_norm": 0.731752872467041, "learning_rate": 0.0002, "epoch": 3.7438423645320196, "step": 3420}, {"loss": 0.6928, "grad_norm": 0.8572278618812561, "learning_rate": 0.0002, "epoch": 3.7547892720306515, "step": 3430}, {"loss": 0.6215, "grad_norm": 0.7408691048622131, "learning_rate": 0.0002, "epoch": 3.765736179529283, "step": 3440}, {"loss": 0.622, "grad_norm": 0.7470445036888123, "learning_rate": 0.0002, "epoch": 3.776683087027915, "step": 3450}, {"loss": 0.7241, "grad_norm": 0.6806244254112244, "learning_rate": 0.0002, "epoch": 3.7876299945265464, "step": 3460}, {"loss": 0.7739, "grad_norm": 0.9129069447517395, "learning_rate": 0.0002, "epoch": 3.798576902025178, "step": 3470}, {"loss": 0.6826, "grad_norm": 0.8717501759529114, "learning_rate": 0.0002, "epoch": 3.8095238095238093, "step": 3480}, {"loss": 0.6188, "grad_norm": 0.6761979460716248, "learning_rate": 0.0002, "epoch": 3.8204707170224412, "step": 3490}, {"loss": 0.7601, "grad_norm": 1.0054380893707275, "learning_rate": 0.0002, "epoch": 3.8314176245210727, "step": 3500}, {"loss": 0.623, "grad_norm": 1.1224009990692139, "learning_rate": 0.0002, "epoch": 3.8423645320197046, "step": 3510}, {"loss": 0.6918, "grad_norm": 0.8997692465782166, "learning_rate": 0.0002, "epoch": 3.853311439518336, "step": 3520}, {"loss": 0.6357, "grad_norm": 1.0086902379989624, "learning_rate": 0.0002, "epoch": 3.8642583470169676, "step": 3530}, {"loss": 0.6379, "grad_norm": 0.772739589214325, "learning_rate": 0.0002, "epoch": 3.8752052545155995, "step": 3540}, {"loss": 0.7423, "grad_norm": 1.211774230003357, "learning_rate": 0.0002, "epoch": 3.886152162014231, "step": 3550}, {"loss": 0.7321, "grad_norm": 0.9572356939315796, "learning_rate": 0.0002, "epoch": 3.897099069512863, "step": 3560}, {"loss": 0.6836, "grad_norm": 0.7887842655181885, "learning_rate": 0.0002, "epoch": 3.9080459770114944, "step": 3570}, {"loss": 0.7576, "grad_norm": 0.7308389544487, "learning_rate": 0.0002, "epoch": 3.918992884510126, "step": 3580}, {"loss": 0.6001, "grad_norm": 1.0182650089263916, "learning_rate": 0.0002, "epoch": 3.9299397920087573, "step": 3590}, {"loss": 0.6942, "grad_norm": 0.8000147342681885, "learning_rate": 0.0002, "epoch": 3.9408866995073892, "step": 3600}, {"loss": 0.6244, "grad_norm": 0.7385728359222412, "learning_rate": 0.0002, "epoch": 3.9518336070060207, "step": 3610}, {"loss": 0.6718, "grad_norm": 0.9233261942863464, "learning_rate": 0.0002, "epoch": 3.9627805145046526, "step": 3620}, {"loss": 0.6508, "grad_norm": 0.8486751914024353, "learning_rate": 0.0002, "epoch": 3.973727422003284, "step": 3630}, {"loss": 0.6928, "grad_norm": 0.7593663334846497, "learning_rate": 0.0002, "epoch": 3.9846743295019156, "step": 3640}, {"loss": 0.6847, "grad_norm": 0.7885415554046631, "learning_rate": 0.0002, "epoch": 3.9956212370005475, "step": 3650}, {"eval_loss": 1.250312328338623, "eval_runtime": 46.0842, "eval_samples_per_second": 9.461, "eval_steps_per_second": 1.193, "epoch": 4.0, "step": 3654}, {"loss": 0.5547, "grad_norm": 0.6591703295707703, "learning_rate": 0.0002, "epoch": 4.006568144499179, "step": 3660}, {"loss": 0.5301, "grad_norm": 1.36927330493927, "learning_rate": 0.0002, "epoch": 4.017515051997811, "step": 3670}, {"loss": 0.4466, "grad_norm": 0.8106328845024109, "learning_rate": 0.0002, "epoch": 4.028461959496442, "step": 3680}, {"loss": 0.4861, "grad_norm": 0.7592712044715881, "learning_rate": 0.0002, "epoch": 4.039408866995074, "step": 3690}, {"loss": 0.5103, "grad_norm": 0.9518909454345703, "learning_rate": 0.0002, "epoch": 4.050355774493705, "step": 3700}, {"loss": 0.4638, "grad_norm": 0.7805967330932617, "learning_rate": 0.0002, "epoch": 4.061302681992337, "step": 3710}, {"loss": 0.4556, "grad_norm": 1.3146334886550903, "learning_rate": 0.0002, "epoch": 4.072249589490969, "step": 3720}, {"loss": 0.5635, "grad_norm": 1.1611138582229614, "learning_rate": 0.0002, "epoch": 4.083196496989601, "step": 3730}, {"loss": 0.3845, "grad_norm": 0.8173232078552246, "learning_rate": 0.0002, "epoch": 4.094143404488232, "step": 3740}, {"loss": 0.4911, "grad_norm": 0.7848323583602905, "learning_rate": 0.0002, "epoch": 4.105090311986864, "step": 3750}, {"loss": 0.4519, "grad_norm": 1.3183201551437378, "learning_rate": 0.0002, "epoch": 4.116037219485495, "step": 3760}, {"loss": 0.5083, "grad_norm": 1.1936529874801636, "learning_rate": 0.0002, "epoch": 4.1269841269841265, "step": 3770}, {"loss": 0.5208, "grad_norm": 1.1078993082046509, "learning_rate": 0.0002, "epoch": 4.137931034482759, "step": 3780}, {"loss": 0.5928, "grad_norm": 1.107743263244629, "learning_rate": 0.0002, "epoch": 4.14887794198139, "step": 3790}, {"loss": 0.5112, "grad_norm": 0.7801875472068787, "learning_rate": 0.0002, "epoch": 4.159824849480022, "step": 3800}, {"loss": 0.4896, "grad_norm": 1.1328117847442627, "learning_rate": 0.0002, "epoch": 4.170771756978653, "step": 3810}, {"loss": 0.5645, "grad_norm": 1.4232193231582642, "learning_rate": 0.0002, "epoch": 4.181718664477285, "step": 3820}, {"loss": 0.5049, "grad_norm": 1.557416558265686, "learning_rate": 0.0002, "epoch": 4.192665571975917, "step": 3830}, {"loss": 0.4863, "grad_norm": 1.042923092842102, "learning_rate": 0.0002, "epoch": 4.203612479474549, "step": 3840}, {"loss": 0.3751, "grad_norm": 1.1801949739456177, "learning_rate": 0.0002, "epoch": 4.21455938697318, "step": 3850}, {"loss": 0.5063, "grad_norm": 0.9273753762245178, "learning_rate": 0.0002, "epoch": 4.225506294471812, "step": 3860}, {"loss": 0.5542, "grad_norm": 0.7681763768196106, "learning_rate": 0.0002, "epoch": 4.236453201970443, "step": 3870}, {"loss": 0.5971, "grad_norm": 0.9840841293334961, "learning_rate": 0.0002, "epoch": 4.2474001094690745, "step": 3880}, {"loss": 0.4648, "grad_norm": 1.0290725231170654, "learning_rate": 0.0002, "epoch": 4.258347016967707, "step": 3890}, {"loss": 0.4288, "grad_norm": 0.8059597611427307, "learning_rate": 0.0002, "epoch": 4.269293924466338, "step": 3900}, {"loss": 0.5103, "grad_norm": 0.9847467541694641, "learning_rate": 0.0002, "epoch": 4.28024083196497, "step": 3910}, {"loss": 0.4952, "grad_norm": 1.344044804573059, "learning_rate": 0.0002, "epoch": 4.291187739463601, "step": 3920}, {"loss": 0.4966, "grad_norm": 0.9174224138259888, "learning_rate": 0.0002, "epoch": 4.302134646962233, "step": 3930}, {"loss": 0.4944, "grad_norm": 1.1199711561203003, "learning_rate": 0.0002, "epoch": 4.313081554460865, "step": 3940}, {"loss": 0.4641, "grad_norm": 1.0120296478271484, "learning_rate": 0.0002, "epoch": 4.324028461959497, "step": 3950}, {"loss": 0.4723, "grad_norm": 1.091811180114746, "learning_rate": 0.0002, "epoch": 4.334975369458128, "step": 3960}, {"loss": 0.4627, "grad_norm": 1.0332133769989014, "learning_rate": 0.0002, "epoch": 4.34592227695676, "step": 3970}, {"loss": 0.4646, "grad_norm": 1.0785295963287354, "learning_rate": 0.0002, "epoch": 4.356869184455391, "step": 3980}, {"loss": 0.4909, "grad_norm": 1.0506969690322876, "learning_rate": 0.0002, "epoch": 4.3678160919540225, "step": 3990}, {"loss": 0.4776, "grad_norm": 1.047560691833496, "learning_rate": 0.0002, "epoch": 4.378762999452655, "step": 4000}, {"loss": 0.4549, "grad_norm": 0.9348800778388977, "learning_rate": 0.0002, "epoch": 4.389709906951286, "step": 4010}, {"loss": 0.5333, "grad_norm": 1.1563059091567993, "learning_rate": 0.0002, "epoch": 4.400656814449918, "step": 4020}, {"loss": 0.4952, "grad_norm": 1.001470923423767, "learning_rate": 0.0002, "epoch": 4.411603721948549, "step": 4030}, {"loss": 0.4972, "grad_norm": 1.309012532234192, "learning_rate": 0.0002, "epoch": 4.422550629447181, "step": 4040}, {"loss": 0.5078, "grad_norm": 0.7338925004005432, "learning_rate": 0.0002, "epoch": 4.433497536945813, "step": 4050}, {"loss": 0.4632, "grad_norm": 1.0398834943771362, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 4060}, {"loss": 0.6285, "grad_norm": 0.9728689193725586, "learning_rate": 0.0002, "epoch": 4.455391351943076, "step": 4070}, {"loss": 0.4778, "grad_norm": 1.247475028038025, "learning_rate": 0.0002, "epoch": 4.466338259441708, "step": 4080}, {"loss": 0.4813, "grad_norm": 1.1084578037261963, "learning_rate": 0.0002, "epoch": 4.477285166940339, "step": 4090}, {"loss": 0.5665, "grad_norm": 1.1619318723678589, "learning_rate": 0.0002, "epoch": 4.4882320744389705, "step": 4100}, {"loss": 0.5207, "grad_norm": 1.3456498384475708, "learning_rate": 0.0002, "epoch": 4.499178981937603, "step": 4110}, {"loss": 0.4876, "grad_norm": 0.9372991323471069, "learning_rate": 0.0002, "epoch": 4.510125889436234, "step": 4120}, {"loss": 0.5456, "grad_norm": 1.0071815252304077, "learning_rate": 0.0002, "epoch": 4.521072796934866, "step": 4130}, {"loss": 0.5589, "grad_norm": 1.190344214439392, "learning_rate": 0.0002, "epoch": 4.532019704433497, "step": 4140}, {"loss": 0.4852, "grad_norm": 0.9480887055397034, "learning_rate": 0.0002, "epoch": 4.542966611932129, "step": 4150}, {"loss": 0.5229, "grad_norm": 1.0252189636230469, "learning_rate": 0.0002, "epoch": 4.553913519430761, "step": 4160}, {"loss": 0.5253, "grad_norm": 0.7142013311386108, "learning_rate": 0.0002, "epoch": 4.564860426929393, "step": 4170}, {"loss": 0.4861, "grad_norm": 0.8937426805496216, "learning_rate": 0.0002, "epoch": 4.575807334428024, "step": 4180}, {"loss": 0.4773, "grad_norm": 0.8885005116462708, "learning_rate": 0.0002, "epoch": 4.586754241926656, "step": 4190}, {"loss": 0.4858, "grad_norm": 1.337663173675537, "learning_rate": 0.0002, "epoch": 4.597701149425287, "step": 4200}, {"loss": 0.5247, "grad_norm": 1.0475375652313232, "learning_rate": 0.0002, "epoch": 4.6086480569239185, "step": 4210}, {"loss": 0.5298, "grad_norm": 1.0081088542938232, "learning_rate": 0.0002, "epoch": 4.619594964422551, "step": 4220}, {"loss": 0.5042, "grad_norm": 0.7527595162391663, "learning_rate": 0.0002, "epoch": 4.630541871921182, "step": 4230}, {"loss": 0.5207, "grad_norm": 1.55559241771698, "learning_rate": 0.0002, "epoch": 4.641488779419814, "step": 4240}, {"loss": 0.5468, "grad_norm": 0.7967379689216614, "learning_rate": 0.0002, "epoch": 4.652435686918445, "step": 4250}, {"loss": 0.5328, "grad_norm": 0.898368775844574, "learning_rate": 0.0002, "epoch": 4.663382594417077, "step": 4260}, {"loss": 0.4706, "grad_norm": 1.1940776109695435, "learning_rate": 0.0002, "epoch": 4.674329501915709, "step": 4270}, {"loss": 0.5121, "grad_norm": 1.1817092895507812, "learning_rate": 0.0002, "epoch": 4.685276409414341, "step": 4280}, {"loss": 0.5758, "grad_norm": 0.9041520357131958, "learning_rate": 0.0002, "epoch": 4.696223316912972, "step": 4290}, {"loss": 0.5851, "grad_norm": 1.1280102729797363, "learning_rate": 0.0002, "epoch": 4.707170224411604, "step": 4300}, {"loss": 0.4891, "grad_norm": 1.357689619064331, "learning_rate": 0.0002, "epoch": 4.718117131910235, "step": 4310}, {"loss": 0.4704, "grad_norm": 1.056633472442627, "learning_rate": 0.0002, "epoch": 4.7290640394088665, "step": 4320}, {"loss": 0.5488, "grad_norm": 1.6520427465438843, "learning_rate": 0.0002, "epoch": 4.740010946907499, "step": 4330}, {"loss": 0.5131, "grad_norm": 1.153200626373291, "learning_rate": 0.0002, "epoch": 4.75095785440613, "step": 4340}, {"loss": 0.539, "grad_norm": 0.9346241354942322, "learning_rate": 0.0002, "epoch": 4.761904761904762, "step": 4350}, {"loss": 0.4941, "grad_norm": 0.8628455996513367, "learning_rate": 0.0002, "epoch": 4.772851669403393, "step": 4360}, {"loss": 0.5167, "grad_norm": 1.3843916654586792, "learning_rate": 0.0002, "epoch": 4.783798576902025, "step": 4370}, {"loss": 0.4683, "grad_norm": 1.035574197769165, "learning_rate": 0.0002, "epoch": 4.794745484400657, "step": 4380}, {"loss": 0.5162, "grad_norm": 1.1868361234664917, "learning_rate": 0.0002, "epoch": 4.805692391899289, "step": 4390}, {"loss": 0.534, "grad_norm": 1.1307647228240967, "learning_rate": 0.0002, "epoch": 4.81663929939792, "step": 4400}, {"loss": 0.5567, "grad_norm": 0.9787724614143372, "learning_rate": 0.0002, "epoch": 4.827586206896552, "step": 4410}, {"loss": 0.5185, "grad_norm": 1.0473824739456177, "learning_rate": 0.0002, "epoch": 4.838533114395183, "step": 4420}, {"loss": 0.6285, "grad_norm": 1.069069504737854, "learning_rate": 0.0002, "epoch": 4.8494800218938146, "step": 4430}, {"loss": 0.5267, "grad_norm": 1.4305680990219116, "learning_rate": 0.0002, "epoch": 4.860426929392447, "step": 4440}, {"loss": 0.5947, "grad_norm": 1.3679203987121582, "learning_rate": 0.0002, "epoch": 4.871373836891078, "step": 4450}, {"loss": 0.5135, "grad_norm": 0.8997844457626343, "learning_rate": 0.0002, "epoch": 4.88232074438971, "step": 4460}, {"loss": 0.5312, "grad_norm": 1.2758110761642456, "learning_rate": 0.0002, "epoch": 4.893267651888341, "step": 4470}, {"loss": 0.4914, "grad_norm": 0.8819465637207031, "learning_rate": 0.0002, "epoch": 4.904214559386973, "step": 4480}, {"loss": 0.5147, "grad_norm": 1.08329439163208, "learning_rate": 0.0002, "epoch": 4.915161466885605, "step": 4490}, {"loss": 0.5404, "grad_norm": 1.083461046218872, "learning_rate": 0.0002, "epoch": 4.926108374384237, "step": 4500}, {"loss": 0.5433, "grad_norm": 1.2387723922729492, "learning_rate": 0.0002, "epoch": 4.937055281882868, "step": 4510}, {"loss": 0.5624, "grad_norm": 0.8262293934822083, "learning_rate": 0.0002, "epoch": 4.9480021893815, "step": 4520}, {"loss": 0.504, "grad_norm": 1.2325191497802734, "learning_rate": 0.0002, "epoch": 4.958949096880131, "step": 4530}, {"loss": 0.5452, "grad_norm": 1.024614930152893, "learning_rate": 0.0002, "epoch": 4.9698960043787626, "step": 4540}, {"loss": 0.4752, "grad_norm": 1.3007521629333496, "learning_rate": 0.0002, "epoch": 4.980842911877395, "step": 4550}, {"loss": 0.4943, "grad_norm": 0.9823828339576721, "learning_rate": 0.0002, "epoch": 4.991789819376026, "step": 4560}, {"eval_loss": 1.3920727968215942, "eval_runtime": 46.0764, "eval_samples_per_second": 9.463, "eval_steps_per_second": 1.194, "epoch": 4.999452654625069, "step": 4567}, {"loss": 0.545, "grad_norm": 1.1478906869888306, "learning_rate": 0.0002, "epoch": 5.002736726874658, "step": 4570}, {"loss": 0.372, "grad_norm": 1.0533705949783325, "learning_rate": 0.0002, "epoch": 5.013683634373289, "step": 4580}, {"loss": 0.3313, "grad_norm": 1.268900752067566, "learning_rate": 0.0002, "epoch": 5.024630541871921, "step": 4590}, {"loss": 0.3482, "grad_norm": 1.222652554512024, "learning_rate": 0.0002, "epoch": 5.035577449370553, "step": 4600}, {"loss": 0.3195, "grad_norm": 1.5093127489089966, "learning_rate": 0.0002, "epoch": 5.046524356869185, "step": 4610}, {"loss": 0.3569, "grad_norm": 1.2372499704360962, "learning_rate": 0.0002, "epoch": 5.057471264367816, "step": 4620}, {"loss": 0.3206, "grad_norm": 0.8422666192054749, "learning_rate": 0.0002, "epoch": 5.068418171866448, "step": 4630}, {"loss": 0.3115, "grad_norm": 1.1451770067214966, "learning_rate": 0.0002, "epoch": 5.079365079365079, "step": 4640}, {"loss": 0.3305, "grad_norm": 1.2074557542800903, "learning_rate": 0.0002, "epoch": 5.090311986863711, "step": 4650}, {"loss": 0.3012, "grad_norm": 1.429150104522705, "learning_rate": 0.0002, "epoch": 5.101258894362343, "step": 4660}, {"loss": 0.3229, "grad_norm": 1.0353610515594482, "learning_rate": 0.0002, "epoch": 5.112205801860974, "step": 4670}, {"loss": 0.402, "grad_norm": 1.2845979928970337, "learning_rate": 0.0002, "epoch": 5.123152709359606, "step": 4680}, {"loss": 0.383, "grad_norm": 1.3790186643600464, "learning_rate": 0.0002, "epoch": 5.134099616858237, "step": 4690}, {"loss": 0.2951, "grad_norm": 1.3182239532470703, "learning_rate": 0.0002, "epoch": 5.145046524356869, "step": 4700}, {"loss": 0.4074, "grad_norm": 1.5249626636505127, "learning_rate": 0.0002, "epoch": 5.155993431855501, "step": 4710}, {"loss": 0.3703, "grad_norm": 1.2492733001708984, "learning_rate": 0.0002, "epoch": 5.166940339354133, "step": 4720}, {"loss": 0.3411, "grad_norm": 1.4455480575561523, "learning_rate": 0.0002, "epoch": 5.177887246852764, "step": 4730}, {"loss": 0.3996, "grad_norm": 1.2191482782363892, "learning_rate": 0.0002, "epoch": 5.188834154351396, "step": 4740}, {"loss": 0.3785, "grad_norm": 1.4707951545715332, "learning_rate": 0.0002, "epoch": 5.199781061850027, "step": 4750}, {"loss": 0.3516, "grad_norm": 1.3473678827285767, "learning_rate": 0.0002, "epoch": 5.210727969348659, "step": 4760}, {"loss": 0.3266, "grad_norm": 1.0479670763015747, "learning_rate": 0.0002, "epoch": 5.221674876847291, "step": 4770}, {"loss": 0.3976, "grad_norm": 1.299096703529358, "learning_rate": 0.0002, "epoch": 5.232621784345922, "step": 4780}, {"loss": 0.3266, "grad_norm": 1.2820168733596802, "learning_rate": 0.0002, "epoch": 5.243568691844554, "step": 4790}, {"loss": 0.3347, "grad_norm": 1.3818004131317139, "learning_rate": 0.0002, "epoch": 5.254515599343185, "step": 4800}, {"loss": 0.3761, "grad_norm": 1.2898736000061035, "learning_rate": 0.0002, "epoch": 5.265462506841817, "step": 4810}, {"loss": 0.3694, "grad_norm": 1.1761468648910522, "learning_rate": 0.0002, "epoch": 5.276409414340449, "step": 4820}, {"loss": 0.3806, "grad_norm": 1.7155952453613281, "learning_rate": 0.0002, "epoch": 5.287356321839081, "step": 4830}, {"loss": 0.322, "grad_norm": 0.9103642106056213, "learning_rate": 0.0002, "epoch": 5.298303229337712, "step": 4840}, {"loss": 0.3516, "grad_norm": 1.013015627861023, "learning_rate": 0.0002, "epoch": 5.309250136836344, "step": 4850}, {"loss": 0.4297, "grad_norm": 1.390471339225769, "learning_rate": 0.0002, "epoch": 5.320197044334975, "step": 4860}, {"loss": 0.4098, "grad_norm": 1.129770278930664, "learning_rate": 0.0002, "epoch": 5.331143951833607, "step": 4870}, {"loss": 0.4227, "grad_norm": 1.1461067199707031, "learning_rate": 0.0002, "epoch": 5.342090859332239, "step": 4880}, {"loss": 0.288, "grad_norm": 1.3587424755096436, "learning_rate": 0.0002, "epoch": 5.35303776683087, "step": 4890}, {"loss": 0.3604, "grad_norm": 1.6897879838943481, "learning_rate": 0.0002, "epoch": 5.363984674329502, "step": 4900}, {"loss": 0.3887, "grad_norm": 0.9298055768013, "learning_rate": 0.0002, "epoch": 5.374931581828133, "step": 4910}, {"loss": 0.3371, "grad_norm": 1.0006917715072632, "learning_rate": 0.0002, "epoch": 5.385878489326765, "step": 4920}, {"loss": 0.3992, "grad_norm": 1.232581377029419, "learning_rate": 0.0002, "epoch": 5.396825396825397, "step": 4930}, {"loss": 0.3456, "grad_norm": 1.0822620391845703, "learning_rate": 0.0002, "epoch": 5.407772304324029, "step": 4940}, {"loss": 0.3806, "grad_norm": 1.3648720979690552, "learning_rate": 0.0002, "epoch": 5.41871921182266, "step": 4950}, {"loss": 0.3959, "grad_norm": 1.3220354318618774, "learning_rate": 0.0002, "epoch": 5.429666119321292, "step": 4960}, {"loss": 0.3278, "grad_norm": 1.1106271743774414, "learning_rate": 0.0002, "epoch": 5.440613026819923, "step": 4970}, {"loss": 0.3812, "grad_norm": 1.6058908700942993, "learning_rate": 0.0002, "epoch": 5.451559934318555, "step": 4980}, {"loss": 0.3905, "grad_norm": 1.1065930128097534, "learning_rate": 0.0002, "epoch": 5.462506841817187, "step": 4990}, {"loss": 0.4058, "grad_norm": 1.3896466493606567, "learning_rate": 0.0002, "epoch": 5.473453749315818, "step": 5000}, {"loss": 0.4122, "grad_norm": 1.0437148809432983, "learning_rate": 0.0002, "epoch": 5.48440065681445, "step": 5010}, {"loss": 0.4065, "grad_norm": 1.2347718477249146, "learning_rate": 0.0002, "epoch": 5.495347564313081, "step": 5020}, {"loss": 0.3586, "grad_norm": 1.1174284219741821, "learning_rate": 0.0002, "epoch": 5.506294471811713, "step": 5030}, {"loss": 0.3576, "grad_norm": 1.2580941915512085, "learning_rate": 0.0002, "epoch": 5.517241379310345, "step": 5040}, {"loss": 0.3809, "grad_norm": 1.451090931892395, "learning_rate": 0.0002, "epoch": 5.528188286808977, "step": 5050}, {"loss": 0.3645, "grad_norm": 1.4688365459442139, "learning_rate": 0.0002, "epoch": 5.539135194307608, "step": 5060}, {"loss": 0.4431, "grad_norm": 1.1625734567642212, "learning_rate": 0.0002, "epoch": 5.55008210180624, "step": 5070}, {"loss": 0.3972, "grad_norm": 0.9332265257835388, "learning_rate": 0.0002, "epoch": 5.561029009304871, "step": 5080}, {"loss": 0.4, "grad_norm": 1.5635273456573486, "learning_rate": 0.0002, "epoch": 5.571975916803503, "step": 5090}, {"loss": 0.3651, "grad_norm": 1.3420509099960327, "learning_rate": 0.0002, "epoch": 5.582922824302135, "step": 5100}, {"loss": 0.3717, "grad_norm": 1.5826557874679565, "learning_rate": 0.0002, "epoch": 5.593869731800766, "step": 5110}, {"loss": 0.4256, "grad_norm": 1.5737065076828003, "learning_rate": 0.0002, "epoch": 5.604816639299398, "step": 5120}, {"loss": 0.39, "grad_norm": 1.3812499046325684, "learning_rate": 0.0002, "epoch": 5.615763546798029, "step": 5130}, {"loss": 0.3891, "grad_norm": 1.362833023071289, "learning_rate": 0.0002, "epoch": 5.626710454296661, "step": 5140}, {"loss": 0.455, "grad_norm": 1.7667874097824097, "learning_rate": 0.0002, "epoch": 5.637657361795293, "step": 5150}, {"loss": 0.4264, "grad_norm": 1.2661789655685425, "learning_rate": 0.0002, "epoch": 5.648604269293925, "step": 5160}, {"loss": 0.3261, "grad_norm": 1.2076870203018188, "learning_rate": 0.0002, "epoch": 5.659551176792556, "step": 5170}, {"loss": 0.372, "grad_norm": 1.2431524991989136, "learning_rate": 0.0002, "epoch": 5.670498084291188, "step": 5180}, {"loss": 0.4092, "grad_norm": 1.2216639518737793, "learning_rate": 0.0002, "epoch": 5.681444991789819, "step": 5190}, {"loss": 0.4171, "grad_norm": 0.9259352684020996, "learning_rate": 0.0002, "epoch": 5.692391899288451, "step": 5200}, {"loss": 0.3875, "grad_norm": 1.7929338216781616, "learning_rate": 0.0002, "epoch": 5.703338806787083, "step": 5210}, {"loss": 0.4424, "grad_norm": 1.4048460721969604, "learning_rate": 0.0002, "epoch": 5.714285714285714, "step": 5220}, {"loss": 0.3758, "grad_norm": 1.306874394416809, "learning_rate": 0.0002, "epoch": 5.725232621784346, "step": 5230}, {"loss": 0.3889, "grad_norm": 1.3137940168380737, "learning_rate": 0.0002, "epoch": 5.736179529282977, "step": 5240}, {"loss": 0.4804, "grad_norm": 1.1376476287841797, "learning_rate": 0.0002, "epoch": 5.747126436781609, "step": 5250}, {"loss": 0.377, "grad_norm": 1.450939416885376, "learning_rate": 0.0002, "epoch": 5.758073344280241, "step": 5260}, {"loss": 0.4732, "grad_norm": 0.983195960521698, "learning_rate": 0.0002, "epoch": 5.769020251778873, "step": 5270}, {"loss": 0.4041, "grad_norm": 1.66558837890625, "learning_rate": 0.0002, "epoch": 5.779967159277504, "step": 5280}, {"loss": 0.3643, "grad_norm": 0.9789204597473145, "learning_rate": 0.0002, "epoch": 5.790914066776136, "step": 5290}, {"loss": 0.3776, "grad_norm": 1.2110556364059448, "learning_rate": 0.0002, "epoch": 5.801860974274767, "step": 5300}, {"loss": 0.4049, "grad_norm": 1.3799304962158203, "learning_rate": 0.0002, "epoch": 5.812807881773399, "step": 5310}, {"loss": 0.4362, "grad_norm": 1.0570626258850098, "learning_rate": 0.0002, "epoch": 5.823754789272031, "step": 5320}, {"loss": 0.4716, "grad_norm": 1.4654436111450195, "learning_rate": 0.0002, "epoch": 5.834701696770662, "step": 5330}, {"loss": 0.4048, "grad_norm": 1.5216940641403198, "learning_rate": 0.0002, "epoch": 5.845648604269294, "step": 5340}, {"loss": 0.3848, "grad_norm": 1.018646001815796, "learning_rate": 0.0002, "epoch": 5.856595511767925, "step": 5350}, {"loss": 0.3705, "grad_norm": 1.028951644897461, "learning_rate": 0.0002, "epoch": 5.867542419266557, "step": 5360}, {"loss": 0.4213, "grad_norm": 2.571263313293457, "learning_rate": 0.0002, "epoch": 5.878489326765189, "step": 5370}, {"loss": 0.3647, "grad_norm": 1.3323984146118164, "learning_rate": 0.0002, "epoch": 5.889436234263821, "step": 5380}, {"loss": 0.4085, "grad_norm": 1.4317777156829834, "learning_rate": 0.0002, "epoch": 5.900383141762452, "step": 5390}, {"loss": 0.4254, "grad_norm": 1.4289140701293945, "learning_rate": 0.0002, "epoch": 5.911330049261084, "step": 5400}, {"loss": 0.3993, "grad_norm": 1.3130780458450317, "learning_rate": 0.0002, "epoch": 5.922276956759715, "step": 5410}, {"loss": 0.4025, "grad_norm": 1.3979902267456055, "learning_rate": 0.0002, "epoch": 5.933223864258347, "step": 5420}, {"loss": 0.3997, "grad_norm": 1.1827352046966553, "learning_rate": 0.0002, "epoch": 5.944170771756979, "step": 5430}, {"loss": 0.4163, "grad_norm": 1.1672080755233765, "learning_rate": 0.0002, "epoch": 5.95511767925561, "step": 5440}, {"loss": 0.4425, "grad_norm": 1.0949620008468628, "learning_rate": 0.0002, "epoch": 5.966064586754242, "step": 5450}, {"loss": 0.4219, "grad_norm": 1.3183925151824951, "learning_rate": 0.0002, "epoch": 5.977011494252873, "step": 5460}, {"loss": 0.4171, "grad_norm": 1.096198320388794, "learning_rate": 0.0002, "epoch": 5.987958401751505, "step": 5470}, {"loss": 0.3886, "grad_norm": 1.2601423263549805, "learning_rate": 0.0002, "epoch": 5.998905309250137, "step": 5480}, {"eval_loss": 1.611358880996704, "eval_runtime": 46.0638, "eval_samples_per_second": 9.465, "eval_steps_per_second": 1.194, "epoch": 6.0, "step": 5481}, {"loss": 0.2616, "grad_norm": 0.9854364991188049, "learning_rate": 0.0002, "epoch": 6.009852216748769, "step": 5490}, {"loss": 0.2412, "grad_norm": 1.8073689937591553, "learning_rate": 0.0002, "epoch": 6.0207991242474, "step": 5500}, {"loss": 0.2317, "grad_norm": 1.1852164268493652, "learning_rate": 0.0002, "epoch": 6.031746031746032, "step": 5510}, {"loss": 0.224, "grad_norm": 1.0937914848327637, "learning_rate": 0.0002, "epoch": 6.042692939244663, "step": 5520}, {"loss": 0.2473, "grad_norm": 0.7411194443702698, "learning_rate": 0.0002, "epoch": 6.053639846743295, "step": 5530}, {"loss": 0.2846, "grad_norm": 1.552127480506897, "learning_rate": 0.0002, "epoch": 6.064586754241927, "step": 5540}, {"loss": 0.2639, "grad_norm": 1.0465604066848755, "learning_rate": 0.0002, "epoch": 6.075533661740558, "step": 5550}, {"loss": 0.2696, "grad_norm": 1.4008121490478516, "learning_rate": 0.0002, "epoch": 6.08648056923919, "step": 5560}, {"loss": 0.3049, "grad_norm": 1.7049046754837036, "learning_rate": 0.0002, "epoch": 6.097427476737821, "step": 5570}, {"loss": 0.263, "grad_norm": 1.111151933670044, "learning_rate": 0.0002, "epoch": 6.108374384236453, "step": 5580}, {"loss": 0.2816, "grad_norm": 1.4271087646484375, "learning_rate": 0.0002, "epoch": 6.119321291735085, "step": 5590}, {"loss": 0.2878, "grad_norm": 1.3917373418807983, "learning_rate": 0.0002, "epoch": 6.130268199233717, "step": 5600}, {"loss": 0.2482, "grad_norm": 1.013689637184143, "learning_rate": 0.0002, "epoch": 6.141215106732348, "step": 5610}, {"loss": 0.2841, "grad_norm": 1.342645525932312, "learning_rate": 0.0002, "epoch": 6.15216201423098, "step": 5620}, {"loss": 0.2335, "grad_norm": 1.4480562210083008, "learning_rate": 0.0002, "epoch": 6.163108921729611, "step": 5630}, {"loss": 0.2696, "grad_norm": 1.2483175992965698, "learning_rate": 0.0002, "epoch": 6.174055829228243, "step": 5640}, {"loss": 0.2656, "grad_norm": 1.2944550514221191, "learning_rate": 0.0002, "epoch": 6.185002736726875, "step": 5650}, {"loss": 0.2704, "grad_norm": 1.264142632484436, "learning_rate": 0.0002, "epoch": 6.195949644225506, "step": 5660}, {"loss": 0.2971, "grad_norm": 1.2068781852722168, "learning_rate": 0.0002, "epoch": 6.206896551724138, "step": 5670}, {"loss": 0.2882, "grad_norm": 1.0401629209518433, "learning_rate": 0.0002, "epoch": 6.217843459222769, "step": 5680}, {"loss": 0.3022, "grad_norm": 1.2054402828216553, "learning_rate": 0.0002, "epoch": 6.228790366721401, "step": 5690}, {"loss": 0.2949, "grad_norm": 1.1278687715530396, "learning_rate": 0.0002, "epoch": 6.239737274220033, "step": 5700}, {"loss": 0.2477, "grad_norm": 1.24592125415802, "learning_rate": 0.0002, "epoch": 6.250684181718665, "step": 5710}, {"loss": 0.246, "grad_norm": 1.2686697244644165, "learning_rate": 0.0002, "epoch": 6.261631089217296, "step": 5720}, {"loss": 0.2974, "grad_norm": 1.1836518049240112, "learning_rate": 0.0002, "epoch": 6.272577996715928, "step": 5730}, {"loss": 0.2963, "grad_norm": 1.387752890586853, "learning_rate": 0.0002, "epoch": 6.283524904214559, "step": 5740}, {"loss": 0.2961, "grad_norm": 1.9390363693237305, "learning_rate": 0.0002, "epoch": 6.294471811713191, "step": 5750}, {"loss": 0.2765, "grad_norm": 1.2919824123382568, "learning_rate": 0.0002, "epoch": 6.305418719211823, "step": 5760}, {"loss": 0.2898, "grad_norm": 1.2793965339660645, "learning_rate": 0.0002, "epoch": 6.316365626710454, "step": 5770}, {"loss": 0.2786, "grad_norm": 1.5486980676651, "learning_rate": 0.0002, "epoch": 6.327312534209086, "step": 5780}, {"loss": 0.2684, "grad_norm": 1.2757408618927002, "learning_rate": 0.0002, "epoch": 6.338259441707717, "step": 5790}, {"loss": 0.2841, "grad_norm": 1.3245713710784912, "learning_rate": 0.0002, "epoch": 6.349206349206349, "step": 5800}, {"loss": 0.3096, "grad_norm": 1.6262527704238892, "learning_rate": 0.0002, "epoch": 6.360153256704981, "step": 5810}, {"loss": 0.3219, "grad_norm": 1.465224027633667, "learning_rate": 0.0002, "epoch": 6.371100164203613, "step": 5820}, {"loss": 0.2703, "grad_norm": 1.437408447265625, "learning_rate": 0.0002, "epoch": 6.382047071702244, "step": 5830}, {"loss": 0.3012, "grad_norm": 1.3094626665115356, "learning_rate": 0.0002, "epoch": 6.392993979200876, "step": 5840}, {"loss": 0.2991, "grad_norm": 1.6717544794082642, "learning_rate": 0.0002, "epoch": 6.403940886699507, "step": 5850}, {"loss": 0.2892, "grad_norm": 1.1023344993591309, "learning_rate": 0.0002, "epoch": 6.414887794198139, "step": 5860}, {"loss": 0.3078, "grad_norm": 1.2397106885910034, "learning_rate": 0.0002, "epoch": 6.425834701696771, "step": 5870}, {"loss": 0.2984, "grad_norm": 1.6139185428619385, "learning_rate": 0.0002, "epoch": 6.436781609195402, "step": 5880}, {"loss": 0.2353, "grad_norm": 1.3164576292037964, "learning_rate": 0.0002, "epoch": 6.447728516694034, "step": 5890}, {"loss": 0.2772, "grad_norm": 1.3317217826843262, "learning_rate": 0.0002, "epoch": 6.458675424192665, "step": 5900}, {"loss": 0.2555, "grad_norm": 1.215008020401001, "learning_rate": 0.0002, "epoch": 6.469622331691297, "step": 5910}, {"loss": 0.2715, "grad_norm": 1.625672698020935, "learning_rate": 0.0002, "epoch": 6.480569239189929, "step": 5920}, {"loss": 0.2938, "grad_norm": 1.1262489557266235, "learning_rate": 0.0002, "epoch": 6.491516146688561, "step": 5930}, {"loss": 0.2921, "grad_norm": 1.447100281715393, "learning_rate": 0.0002, "epoch": 6.502463054187192, "step": 5940}, {"loss": 0.3059, "grad_norm": 1.3306448459625244, "learning_rate": 0.0002, "epoch": 6.513409961685824, "step": 5950}, {"loss": 0.2922, "grad_norm": 1.307732105255127, "learning_rate": 0.0002, "epoch": 6.524356869184455, "step": 5960}, {"loss": 0.2891, "grad_norm": 1.1851097345352173, "learning_rate": 0.0002, "epoch": 6.535303776683087, "step": 5970}, {"loss": 0.2859, "grad_norm": 1.462816596031189, "learning_rate": 0.0002, "epoch": 6.546250684181719, "step": 5980}, {"loss": 0.2698, "grad_norm": 1.2324728965759277, "learning_rate": 0.0002, "epoch": 6.55719759168035, "step": 5990}, {"loss": 0.2672, "grad_norm": 1.3627429008483887, "learning_rate": 0.0002, "epoch": 6.568144499178982, "step": 6000}, {"loss": 0.3182, "grad_norm": 1.94977867603302, "learning_rate": 0.0002, "epoch": 6.579091406677613, "step": 6010}, {"loss": 0.3183, "grad_norm": 1.459844946861267, "learning_rate": 0.0002, "epoch": 6.590038314176245, "step": 6020}, {"loss": 0.3142, "grad_norm": 1.4454325437545776, "learning_rate": 0.0002, "epoch": 6.600985221674877, "step": 6030}, {"loss": 0.269, "grad_norm": 1.4245165586471558, "learning_rate": 0.0002, "epoch": 6.611932129173509, "step": 6040}, {"loss": 0.3041, "grad_norm": 1.195803165435791, "learning_rate": 0.0002, "epoch": 6.62287903667214, "step": 6050}, {"loss": 0.3075, "grad_norm": 1.3589898347854614, "learning_rate": 0.0002, "epoch": 6.633825944170772, "step": 6060}, {"loss": 0.3291, "grad_norm": 1.3488036394119263, "learning_rate": 0.0002, "epoch": 6.644772851669403, "step": 6070}, {"loss": 0.2898, "grad_norm": 1.0954102277755737, "learning_rate": 0.0002, "epoch": 6.655719759168035, "step": 6080}, {"loss": 0.3489, "grad_norm": 1.4431062936782837, "learning_rate": 0.0002, "epoch": 6.666666666666667, "step": 6090}, {"loss": 0.2816, "grad_norm": 1.4387465715408325, "learning_rate": 0.0002, "epoch": 6.677613574165298, "step": 6100}, {"loss": 0.2705, "grad_norm": 1.8398990631103516, "learning_rate": 0.0002, "epoch": 6.68856048166393, "step": 6110}, {"loss": 0.3214, "grad_norm": 1.3523273468017578, "learning_rate": 0.0002, "epoch": 6.699507389162561, "step": 6120}, {"loss": 0.287, "grad_norm": 1.6326191425323486, "learning_rate": 0.0002, "epoch": 6.710454296661193, "step": 6130}, {"loss": 0.2857, "grad_norm": 1.3677960634231567, "learning_rate": 0.0002, "epoch": 6.721401204159825, "step": 6140}, {"loss": 0.3264, "grad_norm": 1.1993201971054077, "learning_rate": 0.0002, "epoch": 6.732348111658457, "step": 6150}, {"loss": 0.3071, "grad_norm": 1.1864078044891357, "learning_rate": 0.0002, "epoch": 6.743295019157088, "step": 6160}, {"loss": 0.3087, "grad_norm": 1.1625522375106812, "learning_rate": 0.0002, "epoch": 6.75424192665572, "step": 6170}, {"loss": 0.3551, "grad_norm": 1.5803234577178955, "learning_rate": 0.0002, "epoch": 6.765188834154351, "step": 6180}, {"loss": 0.3059, "grad_norm": 1.151746153831482, "learning_rate": 0.0002, "epoch": 6.776135741652983, "step": 6190}, {"loss": 0.2697, "grad_norm": 1.0727161169052124, "learning_rate": 0.0002, "epoch": 6.787082649151615, "step": 6200}, {"loss": 0.2844, "grad_norm": 1.4148162603378296, "learning_rate": 0.0002, "epoch": 6.798029556650246, "step": 6210}, {"loss": 0.3417, "grad_norm": 1.2071447372436523, "learning_rate": 0.0002, "epoch": 6.808976464148878, "step": 6220}, {"loss": 0.3066, "grad_norm": 1.3843804597854614, "learning_rate": 0.0002, "epoch": 6.819923371647509, "step": 6230}, {"loss": 0.2769, "grad_norm": 1.2490662336349487, "learning_rate": 0.0002, "epoch": 6.830870279146141, "step": 6240}, {"loss": 0.3237, "grad_norm": 1.6029689311981201, "learning_rate": 0.0002, "epoch": 6.841817186644773, "step": 6250}, {"loss": 0.3152, "grad_norm": 1.0388455390930176, "learning_rate": 0.0002, "epoch": 6.852764094143405, "step": 6260}, {"loss": 0.3026, "grad_norm": 1.3883857727050781, "learning_rate": 0.0002, "epoch": 6.863711001642036, "step": 6270}, {"loss": 0.3175, "grad_norm": 1.0500187873840332, "learning_rate": 0.0002, "epoch": 6.874657909140668, "step": 6280}, {"loss": 0.2952, "grad_norm": 1.4243487119674683, "learning_rate": 0.0002, "epoch": 6.885604816639299, "step": 6290}, {"loss": 0.2679, "grad_norm": 1.3169665336608887, "learning_rate": 0.0002, "epoch": 6.896551724137931, "step": 6300}, {"loss": 0.3291, "grad_norm": 1.5261493921279907, "learning_rate": 0.0002, "epoch": 6.907498631636563, "step": 6310}, {"loss": 0.3344, "grad_norm": 1.578403115272522, "learning_rate": 0.0002, "epoch": 6.9184455391351944, "step": 6320}, {"loss": 0.3263, "grad_norm": 1.4093263149261475, "learning_rate": 0.0002, "epoch": 6.929392446633826, "step": 6330}, {"loss": 0.3396, "grad_norm": 1.4003552198410034, "learning_rate": 0.0002, "epoch": 6.940339354132457, "step": 6340}, {"loss": 0.3476, "grad_norm": 1.650190830230713, "learning_rate": 0.0002, "epoch": 6.951286261631089, "step": 6350}, {"loss": 0.3442, "grad_norm": 1.2314515113830566, "learning_rate": 0.0002, "epoch": 6.962233169129721, "step": 6360}, {"loss": 0.3341, "grad_norm": 1.270980954170227, "learning_rate": 0.0002, "epoch": 6.973180076628353, "step": 6370}, {"loss": 0.3425, "grad_norm": 1.6352545022964478, "learning_rate": 0.0002, "epoch": 6.984126984126984, "step": 6380}, {"loss": 0.3647, "grad_norm": 1.3744925260543823, "learning_rate": 0.0002, "epoch": 6.995073891625616, "step": 6390}]} +{"epoch": 7.995621237000547, "step": 7304, "epoch_duration": 1353.653870344162, "total_accumulated_duration": 10884.245246648788, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28746.0}, "peak_memory_reserved": {"GPU_0": 28746.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-4/checkpoint-1827", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4039, "grad_norm": 0.7611560821533203, "learning_rate": 0.0002, "epoch": 0.010946907498631636, "step": 10}, {"loss": 1.8818, "grad_norm": 0.4633193612098694, "learning_rate": 0.0002, "epoch": 0.021893814997263273, "step": 20}, {"loss": 1.5927, "grad_norm": 0.49326154589653015, "learning_rate": 0.0002, "epoch": 0.03284072249589491, "step": 30}, {"loss": 1.3859, "grad_norm": 0.3943138122558594, "learning_rate": 0.0002, "epoch": 0.043787629994526546, "step": 40}, {"loss": 1.3036, "grad_norm": 0.43292930722236633, "learning_rate": 0.0002, "epoch": 0.05473453749315818, "step": 50}, {"loss": 1.2427, "grad_norm": 0.3431817591190338, "learning_rate": 0.0002, "epoch": 0.06568144499178982, "step": 60}, {"loss": 1.3076, "grad_norm": 0.38011446595191956, "learning_rate": 0.0002, "epoch": 0.07662835249042145, "step": 70}, {"loss": 1.1641, "grad_norm": 0.7441071271896362, "learning_rate": 0.0002, "epoch": 0.08757525998905309, "step": 80}, {"loss": 1.1721, "grad_norm": 0.3359833061695099, "learning_rate": 0.0002, "epoch": 0.09852216748768473, "step": 90}, {"loss": 1.2774, "grad_norm": 0.3724392354488373, "learning_rate": 0.0002, "epoch": 0.10946907498631636, "step": 100}, {"loss": 1.216, "grad_norm": 0.40673762559890747, "learning_rate": 0.0002, "epoch": 0.120415982484948, "step": 110}, {"loss": 1.2611, "grad_norm": 0.40036800503730774, "learning_rate": 0.0002, "epoch": 0.13136288998357964, "step": 120}, {"loss": 1.2436, "grad_norm": 2.844191312789917, "learning_rate": 0.0002, "epoch": 0.1423097974822113, "step": 130}, {"loss": 1.2254, "grad_norm": 0.3104734420776367, "learning_rate": 0.0002, "epoch": 0.1532567049808429, "step": 140}, {"loss": 1.0498, "grad_norm": 0.3266797959804535, "learning_rate": 0.0002, "epoch": 0.16420361247947454, "step": 150}, {"loss": 1.1644, "grad_norm": 0.3079199194908142, "learning_rate": 0.0002, "epoch": 0.17515051997810618, "step": 160}, {"loss": 1.2923, "grad_norm": 0.3872479498386383, "learning_rate": 0.0002, "epoch": 0.18609742747673783, "step": 170}, {"loss": 1.1809, "grad_norm": 0.38654500246047974, "learning_rate": 0.0002, "epoch": 0.19704433497536947, "step": 180}, {"loss": 1.066, "grad_norm": 0.2913552522659302, "learning_rate": 0.0002, "epoch": 0.20799124247400108, "step": 190}, {"loss": 1.0868, "grad_norm": 0.2960572838783264, "learning_rate": 0.0002, "epoch": 0.21893814997263272, "step": 200}, {"loss": 1.136, "grad_norm": 0.5175501108169556, "learning_rate": 0.0002, "epoch": 0.22988505747126436, "step": 210}, {"loss": 1.1556, "grad_norm": 1.2921574115753174, "learning_rate": 0.0002, "epoch": 0.240831964969896, "step": 220}, {"loss": 1.1488, "grad_norm": 0.2675512135028839, "learning_rate": 0.0002, "epoch": 0.25177887246852765, "step": 230}, {"loss": 1.2764, "grad_norm": 0.3956190049648285, "learning_rate": 0.0002, "epoch": 0.2627257799671593, "step": 240}, {"loss": 1.1889, "grad_norm": 0.6022581458091736, "learning_rate": 0.0002, "epoch": 0.27367268746579093, "step": 250}, {"loss": 1.1981, "grad_norm": 1.1949563026428223, "learning_rate": 0.0002, "epoch": 0.2846195949644226, "step": 260}, {"loss": 1.1877, "grad_norm": 0.31173548102378845, "learning_rate": 0.0002, "epoch": 0.2955665024630542, "step": 270}, {"loss": 1.06, "grad_norm": 0.2808472812175751, "learning_rate": 0.0002, "epoch": 0.3065134099616858, "step": 280}, {"loss": 1.1752, "grad_norm": 0.3042023777961731, "learning_rate": 0.0002, "epoch": 0.31746031746031744, "step": 290}, {"loss": 1.3147, "grad_norm": 0.39915043115615845, "learning_rate": 0.0002, "epoch": 0.3284072249589491, "step": 300}, {"loss": 1.2425, "grad_norm": 0.39118197560310364, "learning_rate": 0.0002, "epoch": 0.3393541324575807, "step": 310}, {"loss": 1.1363, "grad_norm": 0.355010986328125, "learning_rate": 0.0002, "epoch": 0.35030103995621237, "step": 320}, {"loss": 1.1925, "grad_norm": 0.29734086990356445, "learning_rate": 0.0002, "epoch": 0.361247947454844, "step": 330}, {"loss": 1.1974, "grad_norm": 0.346096009016037, "learning_rate": 0.0002, "epoch": 0.37219485495347565, "step": 340}, {"loss": 1.1641, "grad_norm": 0.4829643666744232, "learning_rate": 0.0002, "epoch": 0.3831417624521073, "step": 350}, {"loss": 1.2808, "grad_norm": 0.4726872742176056, "learning_rate": 0.0002, "epoch": 0.39408866995073893, "step": 360}, {"loss": 1.1532, "grad_norm": 0.3130153715610504, "learning_rate": 0.0002, "epoch": 0.4050355774493706, "step": 370}, {"loss": 1.1842, "grad_norm": 0.5123590230941772, "learning_rate": 0.0002, "epoch": 0.41598248494800216, "step": 380}, {"loss": 1.1539, "grad_norm": 0.3444574773311615, "learning_rate": 0.0002, "epoch": 0.4269293924466338, "step": 390}, {"loss": 1.1756, "grad_norm": 0.5302175283432007, "learning_rate": 0.0002, "epoch": 0.43787629994526545, "step": 400}, {"loss": 1.1138, "grad_norm": 0.2713572680950165, "learning_rate": 0.0002, "epoch": 0.4488232074438971, "step": 410}, {"loss": 1.1281, "grad_norm": 0.29530611634254456, "learning_rate": 0.0002, "epoch": 0.45977011494252873, "step": 420}, {"loss": 1.1721, "grad_norm": 0.27282455563545227, "learning_rate": 0.0002, "epoch": 0.47071702244116037, "step": 430}, {"loss": 1.1213, "grad_norm": 0.2647949755191803, "learning_rate": 0.0002, "epoch": 0.481663929939792, "step": 440}, {"loss": 1.1656, "grad_norm": 0.35509347915649414, "learning_rate": 0.0002, "epoch": 0.49261083743842365, "step": 450}, {"loss": 1.1251, "grad_norm": 0.1959609091281891, "learning_rate": 0.0002, "epoch": 0.5035577449370553, "step": 460}, {"loss": 1.1889, "grad_norm": 0.40090155601501465, "learning_rate": 0.0002, "epoch": 0.5145046524356869, "step": 470}, {"loss": 1.156, "grad_norm": 0.3354604244232178, "learning_rate": 0.0002, "epoch": 0.5254515599343186, "step": 480}, {"loss": 1.1024, "grad_norm": 0.2758506238460541, "learning_rate": 0.0002, "epoch": 0.5363984674329502, "step": 490}, {"loss": 1.1108, "grad_norm": 0.3629051744937897, "learning_rate": 0.0002, "epoch": 0.5473453749315819, "step": 500}, {"loss": 1.2236, "grad_norm": 0.30802229046821594, "learning_rate": 0.0002, "epoch": 0.5582922824302134, "step": 510}, {"loss": 1.0424, "grad_norm": 0.3099463880062103, "learning_rate": 0.0002, "epoch": 0.5692391899288451, "step": 520}, {"loss": 1.255, "grad_norm": 0.42299067974090576, "learning_rate": 0.0002, "epoch": 0.5801860974274767, "step": 530}, {"loss": 1.1698, "grad_norm": 0.5392252802848816, "learning_rate": 0.0002, "epoch": 0.5911330049261084, "step": 540}, {"loss": 1.1171, "grad_norm": 0.34768250584602356, "learning_rate": 0.0002, "epoch": 0.60207991242474, "step": 550}, {"loss": 1.102, "grad_norm": 0.28490015864372253, "learning_rate": 0.0002, "epoch": 0.6130268199233716, "step": 560}, {"loss": 1.1111, "grad_norm": 0.34787994623184204, "learning_rate": 0.0002, "epoch": 0.6239737274220033, "step": 570}, {"loss": 1.0759, "grad_norm": 0.29058772325515747, "learning_rate": 0.0002, "epoch": 0.6349206349206349, "step": 580}, {"loss": 1.1157, "grad_norm": 0.4063778817653656, "learning_rate": 0.0002, "epoch": 0.6458675424192666, "step": 590}, {"loss": 1.1432, "grad_norm": 0.9244267344474792, "learning_rate": 0.0002, "epoch": 0.6568144499178982, "step": 600}, {"loss": 1.0591, "grad_norm": 0.27605190873146057, "learning_rate": 0.0002, "epoch": 0.6677613574165299, "step": 610}, {"loss": 1.2123, "grad_norm": 0.34346821904182434, "learning_rate": 0.0002, "epoch": 0.6787082649151615, "step": 620}, {"loss": 1.2195, "grad_norm": 0.3093279302120209, "learning_rate": 0.0002, "epoch": 0.6896551724137931, "step": 630}, {"loss": 1.2461, "grad_norm": 1.0069009065628052, "learning_rate": 0.0002, "epoch": 0.7006020799124247, "step": 640}, {"loss": 1.0892, "grad_norm": 0.5049130916595459, "learning_rate": 0.0002, "epoch": 0.7115489874110563, "step": 650}, {"loss": 1.1404, "grad_norm": 0.3748924732208252, "learning_rate": 0.0002, "epoch": 0.722495894909688, "step": 660}, {"loss": 1.1062, "grad_norm": 0.2964959144592285, "learning_rate": 0.0002, "epoch": 0.7334428024083196, "step": 670}, {"loss": 1.2617, "grad_norm": 0.4599986970424652, "learning_rate": 0.0002, "epoch": 0.7443897099069513, "step": 680}, {"loss": 1.088, "grad_norm": 0.27292951941490173, "learning_rate": 0.0002, "epoch": 0.7553366174055829, "step": 690}, {"loss": 1.2047, "grad_norm": 0.3123566806316376, "learning_rate": 0.0002, "epoch": 0.7662835249042146, "step": 700}, {"loss": 1.0021, "grad_norm": 0.28310710191726685, "learning_rate": 0.0002, "epoch": 0.7772304324028462, "step": 710}, {"loss": 1.2281, "grad_norm": 0.3279992341995239, "learning_rate": 0.0002, "epoch": 0.7881773399014779, "step": 720}, {"loss": 1.25, "grad_norm": 0.28179168701171875, "learning_rate": 0.0002, "epoch": 0.7991242474001095, "step": 730}, {"loss": 1.0602, "grad_norm": 0.31492987275123596, "learning_rate": 0.0002, "epoch": 0.8100711548987412, "step": 740}, {"loss": 1.2518, "grad_norm": 0.41821011900901794, "learning_rate": 0.0002, "epoch": 0.8210180623973727, "step": 750}, {"loss": 1.1612, "grad_norm": 0.325235515832901, "learning_rate": 0.0002, "epoch": 0.8319649698960043, "step": 760}, {"loss": 1.27, "grad_norm": 0.5366070866584778, "learning_rate": 0.0002, "epoch": 0.842911877394636, "step": 770}, {"loss": 1.0921, "grad_norm": 0.32570579648017883, "learning_rate": 0.0002, "epoch": 0.8538587848932676, "step": 780}, {"loss": 1.1032, "grad_norm": 0.3642968237400055, "learning_rate": 0.0002, "epoch": 0.8648056923918993, "step": 790}, {"loss": 1.1234, "grad_norm": 0.29713448882102966, "learning_rate": 0.0002, "epoch": 0.8757525998905309, "step": 800}, {"loss": 1.0978, "grad_norm": 0.23599444329738617, "learning_rate": 0.0002, "epoch": 0.8866995073891626, "step": 810}, {"loss": 1.1867, "grad_norm": 0.31522464752197266, "learning_rate": 0.0002, "epoch": 0.8976464148877942, "step": 820}, {"loss": 1.0208, "grad_norm": 0.32754790782928467, "learning_rate": 0.0002, "epoch": 0.9085933223864259, "step": 830}, {"loss": 0.9786, "grad_norm": 0.22741089761257172, "learning_rate": 0.0002, "epoch": 0.9195402298850575, "step": 840}, {"loss": 1.0689, "grad_norm": 0.3089679777622223, "learning_rate": 0.0002, "epoch": 0.9304871373836892, "step": 850}, {"loss": 1.0354, "grad_norm": 0.27440521121025085, "learning_rate": 0.0002, "epoch": 0.9414340448823207, "step": 860}, {"loss": 1.0417, "grad_norm": 0.3498363792896271, "learning_rate": 0.0002, "epoch": 0.9523809523809523, "step": 870}, {"loss": 1.269, "grad_norm": 0.47151854634284973, "learning_rate": 0.0002, "epoch": 0.963327859879584, "step": 880}, {"loss": 1.1174, "grad_norm": 0.24756591022014618, "learning_rate": 0.0002, "epoch": 0.9742747673782156, "step": 890}, {"loss": 1.0622, "grad_norm": 0.2600938677787781, "learning_rate": 0.0002, "epoch": 0.9852216748768473, "step": 900}, {"loss": 1.0968, "grad_norm": 0.2934586703777313, "learning_rate": 0.0002, "epoch": 0.9961685823754789, "step": 910}, {"eval_loss": 1.158464789390564, "eval_runtime": 46.0774, "eval_samples_per_second": 9.462, "eval_steps_per_second": 1.194, "epoch": 0.9994526546250684, "step": 913}, {"loss": 1.0322, "grad_norm": 0.37776654958724976, "learning_rate": 0.0002, "epoch": 1.0071154898741106, "step": 920}, {"loss": 1.1346, "grad_norm": 0.31784629821777344, "learning_rate": 0.0002, "epoch": 1.0180623973727423, "step": 930}, {"loss": 0.8976, "grad_norm": 0.24244336783885956, "learning_rate": 0.0002, "epoch": 1.0290093048713738, "step": 940}, {"loss": 1.0172, "grad_norm": 0.3185454308986664, "learning_rate": 0.0002, "epoch": 1.0399562123700055, "step": 950}, {"loss": 1.1645, "grad_norm": 0.3589441478252411, "learning_rate": 0.0002, "epoch": 1.0509031198686372, "step": 960}, {"loss": 1.0317, "grad_norm": 0.38593578338623047, "learning_rate": 0.0002, "epoch": 1.0618500273672686, "step": 970}, {"loss": 0.9913, "grad_norm": 0.39694955945014954, "learning_rate": 0.0002, "epoch": 1.0727969348659003, "step": 980}, {"loss": 1.0893, "grad_norm": 0.469817191362381, "learning_rate": 0.0002, "epoch": 1.083743842364532, "step": 990}, {"loss": 0.983, "grad_norm": 0.2634755074977875, "learning_rate": 0.0002, "epoch": 1.0946907498631637, "step": 1000}, {"loss": 1.0144, "grad_norm": 0.43189436197280884, "learning_rate": 0.0002, "epoch": 1.1056376573617952, "step": 1010}, {"loss": 0.9663, "grad_norm": 0.5559977889060974, "learning_rate": 0.0002, "epoch": 1.116584564860427, "step": 1020}, {"loss": 1.0481, "grad_norm": 0.32100191712379456, "learning_rate": 0.0002, "epoch": 1.1275314723590586, "step": 1030}, {"loss": 1.1012, "grad_norm": 0.40179768204689026, "learning_rate": 0.0002, "epoch": 1.1384783798576903, "step": 1040}, {"loss": 1.0029, "grad_norm": 0.3659493029117584, "learning_rate": 0.0002, "epoch": 1.1494252873563218, "step": 1050}, {"loss": 0.9597, "grad_norm": 0.701704204082489, "learning_rate": 0.0002, "epoch": 1.1603721948549535, "step": 1060}, {"loss": 1.0204, "grad_norm": 0.3650563359260559, "learning_rate": 0.0002, "epoch": 1.1713191023535852, "step": 1070}, {"loss": 0.907, "grad_norm": 0.3191976249217987, "learning_rate": 0.0002, "epoch": 1.1822660098522166, "step": 1080}, {"loss": 1.0648, "grad_norm": 0.3615441918373108, "learning_rate": 0.0002, "epoch": 1.1932129173508483, "step": 1090}, {"loss": 1.0067, "grad_norm": 0.39474231004714966, "learning_rate": 0.0002, "epoch": 1.20415982484948, "step": 1100}, {"loss": 0.9852, "grad_norm": 0.3752822279930115, "learning_rate": 0.0002, "epoch": 1.2151067323481117, "step": 1110}, {"loss": 1.0067, "grad_norm": 0.4165991246700287, "learning_rate": 0.0002, "epoch": 1.2260536398467432, "step": 1120}, {"loss": 1.0244, "grad_norm": 0.5326506495475769, "learning_rate": 0.0002, "epoch": 1.237000547345375, "step": 1130}, {"loss": 1.0542, "grad_norm": 0.48845794796943665, "learning_rate": 0.0002, "epoch": 1.2479474548440066, "step": 1140}, {"loss": 1.0885, "grad_norm": 0.29910150170326233, "learning_rate": 0.0002, "epoch": 1.2588943623426383, "step": 1150}, {"loss": 1.2233, "grad_norm": 0.5069725513458252, "learning_rate": 0.0002, "epoch": 1.2698412698412698, "step": 1160}, {"loss": 1.0992, "grad_norm": 0.29500406980514526, "learning_rate": 0.0002, "epoch": 1.2807881773399015, "step": 1170}, {"loss": 1.0291, "grad_norm": 0.4711538851261139, "learning_rate": 0.0002, "epoch": 1.2917350848385332, "step": 1180}, {"loss": 1.0513, "grad_norm": 0.4203340709209442, "learning_rate": 0.0002, "epoch": 1.3026819923371646, "step": 1190}, {"loss": 1.1274, "grad_norm": 0.36101874709129333, "learning_rate": 0.0002, "epoch": 1.3136288998357963, "step": 1200}, {"loss": 1.085, "grad_norm": 0.4608800411224365, "learning_rate": 0.0002, "epoch": 1.324575807334428, "step": 1210}, {"loss": 1.0695, "grad_norm": 0.6570906639099121, "learning_rate": 0.0002, "epoch": 1.3355227148330597, "step": 1220}, {"loss": 0.9786, "grad_norm": 0.5352164506912231, "learning_rate": 0.0002, "epoch": 1.3464696223316914, "step": 1230}, {"loss": 1.0513, "grad_norm": 0.3885001242160797, "learning_rate": 0.0002, "epoch": 1.357416529830323, "step": 1240}, {"loss": 1.0611, "grad_norm": 0.2987913489341736, "learning_rate": 0.0002, "epoch": 1.3683634373289546, "step": 1250}, {"loss": 1.109, "grad_norm": 0.42070427536964417, "learning_rate": 0.0002, "epoch": 1.3793103448275863, "step": 1260}, {"loss": 1.1092, "grad_norm": 0.5957782864570618, "learning_rate": 0.0002, "epoch": 1.3902572523262178, "step": 1270}, {"loss": 1.0554, "grad_norm": 0.32898882031440735, "learning_rate": 0.0002, "epoch": 1.4012041598248495, "step": 1280}, {"loss": 0.909, "grad_norm": 0.27624452114105225, "learning_rate": 0.0002, "epoch": 1.4121510673234812, "step": 1290}, {"loss": 1.024, "grad_norm": 0.49570828676223755, "learning_rate": 0.0002, "epoch": 1.4230979748221126, "step": 1300}, {"loss": 1.0471, "grad_norm": 0.26191383600234985, "learning_rate": 0.0002, "epoch": 1.4340448823207443, "step": 1310}, {"loss": 1.0788, "grad_norm": 0.35664042830467224, "learning_rate": 0.0002, "epoch": 1.444991789819376, "step": 1320}, {"loss": 1.0108, "grad_norm": 0.45126354694366455, "learning_rate": 0.0002, "epoch": 1.4559386973180077, "step": 1330}, {"loss": 0.9571, "grad_norm": 0.37318357825279236, "learning_rate": 0.0002, "epoch": 1.4668856048166394, "step": 1340}, {"loss": 1.0507, "grad_norm": 0.6428970098495483, "learning_rate": 0.0002, "epoch": 1.477832512315271, "step": 1350}, {"loss": 1.0367, "grad_norm": 0.43256187438964844, "learning_rate": 0.0002, "epoch": 1.4887794198139026, "step": 1360}, {"loss": 1.1321, "grad_norm": 0.5343793630599976, "learning_rate": 0.0002, "epoch": 1.4997263273125343, "step": 1370}, {"loss": 1.1054, "grad_norm": 0.315437376499176, "learning_rate": 0.0002, "epoch": 1.5106732348111658, "step": 1380}, {"loss": 0.8916, "grad_norm": 0.41561153531074524, "learning_rate": 0.0002, "epoch": 1.5216201423097975, "step": 1390}, {"loss": 1.0391, "grad_norm": 0.3201070725917816, "learning_rate": 0.0002, "epoch": 1.5325670498084292, "step": 1400}, {"loss": 1.14, "grad_norm": 0.505537211894989, "learning_rate": 0.0002, "epoch": 1.5435139573070606, "step": 1410}, {"loss": 1.0775, "grad_norm": 0.3747410178184509, "learning_rate": 0.0002, "epoch": 1.5544608648056923, "step": 1420}, {"loss": 1.1171, "grad_norm": 0.49385908246040344, "learning_rate": 0.0002, "epoch": 1.565407772304324, "step": 1430}, {"loss": 1.1182, "grad_norm": 0.49831628799438477, "learning_rate": 0.0002, "epoch": 1.5763546798029555, "step": 1440}, {"loss": 1.0079, "grad_norm": 0.372127890586853, "learning_rate": 0.0002, "epoch": 1.5873015873015874, "step": 1450}, {"loss": 0.9931, "grad_norm": 0.40070840716362, "learning_rate": 0.0002, "epoch": 1.598248494800219, "step": 1460}, {"loss": 0.8954, "grad_norm": 0.34907400608062744, "learning_rate": 0.0002, "epoch": 1.6091954022988506, "step": 1470}, {"loss": 0.9743, "grad_norm": 0.4632418751716614, "learning_rate": 0.0002, "epoch": 1.6201423097974823, "step": 1480}, {"loss": 1.0103, "grad_norm": 0.40164515376091003, "learning_rate": 0.0002, "epoch": 1.6310892172961138, "step": 1490}, {"loss": 0.9523, "grad_norm": 0.3214994966983795, "learning_rate": 0.0002, "epoch": 1.6420361247947455, "step": 1500}, {"loss": 1.0161, "grad_norm": 0.3727897107601166, "learning_rate": 0.0002, "epoch": 1.6529830322933772, "step": 1510}, {"loss": 1.0443, "grad_norm": 0.3817640542984009, "learning_rate": 0.0002, "epoch": 1.6639299397920086, "step": 1520}, {"loss": 1.0511, "grad_norm": 0.5592136979103088, "learning_rate": 0.0002, "epoch": 1.6748768472906403, "step": 1530}, {"loss": 0.9682, "grad_norm": 0.44636598229408264, "learning_rate": 0.0002, "epoch": 1.685823754789272, "step": 1540}, {"loss": 1.033, "grad_norm": 0.40441709756851196, "learning_rate": 0.0002, "epoch": 1.6967706622879035, "step": 1550}, {"loss": 1.0857, "grad_norm": 0.3243522644042969, "learning_rate": 0.0002, "epoch": 1.7077175697865354, "step": 1560}, {"loss": 0.9482, "grad_norm": 0.34277570247650146, "learning_rate": 0.0002, "epoch": 1.718664477285167, "step": 1570}, {"loss": 0.974, "grad_norm": 0.3279995024204254, "learning_rate": 0.0002, "epoch": 1.7296113847837986, "step": 1580}, {"loss": 0.9414, "grad_norm": 0.41968777775764465, "learning_rate": 0.0002, "epoch": 1.7405582922824303, "step": 1590}, {"loss": 0.9768, "grad_norm": 0.39464613795280457, "learning_rate": 0.0002, "epoch": 1.7515051997810618, "step": 1600}, {"loss": 1.0347, "grad_norm": 0.3839009404182434, "learning_rate": 0.0002, "epoch": 1.7624521072796935, "step": 1610}, {"loss": 0.9195, "grad_norm": 0.3250715434551239, "learning_rate": 0.0002, "epoch": 1.7733990147783252, "step": 1620}, {"loss": 1.0049, "grad_norm": 0.5166561007499695, "learning_rate": 0.0002, "epoch": 1.7843459222769567, "step": 1630}, {"loss": 1.0364, "grad_norm": 0.4115183353424072, "learning_rate": 0.0002, "epoch": 1.7952928297755884, "step": 1640}, {"loss": 1.0248, "grad_norm": 0.373780220746994, "learning_rate": 0.0002, "epoch": 1.80623973727422, "step": 1650}, {"loss": 1.0984, "grad_norm": 0.49697014689445496, "learning_rate": 0.0002, "epoch": 1.8171866447728515, "step": 1660}, {"loss": 1.0089, "grad_norm": 1.0308938026428223, "learning_rate": 0.0002, "epoch": 1.8281335522714834, "step": 1670}, {"loss": 1.0853, "grad_norm": 0.4851366877555847, "learning_rate": 0.0002, "epoch": 1.839080459770115, "step": 1680}, {"loss": 0.9533, "grad_norm": 0.3262481391429901, "learning_rate": 0.0002, "epoch": 1.8500273672687466, "step": 1690}, {"loss": 1.048, "grad_norm": 0.6904496550559998, "learning_rate": 0.0002, "epoch": 1.8609742747673783, "step": 1700}, {"loss": 1.0577, "grad_norm": 0.49789851903915405, "learning_rate": 0.0002, "epoch": 1.8719211822660098, "step": 1710}, {"loss": 1.0258, "grad_norm": 0.3035794198513031, "learning_rate": 0.0002, "epoch": 1.8828680897646415, "step": 1720}, {"loss": 0.9916, "grad_norm": 0.4588414430618286, "learning_rate": 0.0002, "epoch": 1.8938149972632732, "step": 1730}, {"loss": 0.9526, "grad_norm": 0.4313034117221832, "learning_rate": 0.0002, "epoch": 1.9047619047619047, "step": 1740}, {"loss": 1.0857, "grad_norm": 0.38562044501304626, "learning_rate": 0.0002, "epoch": 1.9157088122605364, "step": 1750}, {"loss": 0.9763, "grad_norm": 0.46947410702705383, "learning_rate": 0.0002, "epoch": 1.926655719759168, "step": 1760}, {"loss": 0.9967, "grad_norm": 0.3848404884338379, "learning_rate": 0.0002, "epoch": 1.9376026272577995, "step": 1770}, {"loss": 1.0474, "grad_norm": 0.30422744154930115, "learning_rate": 0.0002, "epoch": 1.9485495347564314, "step": 1780}, {"loss": 1.022, "grad_norm": 0.41100990772247314, "learning_rate": 0.0002, "epoch": 1.959496442255063, "step": 1790}, {"loss": 1.003, "grad_norm": 0.3492335081100464, "learning_rate": 0.0002, "epoch": 1.9704433497536946, "step": 1800}, {"loss": 0.9891, "grad_norm": 0.364577978849411, "learning_rate": 0.0002, "epoch": 1.9813902572523263, "step": 1810}, {"loss": 1.0699, "grad_norm": 0.4312075674533844, "learning_rate": 0.0002, "epoch": 1.9923371647509578, "step": 1820}, {"eval_loss": 1.14472496509552, "eval_runtime": 46.0786, "eval_samples_per_second": 9.462, "eval_steps_per_second": 1.194, "epoch": 2.0, "step": 1827}, {"loss": 1.0218, "grad_norm": 0.5989689230918884, "learning_rate": 0.0002, "epoch": 2.0032840722495897, "step": 1830}, {"loss": 0.9384, "grad_norm": 0.49720922112464905, "learning_rate": 0.0002, "epoch": 2.014230979748221, "step": 1840}, {"loss": 0.8482, "grad_norm": 0.42675456404685974, "learning_rate": 0.0002, "epoch": 2.0251778872468527, "step": 1850}, {"loss": 0.8487, "grad_norm": 0.4637208580970764, "learning_rate": 0.0002, "epoch": 2.0361247947454846, "step": 1860}, {"loss": 0.8212, "grad_norm": 0.8329976797103882, "learning_rate": 0.0002, "epoch": 2.047071702244116, "step": 1870}, {"loss": 0.9547, "grad_norm": 0.7869427800178528, "learning_rate": 0.0002, "epoch": 2.0580186097427475, "step": 1880}, {"loss": 0.9351, "grad_norm": 0.4927455186843872, "learning_rate": 0.0002, "epoch": 2.0689655172413794, "step": 1890}, {"loss": 0.8652, "grad_norm": 0.6264246702194214, "learning_rate": 0.0002, "epoch": 2.079912424740011, "step": 1900}, {"loss": 0.9095, "grad_norm": 1.1164122819900513, "learning_rate": 0.0002, "epoch": 2.0908593322386424, "step": 1910}, {"loss": 0.7823, "grad_norm": 0.5283981561660767, "learning_rate": 0.0002, "epoch": 2.1018062397372743, "step": 1920}, {"loss": 0.8065, "grad_norm": 0.45621731877326965, "learning_rate": 0.0002, "epoch": 2.112753147235906, "step": 1930}, {"loss": 0.9184, "grad_norm": 1.381791591644287, "learning_rate": 0.0002, "epoch": 2.1237000547345373, "step": 1940}, {"loss": 0.9006, "grad_norm": 0.5151259899139404, "learning_rate": 0.0002, "epoch": 2.134646962233169, "step": 1950}, {"loss": 0.8436, "grad_norm": 0.9806339740753174, "learning_rate": 0.0002, "epoch": 2.1455938697318007, "step": 1960}, {"loss": 0.8749, "grad_norm": 0.4734154939651489, "learning_rate": 0.0002, "epoch": 2.1565407772304326, "step": 1970}, {"loss": 0.9172, "grad_norm": 0.9553168416023254, "learning_rate": 0.0002, "epoch": 2.167487684729064, "step": 1980}, {"loss": 0.8047, "grad_norm": 0.5895838141441345, "learning_rate": 0.0002, "epoch": 2.1784345922276955, "step": 1990}, {"loss": 0.7841, "grad_norm": 0.4488855302333832, "learning_rate": 0.0002, "epoch": 2.1893814997263275, "step": 2000}, {"loss": 0.8205, "grad_norm": 1.0760235786437988, "learning_rate": 0.0002, "epoch": 2.200328407224959, "step": 2010}, {"loss": 0.7923, "grad_norm": 0.5038785338401794, "learning_rate": 0.0002, "epoch": 2.2112753147235904, "step": 2020}, {"loss": 0.8973, "grad_norm": 0.59819495677948, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 2030}, {"loss": 0.8009, "grad_norm": 0.5012075304985046, "learning_rate": 0.0002, "epoch": 2.233169129720854, "step": 2040}, {"loss": 0.9087, "grad_norm": 0.44978439807891846, "learning_rate": 0.0002, "epoch": 2.2441160372194853, "step": 2050}, {"loss": 0.823, "grad_norm": 0.5350462198257446, "learning_rate": 0.0002, "epoch": 2.255062944718117, "step": 2060}, {"loss": 0.8335, "grad_norm": 0.6020669937133789, "learning_rate": 0.0002, "epoch": 2.2660098522167487, "step": 2070}, {"loss": 0.8023, "grad_norm": 0.5246821045875549, "learning_rate": 0.0002, "epoch": 2.2769567597153806, "step": 2080}, {"loss": 0.8984, "grad_norm": 0.5711268782615662, "learning_rate": 0.0002, "epoch": 2.287903667214012, "step": 2090}, {"loss": 0.9093, "grad_norm": 0.617317259311676, "learning_rate": 0.0002, "epoch": 2.2988505747126435, "step": 2100}, {"loss": 0.8311, "grad_norm": 0.8608947396278381, "learning_rate": 0.0002, "epoch": 2.3097974822112755, "step": 2110}, {"loss": 0.7839, "grad_norm": 0.4739076793193817, "learning_rate": 0.0002, "epoch": 2.320744389709907, "step": 2120}, {"loss": 0.84, "grad_norm": 0.5538856983184814, "learning_rate": 0.0002, "epoch": 2.3316912972085384, "step": 2130}, {"loss": 0.8994, "grad_norm": 0.6064935326576233, "learning_rate": 0.0002, "epoch": 2.3426382047071703, "step": 2140}, {"loss": 0.7765, "grad_norm": 0.5019068121910095, "learning_rate": 0.0002, "epoch": 2.353585112205802, "step": 2150}, {"loss": 0.9576, "grad_norm": 0.45340514183044434, "learning_rate": 0.0002, "epoch": 2.3645320197044333, "step": 2160}, {"loss": 0.9254, "grad_norm": 0.7347203493118286, "learning_rate": 0.0002, "epoch": 2.375478927203065, "step": 2170}, {"loss": 0.8507, "grad_norm": 0.46922534704208374, "learning_rate": 0.0002, "epoch": 2.3864258347016967, "step": 2180}, {"loss": 0.8325, "grad_norm": 0.5507845878601074, "learning_rate": 0.0002, "epoch": 2.3973727422003286, "step": 2190}, {"loss": 0.8178, "grad_norm": 0.5621911883354187, "learning_rate": 0.0002, "epoch": 2.40831964969896, "step": 2200}, {"loss": 0.8569, "grad_norm": 0.5023514032363892, "learning_rate": 0.0002, "epoch": 2.4192665571975915, "step": 2210}, {"loss": 0.8355, "grad_norm": 0.6124861240386963, "learning_rate": 0.0002, "epoch": 2.4302134646962235, "step": 2220}, {"loss": 0.885, "grad_norm": 0.49614205956459045, "learning_rate": 0.0002, "epoch": 2.441160372194855, "step": 2230}, {"loss": 0.8008, "grad_norm": 0.6477900743484497, "learning_rate": 0.0002, "epoch": 2.4521072796934864, "step": 2240}, {"loss": 0.8622, "grad_norm": 0.5868843793869019, "learning_rate": 0.0002, "epoch": 2.4630541871921183, "step": 2250}, {"loss": 0.8498, "grad_norm": 0.4364610016345978, "learning_rate": 0.0002, "epoch": 2.47400109469075, "step": 2260}, {"loss": 0.8378, "grad_norm": 0.5792964696884155, "learning_rate": 0.0002, "epoch": 2.4849480021893813, "step": 2270}, {"loss": 0.8743, "grad_norm": 0.5421269536018372, "learning_rate": 0.0002, "epoch": 2.495894909688013, "step": 2280}, {"loss": 0.9637, "grad_norm": 0.5525493025779724, "learning_rate": 0.0002, "epoch": 2.5068418171866447, "step": 2290}, {"loss": 0.8075, "grad_norm": 0.6463850140571594, "learning_rate": 0.0002, "epoch": 2.5177887246852766, "step": 2300}, {"loss": 0.8591, "grad_norm": 0.6861311793327332, "learning_rate": 0.0002, "epoch": 2.528735632183908, "step": 2310}, {"loss": 0.9287, "grad_norm": 0.5563555359840393, "learning_rate": 0.0002, "epoch": 2.5396825396825395, "step": 2320}, {"loss": 0.945, "grad_norm": 0.5721169114112854, "learning_rate": 0.0002, "epoch": 2.5506294471811715, "step": 2330}, {"loss": 0.8271, "grad_norm": 0.5258274674415588, "learning_rate": 0.0002, "epoch": 2.561576354679803, "step": 2340}, {"loss": 0.8515, "grad_norm": 0.7057380676269531, "learning_rate": 0.0002, "epoch": 2.572523262178435, "step": 2350}, {"loss": 0.8615, "grad_norm": 0.6869027614593506, "learning_rate": 0.0002, "epoch": 2.5834701696770663, "step": 2360}, {"loss": 0.8043, "grad_norm": 0.4960809648036957, "learning_rate": 0.0002, "epoch": 2.594417077175698, "step": 2370}, {"loss": 0.8476, "grad_norm": 0.9288380146026611, "learning_rate": 0.0002, "epoch": 2.6053639846743293, "step": 2380}, {"loss": 0.873, "grad_norm": 0.3765334188938141, "learning_rate": 0.0002, "epoch": 2.616310892172961, "step": 2390}, {"loss": 0.8764, "grad_norm": 0.7487865686416626, "learning_rate": 0.0002, "epoch": 2.6272577996715927, "step": 2400}, {"loss": 0.7577, "grad_norm": 0.6141156554222107, "learning_rate": 0.0002, "epoch": 2.6382047071702246, "step": 2410}, {"loss": 0.8534, "grad_norm": 0.8420507907867432, "learning_rate": 0.0002, "epoch": 2.649151614668856, "step": 2420}, {"loss": 0.8311, "grad_norm": 0.53386390209198, "learning_rate": 0.0002, "epoch": 2.6600985221674875, "step": 2430}, {"loss": 0.8486, "grad_norm": 0.5520607233047485, "learning_rate": 0.0002, "epoch": 2.6710454296661195, "step": 2440}, {"loss": 0.8686, "grad_norm": 0.5337599515914917, "learning_rate": 0.0002, "epoch": 2.681992337164751, "step": 2450}, {"loss": 0.792, "grad_norm": 0.48790836334228516, "learning_rate": 0.0002, "epoch": 2.692939244663383, "step": 2460}, {"loss": 0.7864, "grad_norm": 0.8287786245346069, "learning_rate": 0.0002, "epoch": 2.7038861521620143, "step": 2470}, {"loss": 0.8244, "grad_norm": 0.5876168608665466, "learning_rate": 0.0002, "epoch": 2.714833059660646, "step": 2480}, {"loss": 0.8773, "grad_norm": 0.5206760764122009, "learning_rate": 0.0002, "epoch": 2.7257799671592773, "step": 2490}, {"loss": 0.8097, "grad_norm": 0.5619136691093445, "learning_rate": 0.0002, "epoch": 2.736726874657909, "step": 2500}, {"loss": 0.8377, "grad_norm": 0.5614883899688721, "learning_rate": 0.0002, "epoch": 2.7476737821565407, "step": 2510}, {"loss": 0.8817, "grad_norm": 0.6157700419425964, "learning_rate": 0.0002, "epoch": 2.7586206896551726, "step": 2520}, {"loss": 0.846, "grad_norm": 0.5529953837394714, "learning_rate": 0.0002, "epoch": 2.769567597153804, "step": 2530}, {"loss": 0.8881, "grad_norm": 0.6731224060058594, "learning_rate": 0.0002, "epoch": 2.7805145046524355, "step": 2540}, {"loss": 0.9125, "grad_norm": 0.6960386633872986, "learning_rate": 0.0002, "epoch": 2.7914614121510675, "step": 2550}, {"loss": 0.7823, "grad_norm": 0.5203493237495422, "learning_rate": 0.0002, "epoch": 2.802408319649699, "step": 2560}, {"loss": 0.7951, "grad_norm": 1.036837100982666, "learning_rate": 0.0002, "epoch": 2.813355227148331, "step": 2570}, {"loss": 0.8427, "grad_norm": 0.6125805377960205, "learning_rate": 0.0002, "epoch": 2.8243021346469623, "step": 2580}, {"loss": 0.8939, "grad_norm": 0.6298092603683472, "learning_rate": 0.0002, "epoch": 2.835249042145594, "step": 2590}, {"loss": 0.8845, "grad_norm": 0.5882203578948975, "learning_rate": 0.0002, "epoch": 2.8461959496442253, "step": 2600}, {"loss": 0.8921, "grad_norm": 0.8619399666786194, "learning_rate": 0.0002, "epoch": 2.857142857142857, "step": 2610}, {"loss": 0.8213, "grad_norm": 0.4722687304019928, "learning_rate": 0.0002, "epoch": 2.8680897646414887, "step": 2620}, {"loss": 0.9147, "grad_norm": 0.47399574518203735, "learning_rate": 0.0002, "epoch": 2.8790366721401206, "step": 2630}, {"loss": 0.8325, "grad_norm": 0.5639172792434692, "learning_rate": 0.0002, "epoch": 2.889983579638752, "step": 2640}, {"loss": 0.8653, "grad_norm": 0.4676816761493683, "learning_rate": 0.0002, "epoch": 2.9009304871373836, "step": 2650}, {"loss": 0.8966, "grad_norm": 0.6906291246414185, "learning_rate": 0.0002, "epoch": 2.9118773946360155, "step": 2660}, {"loss": 0.8966, "grad_norm": 0.4369746148586273, "learning_rate": 0.0002, "epoch": 2.922824302134647, "step": 2670}, {"loss": 0.9173, "grad_norm": 0.46423083543777466, "learning_rate": 0.0002, "epoch": 2.933771209633279, "step": 2680}, {"loss": 0.7961, "grad_norm": 0.5700525045394897, "learning_rate": 0.0002, "epoch": 2.9447181171319103, "step": 2690}, {"loss": 0.8172, "grad_norm": 0.6221476793289185, "learning_rate": 0.0002, "epoch": 2.955665024630542, "step": 2700}, {"loss": 0.8538, "grad_norm": 0.6102682948112488, "learning_rate": 0.0002, "epoch": 2.9666119321291733, "step": 2710}, {"loss": 0.9779, "grad_norm": 0.5317878723144531, "learning_rate": 0.0002, "epoch": 2.977558839627805, "step": 2720}, {"loss": 0.9314, "grad_norm": 0.4438510835170746, "learning_rate": 0.0002, "epoch": 2.9885057471264367, "step": 2730}, {"loss": 0.8553, "grad_norm": 0.5022130012512207, "learning_rate": 0.0002, "epoch": 2.9994526546250686, "step": 2740}, {"eval_loss": 1.1722838878631592, "eval_runtime": 46.0829, "eval_samples_per_second": 9.461, "eval_steps_per_second": 1.194, "epoch": 2.9994526546250686, "step": 2740}, {"loss": 0.6443, "grad_norm": 0.6384502053260803, "learning_rate": 0.0002, "epoch": 3.0103995621237, "step": 2750}, {"loss": 0.7123, "grad_norm": 0.9928722381591797, "learning_rate": 0.0002, "epoch": 3.0213464696223316, "step": 2760}, {"loss": 0.6045, "grad_norm": 0.7813051342964172, "learning_rate": 0.0002, "epoch": 3.0322933771209635, "step": 2770}, {"loss": 0.6042, "grad_norm": 1.0202556848526, "learning_rate": 0.0002, "epoch": 3.043240284619595, "step": 2780}, {"loss": 0.6356, "grad_norm": 0.7581062316894531, "learning_rate": 0.0002, "epoch": 3.0541871921182264, "step": 2790}, {"loss": 0.6349, "grad_norm": 0.6252710223197937, "learning_rate": 0.0002, "epoch": 3.0651340996168583, "step": 2800}, {"loss": 0.645, "grad_norm": 0.7738662958145142, "learning_rate": 0.0002, "epoch": 3.07608100711549, "step": 2810}, {"loss": 0.627, "grad_norm": 0.7381885051727295, "learning_rate": 0.0002, "epoch": 3.0870279146141213, "step": 2820}, {"loss": 0.6371, "grad_norm": 0.9197564721107483, "learning_rate": 0.0002, "epoch": 3.097974822112753, "step": 2830}, {"loss": 0.723, "grad_norm": 1.000976800918579, "learning_rate": 0.0002, "epoch": 3.1089217296113847, "step": 2840}, {"loss": 0.6631, "grad_norm": 0.7559131383895874, "learning_rate": 0.0002, "epoch": 3.1198686371100166, "step": 2850}, {"loss": 0.6252, "grad_norm": 0.7213780879974365, "learning_rate": 0.0002, "epoch": 3.130815544608648, "step": 2860}, {"loss": 0.6501, "grad_norm": 0.945939838886261, "learning_rate": 0.0002, "epoch": 3.1417624521072796, "step": 2870}, {"loss": 0.6129, "grad_norm": 0.7277454137802124, "learning_rate": 0.0002, "epoch": 3.1527093596059115, "step": 2880}, {"loss": 0.6423, "grad_norm": 0.762026846408844, "learning_rate": 0.0002, "epoch": 3.163656267104543, "step": 2890}, {"loss": 0.5332, "grad_norm": 0.6471221446990967, "learning_rate": 0.0002, "epoch": 3.1746031746031744, "step": 2900}, {"loss": 0.7981, "grad_norm": 0.6018978357315063, "learning_rate": 0.0002, "epoch": 3.1855500821018063, "step": 2910}, {"loss": 0.7274, "grad_norm": 0.8607320785522461, "learning_rate": 0.0002, "epoch": 3.196496989600438, "step": 2920}, {"loss": 0.6139, "grad_norm": 0.8854126334190369, "learning_rate": 0.0002, "epoch": 3.2074438970990693, "step": 2930}, {"loss": 0.6485, "grad_norm": 0.6620870232582092, "learning_rate": 0.0002, "epoch": 3.218390804597701, "step": 2940}, {"loss": 0.6969, "grad_norm": 0.7377511858940125, "learning_rate": 0.0002, "epoch": 3.2293377120963327, "step": 2950}, {"loss": 0.6798, "grad_norm": 0.7803301811218262, "learning_rate": 0.0002, "epoch": 3.2402846195949646, "step": 2960}, {"loss": 0.6697, "grad_norm": 0.834061861038208, "learning_rate": 0.0002, "epoch": 3.251231527093596, "step": 2970}, {"loss": 0.6894, "grad_norm": 0.8496041893959045, "learning_rate": 0.0002, "epoch": 3.2621784345922276, "step": 2980}, {"loss": 0.6591, "grad_norm": 0.7967984676361084, "learning_rate": 0.0002, "epoch": 3.2731253420908595, "step": 2990}, {"loss": 0.7266, "grad_norm": 1.0207016468048096, "learning_rate": 0.0002, "epoch": 3.284072249589491, "step": 3000}, {"loss": 0.6586, "grad_norm": 0.7855771780014038, "learning_rate": 0.0002, "epoch": 3.2950191570881224, "step": 3010}, {"loss": 0.5711, "grad_norm": 0.9427546858787537, "learning_rate": 0.0002, "epoch": 3.3059660645867543, "step": 3020}, {"loss": 0.6277, "grad_norm": 0.823542594909668, "learning_rate": 0.0002, "epoch": 3.316912972085386, "step": 3030}, {"loss": 0.7109, "grad_norm": 0.9826635122299194, "learning_rate": 0.0002, "epoch": 3.3278598795840173, "step": 3040}, {"loss": 0.6564, "grad_norm": 0.7259827852249146, "learning_rate": 0.0002, "epoch": 3.338806787082649, "step": 3050}, {"loss": 0.653, "grad_norm": 0.7774739861488342, "learning_rate": 0.0002, "epoch": 3.3497536945812807, "step": 3060}, {"loss": 0.7529, "grad_norm": 0.7394293546676636, "learning_rate": 0.0002, "epoch": 3.3607006020799126, "step": 3070}, {"loss": 0.5987, "grad_norm": 0.9017578959465027, "learning_rate": 0.0002, "epoch": 3.371647509578544, "step": 3080}, {"loss": 0.6953, "grad_norm": 0.7451054453849792, "learning_rate": 0.0002, "epoch": 3.3825944170771756, "step": 3090}, {"loss": 0.6759, "grad_norm": 0.7321506142616272, "learning_rate": 0.0002, "epoch": 3.3935413245758075, "step": 3100}, {"loss": 0.6555, "grad_norm": 0.6721828579902649, "learning_rate": 0.0002, "epoch": 3.404488232074439, "step": 3110}, {"loss": 0.6559, "grad_norm": 0.774022102355957, "learning_rate": 0.0002, "epoch": 3.4154351395730704, "step": 3120}, {"loss": 0.7449, "grad_norm": 0.9143537282943726, "learning_rate": 0.0002, "epoch": 3.4263820470717024, "step": 3130}, {"loss": 0.6899, "grad_norm": 1.226087212562561, "learning_rate": 0.0002, "epoch": 3.437328954570334, "step": 3140}, {"loss": 0.6719, "grad_norm": 0.7545496225357056, "learning_rate": 0.0002, "epoch": 3.4482758620689653, "step": 3150}, {"loss": 0.6153, "grad_norm": 0.6515635848045349, "learning_rate": 0.0002, "epoch": 3.4592227695675972, "step": 3160}, {"loss": 0.6926, "grad_norm": 0.9297090172767639, "learning_rate": 0.0002, "epoch": 3.4701696770662287, "step": 3170}, {"loss": 0.6071, "grad_norm": 1.0130730867385864, "learning_rate": 0.0002, "epoch": 3.4811165845648606, "step": 3180}, {"loss": 0.5959, "grad_norm": 0.7654589414596558, "learning_rate": 0.0002, "epoch": 3.492063492063492, "step": 3190}, {"loss": 0.7401, "grad_norm": 0.9954977631568909, "learning_rate": 0.0002, "epoch": 3.5030103995621236, "step": 3200}, {"loss": 0.6661, "grad_norm": 0.6027487516403198, "learning_rate": 0.0002, "epoch": 3.5139573070607555, "step": 3210}, {"loss": 0.6963, "grad_norm": 0.741770327091217, "learning_rate": 0.0002, "epoch": 3.524904214559387, "step": 3220}, {"loss": 0.8112, "grad_norm": 1.0534909963607788, "learning_rate": 0.0002, "epoch": 3.535851122058019, "step": 3230}, {"loss": 0.6813, "grad_norm": 0.937772274017334, "learning_rate": 0.0002, "epoch": 3.5467980295566504, "step": 3240}, {"loss": 0.6681, "grad_norm": 0.8504213690757751, "learning_rate": 0.0002, "epoch": 3.557744937055282, "step": 3250}, {"loss": 0.6436, "grad_norm": 0.7755007147789001, "learning_rate": 0.0002, "epoch": 3.5686918445539133, "step": 3260}, {"loss": 0.6213, "grad_norm": 1.0193358659744263, "learning_rate": 0.0002, "epoch": 3.5796387520525452, "step": 3270}, {"loss": 0.671, "grad_norm": 0.8440536856651306, "learning_rate": 0.0002, "epoch": 3.5905856595511767, "step": 3280}, {"loss": 0.6859, "grad_norm": 0.6195939183235168, "learning_rate": 0.0002, "epoch": 3.6015325670498086, "step": 3290}, {"loss": 0.7446, "grad_norm": 0.8608590960502625, "learning_rate": 0.0002, "epoch": 3.61247947454844, "step": 3300}, {"loss": 0.7301, "grad_norm": 0.6772327423095703, "learning_rate": 0.0002, "epoch": 3.6234263820470716, "step": 3310}, {"loss": 0.6298, "grad_norm": 0.8031839728355408, "learning_rate": 0.0002, "epoch": 3.6343732895457035, "step": 3320}, {"loss": 0.7041, "grad_norm": 0.6080502271652222, "learning_rate": 0.0002, "epoch": 3.645320197044335, "step": 3330}, {"loss": 0.7431, "grad_norm": 0.8007240891456604, "learning_rate": 0.0002, "epoch": 3.656267104542967, "step": 3340}, {"loss": 0.7446, "grad_norm": 0.8060704469680786, "learning_rate": 0.0002, "epoch": 3.6672140120415984, "step": 3350}, {"loss": 0.6304, "grad_norm": 0.7547586560249329, "learning_rate": 0.0002, "epoch": 3.67816091954023, "step": 3360}, {"loss": 0.7066, "grad_norm": 0.686851978302002, "learning_rate": 0.0002, "epoch": 3.6891078270388613, "step": 3370}, {"loss": 0.6748, "grad_norm": 0.9429075717926025, "learning_rate": 0.0002, "epoch": 3.7000547345374932, "step": 3380}, {"loss": 0.6673, "grad_norm": 0.7283591032028198, "learning_rate": 0.0002, "epoch": 3.7110016420361247, "step": 3390}, {"loss": 0.7502, "grad_norm": 0.8323085904121399, "learning_rate": 0.0002, "epoch": 3.7219485495347566, "step": 3400}, {"loss": 0.7779, "grad_norm": 0.8529590964317322, "learning_rate": 0.0002, "epoch": 3.732895457033388, "step": 3410}, {"loss": 0.6555, "grad_norm": 0.731752872467041, "learning_rate": 0.0002, "epoch": 3.7438423645320196, "step": 3420}, {"loss": 0.6928, "grad_norm": 0.8572278618812561, "learning_rate": 0.0002, "epoch": 3.7547892720306515, "step": 3430}, {"loss": 0.6215, "grad_norm": 0.7408691048622131, "learning_rate": 0.0002, "epoch": 3.765736179529283, "step": 3440}, {"loss": 0.622, "grad_norm": 0.7470445036888123, "learning_rate": 0.0002, "epoch": 3.776683087027915, "step": 3450}, {"loss": 0.7241, "grad_norm": 0.6806244254112244, "learning_rate": 0.0002, "epoch": 3.7876299945265464, "step": 3460}, {"loss": 0.7739, "grad_norm": 0.9129069447517395, "learning_rate": 0.0002, "epoch": 3.798576902025178, "step": 3470}, {"loss": 0.6826, "grad_norm": 0.8717501759529114, "learning_rate": 0.0002, "epoch": 3.8095238095238093, "step": 3480}, {"loss": 0.6188, "grad_norm": 0.6761979460716248, "learning_rate": 0.0002, "epoch": 3.8204707170224412, "step": 3490}, {"loss": 0.7601, "grad_norm": 1.0054380893707275, "learning_rate": 0.0002, "epoch": 3.8314176245210727, "step": 3500}, {"loss": 0.623, "grad_norm": 1.1224009990692139, "learning_rate": 0.0002, "epoch": 3.8423645320197046, "step": 3510}, {"loss": 0.6918, "grad_norm": 0.8997692465782166, "learning_rate": 0.0002, "epoch": 3.853311439518336, "step": 3520}, {"loss": 0.6357, "grad_norm": 1.0086902379989624, "learning_rate": 0.0002, "epoch": 3.8642583470169676, "step": 3530}, {"loss": 0.6379, "grad_norm": 0.772739589214325, "learning_rate": 0.0002, "epoch": 3.8752052545155995, "step": 3540}, {"loss": 0.7423, "grad_norm": 1.211774230003357, "learning_rate": 0.0002, "epoch": 3.886152162014231, "step": 3550}, {"loss": 0.7321, "grad_norm": 0.9572356939315796, "learning_rate": 0.0002, "epoch": 3.897099069512863, "step": 3560}, {"loss": 0.6836, "grad_norm": 0.7887842655181885, "learning_rate": 0.0002, "epoch": 3.9080459770114944, "step": 3570}, {"loss": 0.7576, "grad_norm": 0.7308389544487, "learning_rate": 0.0002, "epoch": 3.918992884510126, "step": 3580}, {"loss": 0.6001, "grad_norm": 1.0182650089263916, "learning_rate": 0.0002, "epoch": 3.9299397920087573, "step": 3590}, {"loss": 0.6942, "grad_norm": 0.8000147342681885, "learning_rate": 0.0002, "epoch": 3.9408866995073892, "step": 3600}, {"loss": 0.6244, "grad_norm": 0.7385728359222412, "learning_rate": 0.0002, "epoch": 3.9518336070060207, "step": 3610}, {"loss": 0.6718, "grad_norm": 0.9233261942863464, "learning_rate": 0.0002, "epoch": 3.9627805145046526, "step": 3620}, {"loss": 0.6508, "grad_norm": 0.8486751914024353, "learning_rate": 0.0002, "epoch": 3.973727422003284, "step": 3630}, {"loss": 0.6928, "grad_norm": 0.7593663334846497, "learning_rate": 0.0002, "epoch": 3.9846743295019156, "step": 3640}, {"loss": 0.6847, "grad_norm": 0.7885415554046631, "learning_rate": 0.0002, "epoch": 3.9956212370005475, "step": 3650}, {"eval_loss": 1.250312328338623, "eval_runtime": 46.0842, "eval_samples_per_second": 9.461, "eval_steps_per_second": 1.193, "epoch": 4.0, "step": 3654}, {"loss": 0.5547, "grad_norm": 0.6591703295707703, "learning_rate": 0.0002, "epoch": 4.006568144499179, "step": 3660}, {"loss": 0.5301, "grad_norm": 1.36927330493927, "learning_rate": 0.0002, "epoch": 4.017515051997811, "step": 3670}, {"loss": 0.4466, "grad_norm": 0.8106328845024109, "learning_rate": 0.0002, "epoch": 4.028461959496442, "step": 3680}, {"loss": 0.4861, "grad_norm": 0.7592712044715881, "learning_rate": 0.0002, "epoch": 4.039408866995074, "step": 3690}, {"loss": 0.5103, "grad_norm": 0.9518909454345703, "learning_rate": 0.0002, "epoch": 4.050355774493705, "step": 3700}, {"loss": 0.4638, "grad_norm": 0.7805967330932617, "learning_rate": 0.0002, "epoch": 4.061302681992337, "step": 3710}, {"loss": 0.4556, "grad_norm": 1.3146334886550903, "learning_rate": 0.0002, "epoch": 4.072249589490969, "step": 3720}, {"loss": 0.5635, "grad_norm": 1.1611138582229614, "learning_rate": 0.0002, "epoch": 4.083196496989601, "step": 3730}, {"loss": 0.3845, "grad_norm": 0.8173232078552246, "learning_rate": 0.0002, "epoch": 4.094143404488232, "step": 3740}, {"loss": 0.4911, "grad_norm": 0.7848323583602905, "learning_rate": 0.0002, "epoch": 4.105090311986864, "step": 3750}, {"loss": 0.4519, "grad_norm": 1.3183201551437378, "learning_rate": 0.0002, "epoch": 4.116037219485495, "step": 3760}, {"loss": 0.5083, "grad_norm": 1.1936529874801636, "learning_rate": 0.0002, "epoch": 4.1269841269841265, "step": 3770}, {"loss": 0.5208, "grad_norm": 1.1078993082046509, "learning_rate": 0.0002, "epoch": 4.137931034482759, "step": 3780}, {"loss": 0.5928, "grad_norm": 1.107743263244629, "learning_rate": 0.0002, "epoch": 4.14887794198139, "step": 3790}, {"loss": 0.5112, "grad_norm": 0.7801875472068787, "learning_rate": 0.0002, "epoch": 4.159824849480022, "step": 3800}, {"loss": 0.4896, "grad_norm": 1.1328117847442627, "learning_rate": 0.0002, "epoch": 4.170771756978653, "step": 3810}, {"loss": 0.5645, "grad_norm": 1.4232193231582642, "learning_rate": 0.0002, "epoch": 4.181718664477285, "step": 3820}, {"loss": 0.5049, "grad_norm": 1.557416558265686, "learning_rate": 0.0002, "epoch": 4.192665571975917, "step": 3830}, {"loss": 0.4863, "grad_norm": 1.042923092842102, "learning_rate": 0.0002, "epoch": 4.203612479474549, "step": 3840}, {"loss": 0.3751, "grad_norm": 1.1801949739456177, "learning_rate": 0.0002, "epoch": 4.21455938697318, "step": 3850}, {"loss": 0.5063, "grad_norm": 0.9273753762245178, "learning_rate": 0.0002, "epoch": 4.225506294471812, "step": 3860}, {"loss": 0.5542, "grad_norm": 0.7681763768196106, "learning_rate": 0.0002, "epoch": 4.236453201970443, "step": 3870}, {"loss": 0.5971, "grad_norm": 0.9840841293334961, "learning_rate": 0.0002, "epoch": 4.2474001094690745, "step": 3880}, {"loss": 0.4648, "grad_norm": 1.0290725231170654, "learning_rate": 0.0002, "epoch": 4.258347016967707, "step": 3890}, {"loss": 0.4288, "grad_norm": 0.8059597611427307, "learning_rate": 0.0002, "epoch": 4.269293924466338, "step": 3900}, {"loss": 0.5103, "grad_norm": 0.9847467541694641, "learning_rate": 0.0002, "epoch": 4.28024083196497, "step": 3910}, {"loss": 0.4952, "grad_norm": 1.344044804573059, "learning_rate": 0.0002, "epoch": 4.291187739463601, "step": 3920}, {"loss": 0.4966, "grad_norm": 0.9174224138259888, "learning_rate": 0.0002, "epoch": 4.302134646962233, "step": 3930}, {"loss": 0.4944, "grad_norm": 1.1199711561203003, "learning_rate": 0.0002, "epoch": 4.313081554460865, "step": 3940}, {"loss": 0.4641, "grad_norm": 1.0120296478271484, "learning_rate": 0.0002, "epoch": 4.324028461959497, "step": 3950}, {"loss": 0.4723, "grad_norm": 1.091811180114746, "learning_rate": 0.0002, "epoch": 4.334975369458128, "step": 3960}, {"loss": 0.4627, "grad_norm": 1.0332133769989014, "learning_rate": 0.0002, "epoch": 4.34592227695676, "step": 3970}, {"loss": 0.4646, "grad_norm": 1.0785295963287354, "learning_rate": 0.0002, "epoch": 4.356869184455391, "step": 3980}, {"loss": 0.4909, "grad_norm": 1.0506969690322876, "learning_rate": 0.0002, "epoch": 4.3678160919540225, "step": 3990}, {"loss": 0.4776, "grad_norm": 1.047560691833496, "learning_rate": 0.0002, "epoch": 4.378762999452655, "step": 4000}, {"loss": 0.4549, "grad_norm": 0.9348800778388977, "learning_rate": 0.0002, "epoch": 4.389709906951286, "step": 4010}, {"loss": 0.5333, "grad_norm": 1.1563059091567993, "learning_rate": 0.0002, "epoch": 4.400656814449918, "step": 4020}, {"loss": 0.4952, "grad_norm": 1.001470923423767, "learning_rate": 0.0002, "epoch": 4.411603721948549, "step": 4030}, {"loss": 0.4972, "grad_norm": 1.309012532234192, "learning_rate": 0.0002, "epoch": 4.422550629447181, "step": 4040}, {"loss": 0.5078, "grad_norm": 0.7338925004005432, "learning_rate": 0.0002, "epoch": 4.433497536945813, "step": 4050}, {"loss": 0.4632, "grad_norm": 1.0398834943771362, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 4060}, {"loss": 0.6285, "grad_norm": 0.9728689193725586, "learning_rate": 0.0002, "epoch": 4.455391351943076, "step": 4070}, {"loss": 0.4778, "grad_norm": 1.247475028038025, "learning_rate": 0.0002, "epoch": 4.466338259441708, "step": 4080}, {"loss": 0.4813, "grad_norm": 1.1084578037261963, "learning_rate": 0.0002, "epoch": 4.477285166940339, "step": 4090}, {"loss": 0.5665, "grad_norm": 1.1619318723678589, "learning_rate": 0.0002, "epoch": 4.4882320744389705, "step": 4100}, {"loss": 0.5207, "grad_norm": 1.3456498384475708, "learning_rate": 0.0002, "epoch": 4.499178981937603, "step": 4110}, {"loss": 0.4876, "grad_norm": 0.9372991323471069, "learning_rate": 0.0002, "epoch": 4.510125889436234, "step": 4120}, {"loss": 0.5456, "grad_norm": 1.0071815252304077, "learning_rate": 0.0002, "epoch": 4.521072796934866, "step": 4130}, {"loss": 0.5589, "grad_norm": 1.190344214439392, "learning_rate": 0.0002, "epoch": 4.532019704433497, "step": 4140}, {"loss": 0.4852, "grad_norm": 0.9480887055397034, "learning_rate": 0.0002, "epoch": 4.542966611932129, "step": 4150}, {"loss": 0.5229, "grad_norm": 1.0252189636230469, "learning_rate": 0.0002, "epoch": 4.553913519430761, "step": 4160}, {"loss": 0.5253, "grad_norm": 0.7142013311386108, "learning_rate": 0.0002, "epoch": 4.564860426929393, "step": 4170}, {"loss": 0.4861, "grad_norm": 0.8937426805496216, "learning_rate": 0.0002, "epoch": 4.575807334428024, "step": 4180}, {"loss": 0.4773, "grad_norm": 0.8885005116462708, "learning_rate": 0.0002, "epoch": 4.586754241926656, "step": 4190}, {"loss": 0.4858, "grad_norm": 1.337663173675537, "learning_rate": 0.0002, "epoch": 4.597701149425287, "step": 4200}, {"loss": 0.5247, "grad_norm": 1.0475375652313232, "learning_rate": 0.0002, "epoch": 4.6086480569239185, "step": 4210}, {"loss": 0.5298, "grad_norm": 1.0081088542938232, "learning_rate": 0.0002, "epoch": 4.619594964422551, "step": 4220}, {"loss": 0.5042, "grad_norm": 0.7527595162391663, "learning_rate": 0.0002, "epoch": 4.630541871921182, "step": 4230}, {"loss": 0.5207, "grad_norm": 1.55559241771698, "learning_rate": 0.0002, "epoch": 4.641488779419814, "step": 4240}, {"loss": 0.5468, "grad_norm": 0.7967379689216614, "learning_rate": 0.0002, "epoch": 4.652435686918445, "step": 4250}, {"loss": 0.5328, "grad_norm": 0.898368775844574, "learning_rate": 0.0002, "epoch": 4.663382594417077, "step": 4260}, {"loss": 0.4706, "grad_norm": 1.1940776109695435, "learning_rate": 0.0002, "epoch": 4.674329501915709, "step": 4270}, {"loss": 0.5121, "grad_norm": 1.1817092895507812, "learning_rate": 0.0002, "epoch": 4.685276409414341, "step": 4280}, {"loss": 0.5758, "grad_norm": 0.9041520357131958, "learning_rate": 0.0002, "epoch": 4.696223316912972, "step": 4290}, {"loss": 0.5851, "grad_norm": 1.1280102729797363, "learning_rate": 0.0002, "epoch": 4.707170224411604, "step": 4300}, {"loss": 0.4891, "grad_norm": 1.357689619064331, "learning_rate": 0.0002, "epoch": 4.718117131910235, "step": 4310}, {"loss": 0.4704, "grad_norm": 1.056633472442627, "learning_rate": 0.0002, "epoch": 4.7290640394088665, "step": 4320}, {"loss": 0.5488, "grad_norm": 1.6520427465438843, "learning_rate": 0.0002, "epoch": 4.740010946907499, "step": 4330}, {"loss": 0.5131, "grad_norm": 1.153200626373291, "learning_rate": 0.0002, "epoch": 4.75095785440613, "step": 4340}, {"loss": 0.539, "grad_norm": 0.9346241354942322, "learning_rate": 0.0002, "epoch": 4.761904761904762, "step": 4350}, {"loss": 0.4941, "grad_norm": 0.8628455996513367, "learning_rate": 0.0002, "epoch": 4.772851669403393, "step": 4360}, {"loss": 0.5167, "grad_norm": 1.3843916654586792, "learning_rate": 0.0002, "epoch": 4.783798576902025, "step": 4370}, {"loss": 0.4683, "grad_norm": 1.035574197769165, "learning_rate": 0.0002, "epoch": 4.794745484400657, "step": 4380}, {"loss": 0.5162, "grad_norm": 1.1868361234664917, "learning_rate": 0.0002, "epoch": 4.805692391899289, "step": 4390}, {"loss": 0.534, "grad_norm": 1.1307647228240967, "learning_rate": 0.0002, "epoch": 4.81663929939792, "step": 4400}, {"loss": 0.5567, "grad_norm": 0.9787724614143372, "learning_rate": 0.0002, "epoch": 4.827586206896552, "step": 4410}, {"loss": 0.5185, "grad_norm": 1.0473824739456177, "learning_rate": 0.0002, "epoch": 4.838533114395183, "step": 4420}, {"loss": 0.6285, "grad_norm": 1.069069504737854, "learning_rate": 0.0002, "epoch": 4.8494800218938146, "step": 4430}, {"loss": 0.5267, "grad_norm": 1.4305680990219116, "learning_rate": 0.0002, "epoch": 4.860426929392447, "step": 4440}, {"loss": 0.5947, "grad_norm": 1.3679203987121582, "learning_rate": 0.0002, "epoch": 4.871373836891078, "step": 4450}, {"loss": 0.5135, "grad_norm": 0.8997844457626343, "learning_rate": 0.0002, "epoch": 4.88232074438971, "step": 4460}, {"loss": 0.5312, "grad_norm": 1.2758110761642456, "learning_rate": 0.0002, "epoch": 4.893267651888341, "step": 4470}, {"loss": 0.4914, "grad_norm": 0.8819465637207031, "learning_rate": 0.0002, "epoch": 4.904214559386973, "step": 4480}, {"loss": 0.5147, "grad_norm": 1.08329439163208, "learning_rate": 0.0002, "epoch": 4.915161466885605, "step": 4490}, {"loss": 0.5404, "grad_norm": 1.083461046218872, "learning_rate": 0.0002, "epoch": 4.926108374384237, "step": 4500}, {"loss": 0.5433, "grad_norm": 1.2387723922729492, "learning_rate": 0.0002, "epoch": 4.937055281882868, "step": 4510}, {"loss": 0.5624, "grad_norm": 0.8262293934822083, "learning_rate": 0.0002, "epoch": 4.9480021893815, "step": 4520}, {"loss": 0.504, "grad_norm": 1.2325191497802734, "learning_rate": 0.0002, "epoch": 4.958949096880131, "step": 4530}, {"loss": 0.5452, "grad_norm": 1.024614930152893, "learning_rate": 0.0002, "epoch": 4.9698960043787626, "step": 4540}, {"loss": 0.4752, "grad_norm": 1.3007521629333496, "learning_rate": 0.0002, "epoch": 4.980842911877395, "step": 4550}, {"loss": 0.4943, "grad_norm": 0.9823828339576721, "learning_rate": 0.0002, "epoch": 4.991789819376026, "step": 4560}, {"eval_loss": 1.3920727968215942, "eval_runtime": 46.0764, "eval_samples_per_second": 9.463, "eval_steps_per_second": 1.194, "epoch": 4.999452654625069, "step": 4567}, {"loss": 0.545, "grad_norm": 1.1478906869888306, "learning_rate": 0.0002, "epoch": 5.002736726874658, "step": 4570}, {"loss": 0.372, "grad_norm": 1.0533705949783325, "learning_rate": 0.0002, "epoch": 5.013683634373289, "step": 4580}, {"loss": 0.3313, "grad_norm": 1.268900752067566, "learning_rate": 0.0002, "epoch": 5.024630541871921, "step": 4590}, {"loss": 0.3482, "grad_norm": 1.222652554512024, "learning_rate": 0.0002, "epoch": 5.035577449370553, "step": 4600}, {"loss": 0.3195, "grad_norm": 1.5093127489089966, "learning_rate": 0.0002, "epoch": 5.046524356869185, "step": 4610}, {"loss": 0.3569, "grad_norm": 1.2372499704360962, "learning_rate": 0.0002, "epoch": 5.057471264367816, "step": 4620}, {"loss": 0.3206, "grad_norm": 0.8422666192054749, "learning_rate": 0.0002, "epoch": 5.068418171866448, "step": 4630}, {"loss": 0.3115, "grad_norm": 1.1451770067214966, "learning_rate": 0.0002, "epoch": 5.079365079365079, "step": 4640}, {"loss": 0.3305, "grad_norm": 1.2074557542800903, "learning_rate": 0.0002, "epoch": 5.090311986863711, "step": 4650}, {"loss": 0.3012, "grad_norm": 1.429150104522705, "learning_rate": 0.0002, "epoch": 5.101258894362343, "step": 4660}, {"loss": 0.3229, "grad_norm": 1.0353610515594482, "learning_rate": 0.0002, "epoch": 5.112205801860974, "step": 4670}, {"loss": 0.402, "grad_norm": 1.2845979928970337, "learning_rate": 0.0002, "epoch": 5.123152709359606, "step": 4680}, {"loss": 0.383, "grad_norm": 1.3790186643600464, "learning_rate": 0.0002, "epoch": 5.134099616858237, "step": 4690}, {"loss": 0.2951, "grad_norm": 1.3182239532470703, "learning_rate": 0.0002, "epoch": 5.145046524356869, "step": 4700}, {"loss": 0.4074, "grad_norm": 1.5249626636505127, "learning_rate": 0.0002, "epoch": 5.155993431855501, "step": 4710}, {"loss": 0.3703, "grad_norm": 1.2492733001708984, "learning_rate": 0.0002, "epoch": 5.166940339354133, "step": 4720}, {"loss": 0.3411, "grad_norm": 1.4455480575561523, "learning_rate": 0.0002, "epoch": 5.177887246852764, "step": 4730}, {"loss": 0.3996, "grad_norm": 1.2191482782363892, "learning_rate": 0.0002, "epoch": 5.188834154351396, "step": 4740}, {"loss": 0.3785, "grad_norm": 1.4707951545715332, "learning_rate": 0.0002, "epoch": 5.199781061850027, "step": 4750}, {"loss": 0.3516, "grad_norm": 1.3473678827285767, "learning_rate": 0.0002, "epoch": 5.210727969348659, "step": 4760}, {"loss": 0.3266, "grad_norm": 1.0479670763015747, "learning_rate": 0.0002, "epoch": 5.221674876847291, "step": 4770}, {"loss": 0.3976, "grad_norm": 1.299096703529358, "learning_rate": 0.0002, "epoch": 5.232621784345922, "step": 4780}, {"loss": 0.3266, "grad_norm": 1.2820168733596802, "learning_rate": 0.0002, "epoch": 5.243568691844554, "step": 4790}, {"loss": 0.3347, "grad_norm": 1.3818004131317139, "learning_rate": 0.0002, "epoch": 5.254515599343185, "step": 4800}, {"loss": 0.3761, "grad_norm": 1.2898736000061035, "learning_rate": 0.0002, "epoch": 5.265462506841817, "step": 4810}, {"loss": 0.3694, "grad_norm": 1.1761468648910522, "learning_rate": 0.0002, "epoch": 5.276409414340449, "step": 4820}, {"loss": 0.3806, "grad_norm": 1.7155952453613281, "learning_rate": 0.0002, "epoch": 5.287356321839081, "step": 4830}, {"loss": 0.322, "grad_norm": 0.9103642106056213, "learning_rate": 0.0002, "epoch": 5.298303229337712, "step": 4840}, {"loss": 0.3516, "grad_norm": 1.013015627861023, "learning_rate": 0.0002, "epoch": 5.309250136836344, "step": 4850}, {"loss": 0.4297, "grad_norm": 1.390471339225769, "learning_rate": 0.0002, "epoch": 5.320197044334975, "step": 4860}, {"loss": 0.4098, "grad_norm": 1.129770278930664, "learning_rate": 0.0002, "epoch": 5.331143951833607, "step": 4870}, {"loss": 0.4227, "grad_norm": 1.1461067199707031, "learning_rate": 0.0002, "epoch": 5.342090859332239, "step": 4880}, {"loss": 0.288, "grad_norm": 1.3587424755096436, "learning_rate": 0.0002, "epoch": 5.35303776683087, "step": 4890}, {"loss": 0.3604, "grad_norm": 1.6897879838943481, "learning_rate": 0.0002, "epoch": 5.363984674329502, "step": 4900}, {"loss": 0.3887, "grad_norm": 0.9298055768013, "learning_rate": 0.0002, "epoch": 5.374931581828133, "step": 4910}, {"loss": 0.3371, "grad_norm": 1.0006917715072632, "learning_rate": 0.0002, "epoch": 5.385878489326765, "step": 4920}, {"loss": 0.3992, "grad_norm": 1.232581377029419, "learning_rate": 0.0002, "epoch": 5.396825396825397, "step": 4930}, {"loss": 0.3456, "grad_norm": 1.0822620391845703, "learning_rate": 0.0002, "epoch": 5.407772304324029, "step": 4940}, {"loss": 0.3806, "grad_norm": 1.3648720979690552, "learning_rate": 0.0002, "epoch": 5.41871921182266, "step": 4950}, {"loss": 0.3959, "grad_norm": 1.3220354318618774, "learning_rate": 0.0002, "epoch": 5.429666119321292, "step": 4960}, {"loss": 0.3278, "grad_norm": 1.1106271743774414, "learning_rate": 0.0002, "epoch": 5.440613026819923, "step": 4970}, {"loss": 0.3812, "grad_norm": 1.6058908700942993, "learning_rate": 0.0002, "epoch": 5.451559934318555, "step": 4980}, {"loss": 0.3905, "grad_norm": 1.1065930128097534, "learning_rate": 0.0002, "epoch": 5.462506841817187, "step": 4990}, {"loss": 0.4058, "grad_norm": 1.3896466493606567, "learning_rate": 0.0002, "epoch": 5.473453749315818, "step": 5000}, {"loss": 0.4122, "grad_norm": 1.0437148809432983, "learning_rate": 0.0002, "epoch": 5.48440065681445, "step": 5010}, {"loss": 0.4065, "grad_norm": 1.2347718477249146, "learning_rate": 0.0002, "epoch": 5.495347564313081, "step": 5020}, {"loss": 0.3586, "grad_norm": 1.1174284219741821, "learning_rate": 0.0002, "epoch": 5.506294471811713, "step": 5030}, {"loss": 0.3576, "grad_norm": 1.2580941915512085, "learning_rate": 0.0002, "epoch": 5.517241379310345, "step": 5040}, {"loss": 0.3809, "grad_norm": 1.451090931892395, "learning_rate": 0.0002, "epoch": 5.528188286808977, "step": 5050}, {"loss": 0.3645, "grad_norm": 1.4688365459442139, "learning_rate": 0.0002, "epoch": 5.539135194307608, "step": 5060}, {"loss": 0.4431, "grad_norm": 1.1625734567642212, "learning_rate": 0.0002, "epoch": 5.55008210180624, "step": 5070}, {"loss": 0.3972, "grad_norm": 0.9332265257835388, "learning_rate": 0.0002, "epoch": 5.561029009304871, "step": 5080}, {"loss": 0.4, "grad_norm": 1.5635273456573486, "learning_rate": 0.0002, "epoch": 5.571975916803503, "step": 5090}, {"loss": 0.3651, "grad_norm": 1.3420509099960327, "learning_rate": 0.0002, "epoch": 5.582922824302135, "step": 5100}, {"loss": 0.3717, "grad_norm": 1.5826557874679565, "learning_rate": 0.0002, "epoch": 5.593869731800766, "step": 5110}, {"loss": 0.4256, "grad_norm": 1.5737065076828003, "learning_rate": 0.0002, "epoch": 5.604816639299398, "step": 5120}, {"loss": 0.39, "grad_norm": 1.3812499046325684, "learning_rate": 0.0002, "epoch": 5.615763546798029, "step": 5130}, {"loss": 0.3891, "grad_norm": 1.362833023071289, "learning_rate": 0.0002, "epoch": 5.626710454296661, "step": 5140}, {"loss": 0.455, "grad_norm": 1.7667874097824097, "learning_rate": 0.0002, "epoch": 5.637657361795293, "step": 5150}, {"loss": 0.4264, "grad_norm": 1.2661789655685425, "learning_rate": 0.0002, "epoch": 5.648604269293925, "step": 5160}, {"loss": 0.3261, "grad_norm": 1.2076870203018188, "learning_rate": 0.0002, "epoch": 5.659551176792556, "step": 5170}, {"loss": 0.372, "grad_norm": 1.2431524991989136, "learning_rate": 0.0002, "epoch": 5.670498084291188, "step": 5180}, {"loss": 0.4092, "grad_norm": 1.2216639518737793, "learning_rate": 0.0002, "epoch": 5.681444991789819, "step": 5190}, {"loss": 0.4171, "grad_norm": 0.9259352684020996, "learning_rate": 0.0002, "epoch": 5.692391899288451, "step": 5200}, {"loss": 0.3875, "grad_norm": 1.7929338216781616, "learning_rate": 0.0002, "epoch": 5.703338806787083, "step": 5210}, {"loss": 0.4424, "grad_norm": 1.4048460721969604, "learning_rate": 0.0002, "epoch": 5.714285714285714, "step": 5220}, {"loss": 0.3758, "grad_norm": 1.306874394416809, "learning_rate": 0.0002, "epoch": 5.725232621784346, "step": 5230}, {"loss": 0.3889, "grad_norm": 1.3137940168380737, "learning_rate": 0.0002, "epoch": 5.736179529282977, "step": 5240}, {"loss": 0.4804, "grad_norm": 1.1376476287841797, "learning_rate": 0.0002, "epoch": 5.747126436781609, "step": 5250}, {"loss": 0.377, "grad_norm": 1.450939416885376, "learning_rate": 0.0002, "epoch": 5.758073344280241, "step": 5260}, {"loss": 0.4732, "grad_norm": 0.983195960521698, "learning_rate": 0.0002, "epoch": 5.769020251778873, "step": 5270}, {"loss": 0.4041, "grad_norm": 1.66558837890625, "learning_rate": 0.0002, "epoch": 5.779967159277504, "step": 5280}, {"loss": 0.3643, "grad_norm": 0.9789204597473145, "learning_rate": 0.0002, "epoch": 5.790914066776136, "step": 5290}, {"loss": 0.3776, "grad_norm": 1.2110556364059448, "learning_rate": 0.0002, "epoch": 5.801860974274767, "step": 5300}, {"loss": 0.4049, "grad_norm": 1.3799304962158203, "learning_rate": 0.0002, "epoch": 5.812807881773399, "step": 5310}, {"loss": 0.4362, "grad_norm": 1.0570626258850098, "learning_rate": 0.0002, "epoch": 5.823754789272031, "step": 5320}, {"loss": 0.4716, "grad_norm": 1.4654436111450195, "learning_rate": 0.0002, "epoch": 5.834701696770662, "step": 5330}, {"loss": 0.4048, "grad_norm": 1.5216940641403198, "learning_rate": 0.0002, "epoch": 5.845648604269294, "step": 5340}, {"loss": 0.3848, "grad_norm": 1.018646001815796, "learning_rate": 0.0002, "epoch": 5.856595511767925, "step": 5350}, {"loss": 0.3705, "grad_norm": 1.028951644897461, "learning_rate": 0.0002, "epoch": 5.867542419266557, "step": 5360}, {"loss": 0.4213, "grad_norm": 2.571263313293457, "learning_rate": 0.0002, "epoch": 5.878489326765189, "step": 5370}, {"loss": 0.3647, "grad_norm": 1.3323984146118164, "learning_rate": 0.0002, "epoch": 5.889436234263821, "step": 5380}, {"loss": 0.4085, "grad_norm": 1.4317777156829834, "learning_rate": 0.0002, "epoch": 5.900383141762452, "step": 5390}, {"loss": 0.4254, "grad_norm": 1.4289140701293945, "learning_rate": 0.0002, "epoch": 5.911330049261084, "step": 5400}, {"loss": 0.3993, "grad_norm": 1.3130780458450317, "learning_rate": 0.0002, "epoch": 5.922276956759715, "step": 5410}, {"loss": 0.4025, "grad_norm": 1.3979902267456055, "learning_rate": 0.0002, "epoch": 5.933223864258347, "step": 5420}, {"loss": 0.3997, "grad_norm": 1.1827352046966553, "learning_rate": 0.0002, "epoch": 5.944170771756979, "step": 5430}, {"loss": 0.4163, "grad_norm": 1.1672080755233765, "learning_rate": 0.0002, "epoch": 5.95511767925561, "step": 5440}, {"loss": 0.4425, "grad_norm": 1.0949620008468628, "learning_rate": 0.0002, "epoch": 5.966064586754242, "step": 5450}, {"loss": 0.4219, "grad_norm": 1.3183925151824951, "learning_rate": 0.0002, "epoch": 5.977011494252873, "step": 5460}, {"loss": 0.4171, "grad_norm": 1.096198320388794, "learning_rate": 0.0002, "epoch": 5.987958401751505, "step": 5470}, {"loss": 0.3886, "grad_norm": 1.2601423263549805, "learning_rate": 0.0002, "epoch": 5.998905309250137, "step": 5480}, {"eval_loss": 1.611358880996704, "eval_runtime": 46.0638, "eval_samples_per_second": 9.465, "eval_steps_per_second": 1.194, "epoch": 6.0, "step": 5481}, {"loss": 0.2616, "grad_norm": 0.9854364991188049, "learning_rate": 0.0002, "epoch": 6.009852216748769, "step": 5490}, {"loss": 0.2412, "grad_norm": 1.8073689937591553, "learning_rate": 0.0002, "epoch": 6.0207991242474, "step": 5500}, {"loss": 0.2317, "grad_norm": 1.1852164268493652, "learning_rate": 0.0002, "epoch": 6.031746031746032, "step": 5510}, {"loss": 0.224, "grad_norm": 1.0937914848327637, "learning_rate": 0.0002, "epoch": 6.042692939244663, "step": 5520}, {"loss": 0.2473, "grad_norm": 0.7411194443702698, "learning_rate": 0.0002, "epoch": 6.053639846743295, "step": 5530}, {"loss": 0.2846, "grad_norm": 1.552127480506897, "learning_rate": 0.0002, "epoch": 6.064586754241927, "step": 5540}, {"loss": 0.2639, "grad_norm": 1.0465604066848755, "learning_rate": 0.0002, "epoch": 6.075533661740558, "step": 5550}, {"loss": 0.2696, "grad_norm": 1.4008121490478516, "learning_rate": 0.0002, "epoch": 6.08648056923919, "step": 5560}, {"loss": 0.3049, "grad_norm": 1.7049046754837036, "learning_rate": 0.0002, "epoch": 6.097427476737821, "step": 5570}, {"loss": 0.263, "grad_norm": 1.111151933670044, "learning_rate": 0.0002, "epoch": 6.108374384236453, "step": 5580}, {"loss": 0.2816, "grad_norm": 1.4271087646484375, "learning_rate": 0.0002, "epoch": 6.119321291735085, "step": 5590}, {"loss": 0.2878, "grad_norm": 1.3917373418807983, "learning_rate": 0.0002, "epoch": 6.130268199233717, "step": 5600}, {"loss": 0.2482, "grad_norm": 1.013689637184143, "learning_rate": 0.0002, "epoch": 6.141215106732348, "step": 5610}, {"loss": 0.2841, "grad_norm": 1.342645525932312, "learning_rate": 0.0002, "epoch": 6.15216201423098, "step": 5620}, {"loss": 0.2335, "grad_norm": 1.4480562210083008, "learning_rate": 0.0002, "epoch": 6.163108921729611, "step": 5630}, {"loss": 0.2696, "grad_norm": 1.2483175992965698, "learning_rate": 0.0002, "epoch": 6.174055829228243, "step": 5640}, {"loss": 0.2656, "grad_norm": 1.2944550514221191, "learning_rate": 0.0002, "epoch": 6.185002736726875, "step": 5650}, {"loss": 0.2704, "grad_norm": 1.264142632484436, "learning_rate": 0.0002, "epoch": 6.195949644225506, "step": 5660}, {"loss": 0.2971, "grad_norm": 1.2068781852722168, "learning_rate": 0.0002, "epoch": 6.206896551724138, "step": 5670}, {"loss": 0.2882, "grad_norm": 1.0401629209518433, "learning_rate": 0.0002, "epoch": 6.217843459222769, "step": 5680}, {"loss": 0.3022, "grad_norm": 1.2054402828216553, "learning_rate": 0.0002, "epoch": 6.228790366721401, "step": 5690}, {"loss": 0.2949, "grad_norm": 1.1278687715530396, "learning_rate": 0.0002, "epoch": 6.239737274220033, "step": 5700}, {"loss": 0.2477, "grad_norm": 1.24592125415802, "learning_rate": 0.0002, "epoch": 6.250684181718665, "step": 5710}, {"loss": 0.246, "grad_norm": 1.2686697244644165, "learning_rate": 0.0002, "epoch": 6.261631089217296, "step": 5720}, {"loss": 0.2974, "grad_norm": 1.1836518049240112, "learning_rate": 0.0002, "epoch": 6.272577996715928, "step": 5730}, {"loss": 0.2963, "grad_norm": 1.387752890586853, "learning_rate": 0.0002, "epoch": 6.283524904214559, "step": 5740}, {"loss": 0.2961, "grad_norm": 1.9390363693237305, "learning_rate": 0.0002, "epoch": 6.294471811713191, "step": 5750}, {"loss": 0.2765, "grad_norm": 1.2919824123382568, "learning_rate": 0.0002, "epoch": 6.305418719211823, "step": 5760}, {"loss": 0.2898, "grad_norm": 1.2793965339660645, "learning_rate": 0.0002, "epoch": 6.316365626710454, "step": 5770}, {"loss": 0.2786, "grad_norm": 1.5486980676651, "learning_rate": 0.0002, "epoch": 6.327312534209086, "step": 5780}, {"loss": 0.2684, "grad_norm": 1.2757408618927002, "learning_rate": 0.0002, "epoch": 6.338259441707717, "step": 5790}, {"loss": 0.2841, "grad_norm": 1.3245713710784912, "learning_rate": 0.0002, "epoch": 6.349206349206349, "step": 5800}, {"loss": 0.3096, "grad_norm": 1.6262527704238892, "learning_rate": 0.0002, "epoch": 6.360153256704981, "step": 5810}, {"loss": 0.3219, "grad_norm": 1.465224027633667, "learning_rate": 0.0002, "epoch": 6.371100164203613, "step": 5820}, {"loss": 0.2703, "grad_norm": 1.437408447265625, "learning_rate": 0.0002, "epoch": 6.382047071702244, "step": 5830}, {"loss": 0.3012, "grad_norm": 1.3094626665115356, "learning_rate": 0.0002, "epoch": 6.392993979200876, "step": 5840}, {"loss": 0.2991, "grad_norm": 1.6717544794082642, "learning_rate": 0.0002, "epoch": 6.403940886699507, "step": 5850}, {"loss": 0.2892, "grad_norm": 1.1023344993591309, "learning_rate": 0.0002, "epoch": 6.414887794198139, "step": 5860}, {"loss": 0.3078, "grad_norm": 1.2397106885910034, "learning_rate": 0.0002, "epoch": 6.425834701696771, "step": 5870}, {"loss": 0.2984, "grad_norm": 1.6139185428619385, "learning_rate": 0.0002, "epoch": 6.436781609195402, "step": 5880}, {"loss": 0.2353, "grad_norm": 1.3164576292037964, "learning_rate": 0.0002, "epoch": 6.447728516694034, "step": 5890}, {"loss": 0.2772, "grad_norm": 1.3317217826843262, "learning_rate": 0.0002, "epoch": 6.458675424192665, "step": 5900}, {"loss": 0.2555, "grad_norm": 1.215008020401001, "learning_rate": 0.0002, "epoch": 6.469622331691297, "step": 5910}, {"loss": 0.2715, "grad_norm": 1.625672698020935, "learning_rate": 0.0002, "epoch": 6.480569239189929, "step": 5920}, {"loss": 0.2938, "grad_norm": 1.1262489557266235, "learning_rate": 0.0002, "epoch": 6.491516146688561, "step": 5930}, {"loss": 0.2921, "grad_norm": 1.447100281715393, "learning_rate": 0.0002, "epoch": 6.502463054187192, "step": 5940}, {"loss": 0.3059, "grad_norm": 1.3306448459625244, "learning_rate": 0.0002, "epoch": 6.513409961685824, "step": 5950}, {"loss": 0.2922, "grad_norm": 1.307732105255127, "learning_rate": 0.0002, "epoch": 6.524356869184455, "step": 5960}, {"loss": 0.2891, "grad_norm": 1.1851097345352173, "learning_rate": 0.0002, "epoch": 6.535303776683087, "step": 5970}, {"loss": 0.2859, "grad_norm": 1.462816596031189, "learning_rate": 0.0002, "epoch": 6.546250684181719, "step": 5980}, {"loss": 0.2698, "grad_norm": 1.2324728965759277, "learning_rate": 0.0002, "epoch": 6.55719759168035, "step": 5990}, {"loss": 0.2672, "grad_norm": 1.3627429008483887, "learning_rate": 0.0002, "epoch": 6.568144499178982, "step": 6000}, {"loss": 0.3182, "grad_norm": 1.94977867603302, "learning_rate": 0.0002, "epoch": 6.579091406677613, "step": 6010}, {"loss": 0.3183, "grad_norm": 1.459844946861267, "learning_rate": 0.0002, "epoch": 6.590038314176245, "step": 6020}, {"loss": 0.3142, "grad_norm": 1.4454325437545776, "learning_rate": 0.0002, "epoch": 6.600985221674877, "step": 6030}, {"loss": 0.269, "grad_norm": 1.4245165586471558, "learning_rate": 0.0002, "epoch": 6.611932129173509, "step": 6040}, {"loss": 0.3041, "grad_norm": 1.195803165435791, "learning_rate": 0.0002, "epoch": 6.62287903667214, "step": 6050}, {"loss": 0.3075, "grad_norm": 1.3589898347854614, "learning_rate": 0.0002, "epoch": 6.633825944170772, "step": 6060}, {"loss": 0.3291, "grad_norm": 1.3488036394119263, "learning_rate": 0.0002, "epoch": 6.644772851669403, "step": 6070}, {"loss": 0.2898, "grad_norm": 1.0954102277755737, "learning_rate": 0.0002, "epoch": 6.655719759168035, "step": 6080}, {"loss": 0.3489, "grad_norm": 1.4431062936782837, "learning_rate": 0.0002, "epoch": 6.666666666666667, "step": 6090}, {"loss": 0.2816, "grad_norm": 1.4387465715408325, "learning_rate": 0.0002, "epoch": 6.677613574165298, "step": 6100}, {"loss": 0.2705, "grad_norm": 1.8398990631103516, "learning_rate": 0.0002, "epoch": 6.68856048166393, "step": 6110}, {"loss": 0.3214, "grad_norm": 1.3523273468017578, "learning_rate": 0.0002, "epoch": 6.699507389162561, "step": 6120}, {"loss": 0.287, "grad_norm": 1.6326191425323486, "learning_rate": 0.0002, "epoch": 6.710454296661193, "step": 6130}, {"loss": 0.2857, "grad_norm": 1.3677960634231567, "learning_rate": 0.0002, "epoch": 6.721401204159825, "step": 6140}, {"loss": 0.3264, "grad_norm": 1.1993201971054077, "learning_rate": 0.0002, "epoch": 6.732348111658457, "step": 6150}, {"loss": 0.3071, "grad_norm": 1.1864078044891357, "learning_rate": 0.0002, "epoch": 6.743295019157088, "step": 6160}, {"loss": 0.3087, "grad_norm": 1.1625522375106812, "learning_rate": 0.0002, "epoch": 6.75424192665572, "step": 6170}, {"loss": 0.3551, "grad_norm": 1.5803234577178955, "learning_rate": 0.0002, "epoch": 6.765188834154351, "step": 6180}, {"loss": 0.3059, "grad_norm": 1.151746153831482, "learning_rate": 0.0002, "epoch": 6.776135741652983, "step": 6190}, {"loss": 0.2697, "grad_norm": 1.0727161169052124, "learning_rate": 0.0002, "epoch": 6.787082649151615, "step": 6200}, {"loss": 0.2844, "grad_norm": 1.4148162603378296, "learning_rate": 0.0002, "epoch": 6.798029556650246, "step": 6210}, {"loss": 0.3417, "grad_norm": 1.2071447372436523, "learning_rate": 0.0002, "epoch": 6.808976464148878, "step": 6220}, {"loss": 0.3066, "grad_norm": 1.3843804597854614, "learning_rate": 0.0002, "epoch": 6.819923371647509, "step": 6230}, {"loss": 0.2769, "grad_norm": 1.2490662336349487, "learning_rate": 0.0002, "epoch": 6.830870279146141, "step": 6240}, {"loss": 0.3237, "grad_norm": 1.6029689311981201, "learning_rate": 0.0002, "epoch": 6.841817186644773, "step": 6250}, {"loss": 0.3152, "grad_norm": 1.0388455390930176, "learning_rate": 0.0002, "epoch": 6.852764094143405, "step": 6260}, {"loss": 0.3026, "grad_norm": 1.3883857727050781, "learning_rate": 0.0002, "epoch": 6.863711001642036, "step": 6270}, {"loss": 0.3175, "grad_norm": 1.0500187873840332, "learning_rate": 0.0002, "epoch": 6.874657909140668, "step": 6280}, {"loss": 0.2952, "grad_norm": 1.4243487119674683, "learning_rate": 0.0002, "epoch": 6.885604816639299, "step": 6290}, {"loss": 0.2679, "grad_norm": 1.3169665336608887, "learning_rate": 0.0002, "epoch": 6.896551724137931, "step": 6300}, {"loss": 0.3291, "grad_norm": 1.5261493921279907, "learning_rate": 0.0002, "epoch": 6.907498631636563, "step": 6310}, {"loss": 0.3344, "grad_norm": 1.578403115272522, "learning_rate": 0.0002, "epoch": 6.9184455391351944, "step": 6320}, {"loss": 0.3263, "grad_norm": 1.4093263149261475, "learning_rate": 0.0002, "epoch": 6.929392446633826, "step": 6330}, {"loss": 0.3396, "grad_norm": 1.4003552198410034, "learning_rate": 0.0002, "epoch": 6.940339354132457, "step": 6340}, {"loss": 0.3476, "grad_norm": 1.650190830230713, "learning_rate": 0.0002, "epoch": 6.951286261631089, "step": 6350}, {"loss": 0.3442, "grad_norm": 1.2314515113830566, "learning_rate": 0.0002, "epoch": 6.962233169129721, "step": 6360}, {"loss": 0.3341, "grad_norm": 1.270980954170227, "learning_rate": 0.0002, "epoch": 6.973180076628353, "step": 6370}, {"loss": 0.3425, "grad_norm": 1.6352545022964478, "learning_rate": 0.0002, "epoch": 6.984126984126984, "step": 6380}, {"loss": 0.3647, "grad_norm": 1.3744925260543823, "learning_rate": 0.0002, "epoch": 6.995073891625616, "step": 6390}, {"eval_loss": 1.756764531135559, "eval_runtime": 46.0542, "eval_samples_per_second": 9.467, "eval_steps_per_second": 1.194, "epoch": 6.999452654625069, "step": 6394}, {"loss": 0.2356, "grad_norm": 0.856991171836853, "learning_rate": 0.0002, "epoch": 7.006020799124247, "step": 6400}, {"loss": 0.2138, "grad_norm": 0.9483422040939331, "learning_rate": 0.0002, "epoch": 7.016967706622879, "step": 6410}, {"loss": 0.1884, "grad_norm": 1.0703433752059937, "learning_rate": 0.0002, "epoch": 7.027914614121511, "step": 6420}, {"loss": 0.2088, "grad_norm": 1.761413812637329, "learning_rate": 0.0002, "epoch": 7.0388615216201424, "step": 6430}, {"loss": 0.2156, "grad_norm": 0.9484238028526306, "learning_rate": 0.0002, "epoch": 7.049808429118774, "step": 6440}, {"loss": 0.2323, "grad_norm": 1.5663186311721802, "learning_rate": 0.0002, "epoch": 7.060755336617405, "step": 6450}, {"loss": 0.2089, "grad_norm": 0.7692174315452576, "learning_rate": 0.0002, "epoch": 7.071702244116037, "step": 6460}, {"loss": 0.1856, "grad_norm": 1.3554800748825073, "learning_rate": 0.0002, "epoch": 7.082649151614669, "step": 6470}, {"loss": 0.2057, "grad_norm": 0.9705919027328491, "learning_rate": 0.0002, "epoch": 7.093596059113301, "step": 6480}, {"loss": 0.2068, "grad_norm": 1.355778694152832, "learning_rate": 0.0002, "epoch": 7.104542966611932, "step": 6490}, {"loss": 0.2021, "grad_norm": 1.5389477014541626, "learning_rate": 0.0002, "epoch": 7.115489874110564, "step": 6500}, {"loss": 0.1963, "grad_norm": 0.9565434455871582, "learning_rate": 0.0002, "epoch": 7.126436781609195, "step": 6510}, {"loss": 0.2323, "grad_norm": 1.101539134979248, "learning_rate": 0.0002, "epoch": 7.137383689107827, "step": 6520}, {"loss": 0.2024, "grad_norm": 0.925153374671936, "learning_rate": 0.0002, "epoch": 7.148330596606459, "step": 6530}, {"loss": 0.1955, "grad_norm": 1.1609078645706177, "learning_rate": 0.0002, "epoch": 7.1592775041050905, "step": 6540}, {"loss": 0.2022, "grad_norm": 0.8908484578132629, "learning_rate": 0.0002, "epoch": 7.170224411603722, "step": 6550}, {"loss": 0.2378, "grad_norm": 0.9066158533096313, "learning_rate": 0.0002, "epoch": 7.181171319102353, "step": 6560}, {"loss": 0.1955, "grad_norm": 1.3601553440093994, "learning_rate": 0.0002, "epoch": 7.192118226600985, "step": 6570}, {"loss": 0.241, "grad_norm": 1.0034444332122803, "learning_rate": 0.0002, "epoch": 7.203065134099617, "step": 6580}, {"loss": 0.2134, "grad_norm": 1.608299970626831, "learning_rate": 0.0002, "epoch": 7.214012041598249, "step": 6590}, {"loss": 0.2089, "grad_norm": 1.2889668941497803, "learning_rate": 0.0002, "epoch": 7.22495894909688, "step": 6600}, {"loss": 0.2405, "grad_norm": 0.9896159768104553, "learning_rate": 0.0002, "epoch": 7.235905856595512, "step": 6610}, {"loss": 0.2091, "grad_norm": 1.408511996269226, "learning_rate": 0.0002, "epoch": 7.246852764094143, "step": 6620}, {"loss": 0.1958, "grad_norm": 1.0823664665222168, "learning_rate": 0.0002, "epoch": 7.257799671592775, "step": 6630}, {"loss": 0.2117, "grad_norm": 1.027026891708374, "learning_rate": 0.0002, "epoch": 7.268746579091407, "step": 6640}, {"loss": 0.2319, "grad_norm": 1.0922648906707764, "learning_rate": 0.0002, "epoch": 7.2796934865900385, "step": 6650}, {"loss": 0.2367, "grad_norm": 1.3361082077026367, "learning_rate": 0.0002, "epoch": 7.29064039408867, "step": 6660}, {"loss": 0.2299, "grad_norm": 1.9565683603286743, "learning_rate": 0.0002, "epoch": 7.301587301587301, "step": 6670}, {"loss": 0.2248, "grad_norm": 1.413672685623169, "learning_rate": 0.0002, "epoch": 7.312534209085933, "step": 6680}, {"loss": 0.2306, "grad_norm": 1.121842384338379, "learning_rate": 0.0002, "epoch": 7.323481116584565, "step": 6690}, {"loss": 0.2222, "grad_norm": 1.0622057914733887, "learning_rate": 0.0002, "epoch": 7.334428024083197, "step": 6700}, {"loss": 0.2387, "grad_norm": 1.280921459197998, "learning_rate": 0.0002, "epoch": 7.345374931581828, "step": 6710}, {"loss": 0.2428, "grad_norm": 1.5295953750610352, "learning_rate": 0.0002, "epoch": 7.35632183908046, "step": 6720}, {"loss": 0.2149, "grad_norm": 1.4289230108261108, "learning_rate": 0.0002, "epoch": 7.367268746579091, "step": 6730}, {"loss": 0.2172, "grad_norm": 1.535111665725708, "learning_rate": 0.0002, "epoch": 7.378215654077723, "step": 6740}, {"loss": 0.2262, "grad_norm": 1.777826189994812, "learning_rate": 0.0002, "epoch": 7.389162561576355, "step": 6750}, {"loss": 0.2246, "grad_norm": 1.5058139562606812, "learning_rate": 0.0002, "epoch": 7.4001094690749865, "step": 6760}, {"loss": 0.2323, "grad_norm": 0.9381663203239441, "learning_rate": 0.0002, "epoch": 7.411056376573618, "step": 6770}, {"loss": 0.2078, "grad_norm": 1.4739434719085693, "learning_rate": 0.0002, "epoch": 7.422003284072249, "step": 6780}, {"loss": 0.2493, "grad_norm": 1.8703559637069702, "learning_rate": 0.0002, "epoch": 7.432950191570881, "step": 6790}, {"loss": 0.2554, "grad_norm": 1.2242027521133423, "learning_rate": 0.0002, "epoch": 7.443897099069513, "step": 6800}, {"loss": 0.2258, "grad_norm": 1.3950374126434326, "learning_rate": 0.0002, "epoch": 7.454844006568145, "step": 6810}, {"loss": 0.2365, "grad_norm": 1.461701512336731, "learning_rate": 0.0002, "epoch": 7.465790914066776, "step": 6820}, {"loss": 0.2302, "grad_norm": 1.4460340738296509, "learning_rate": 0.0002, "epoch": 7.476737821565408, "step": 6830}, {"loss": 0.2294, "grad_norm": 1.0341510772705078, "learning_rate": 0.0002, "epoch": 7.487684729064039, "step": 6840}, {"loss": 0.2338, "grad_norm": 0.8885145783424377, "learning_rate": 0.0002, "epoch": 7.498631636562671, "step": 6850}, {"loss": 0.2424, "grad_norm": 2.4326062202453613, "learning_rate": 0.0002, "epoch": 7.509578544061303, "step": 6860}, {"loss": 0.2352, "grad_norm": 1.1390372514724731, "learning_rate": 0.0002, "epoch": 7.5205254515599345, "step": 6870}, {"loss": 0.2184, "grad_norm": 1.2346464395523071, "learning_rate": 0.0002, "epoch": 7.531472359058566, "step": 6880}, {"loss": 0.2389, "grad_norm": 1.6705836057662964, "learning_rate": 0.0002, "epoch": 7.542419266557197, "step": 6890}, {"loss": 0.2346, "grad_norm": 0.8130379319190979, "learning_rate": 0.0002, "epoch": 7.553366174055829, "step": 6900}, {"loss": 0.2165, "grad_norm": 1.2974088191986084, "learning_rate": 0.0002, "epoch": 7.564313081554461, "step": 6910}, {"loss": 0.2328, "grad_norm": 1.3465348482131958, "learning_rate": 0.0002, "epoch": 7.575259989053093, "step": 6920}, {"loss": 0.2789, "grad_norm": 1.245126724243164, "learning_rate": 0.0002, "epoch": 7.586206896551724, "step": 6930}, {"loss": 0.2446, "grad_norm": 1.3736917972564697, "learning_rate": 0.0002, "epoch": 7.597153804050356, "step": 6940}, {"loss": 0.2483, "grad_norm": 1.340989351272583, "learning_rate": 0.0002, "epoch": 7.608100711548987, "step": 6950}, {"loss": 0.3014, "grad_norm": 1.1082850694656372, "learning_rate": 0.0002, "epoch": 7.619047619047619, "step": 6960}, {"loss": 0.2319, "grad_norm": 1.3829188346862793, "learning_rate": 0.0002, "epoch": 7.629994526546251, "step": 6970}, {"loss": 0.2299, "grad_norm": 1.5384989976882935, "learning_rate": 0.0002, "epoch": 7.6409414340448825, "step": 6980}, {"loss": 0.2457, "grad_norm": 1.1061540842056274, "learning_rate": 0.0002, "epoch": 7.651888341543514, "step": 6990}, {"loss": 0.2216, "grad_norm": 1.2673815488815308, "learning_rate": 0.0002, "epoch": 7.662835249042145, "step": 7000}, {"loss": 0.2397, "grad_norm": 1.2290737628936768, "learning_rate": 0.0002, "epoch": 7.673782156540777, "step": 7010}, {"loss": 0.2112, "grad_norm": 1.4055291414260864, "learning_rate": 0.0002, "epoch": 7.684729064039409, "step": 7020}, {"loss": 0.2548, "grad_norm": 1.7786750793457031, "learning_rate": 0.0002, "epoch": 7.695675971538041, "step": 7030}, {"loss": 0.2241, "grad_norm": 1.454209566116333, "learning_rate": 0.0002, "epoch": 7.706622879036672, "step": 7040}, {"loss": 0.2461, "grad_norm": 1.3995633125305176, "learning_rate": 0.0002, "epoch": 7.717569786535304, "step": 7050}, {"loss": 0.2785, "grad_norm": 1.7514715194702148, "learning_rate": 0.0002, "epoch": 7.728516694033935, "step": 7060}, {"loss": 0.2335, "grad_norm": 1.5538004636764526, "learning_rate": 0.0002, "epoch": 7.739463601532567, "step": 7070}, {"loss": 0.2245, "grad_norm": 1.122506856918335, "learning_rate": 0.0002, "epoch": 7.750410509031199, "step": 7080}, {"loss": 0.2473, "grad_norm": 1.2445831298828125, "learning_rate": 0.0002, "epoch": 7.7613574165298305, "step": 7090}, {"loss": 0.2421, "grad_norm": 1.1478949785232544, "learning_rate": 0.0002, "epoch": 7.772304324028462, "step": 7100}, {"loss": 0.2346, "grad_norm": 1.4352518320083618, "learning_rate": 0.0002, "epoch": 7.783251231527093, "step": 7110}, {"loss": 0.2351, "grad_norm": 1.511096715927124, "learning_rate": 0.0002, "epoch": 7.794198139025725, "step": 7120}, {"loss": 0.2586, "grad_norm": 1.2296271324157715, "learning_rate": 0.0002, "epoch": 7.805145046524357, "step": 7130}, {"loss": 0.2453, "grad_norm": 1.7886443138122559, "learning_rate": 0.0002, "epoch": 7.816091954022989, "step": 7140}, {"loss": 0.2645, "grad_norm": 1.8886322975158691, "learning_rate": 0.0002, "epoch": 7.82703886152162, "step": 7150}, {"loss": 0.2658, "grad_norm": 1.3493725061416626, "learning_rate": 0.0002, "epoch": 7.837985769020252, "step": 7160}, {"loss": 0.2787, "grad_norm": 1.379209041595459, "learning_rate": 0.0002, "epoch": 7.848932676518883, "step": 7170}, {"loss": 0.248, "grad_norm": 0.9374330043792725, "learning_rate": 0.0002, "epoch": 7.859879584017515, "step": 7180}, {"loss": 0.2412, "grad_norm": 1.0391291379928589, "learning_rate": 0.0002, "epoch": 7.870826491516147, "step": 7190}, {"loss": 0.2648, "grad_norm": 1.2710281610488892, "learning_rate": 0.0002, "epoch": 7.8817733990147785, "step": 7200}, {"loss": 0.2511, "grad_norm": 1.6858662366867065, "learning_rate": 0.0002, "epoch": 7.89272030651341, "step": 7210}, {"loss": 0.2667, "grad_norm": 1.0925853252410889, "learning_rate": 0.0002, "epoch": 7.903667214012041, "step": 7220}, {"loss": 0.2756, "grad_norm": 1.7404073476791382, "learning_rate": 0.0002, "epoch": 7.914614121510673, "step": 7230}, {"loss": 0.2486, "grad_norm": 1.284067153930664, "learning_rate": 0.0002, "epoch": 7.925561029009305, "step": 7240}, {"loss": 0.2394, "grad_norm": 1.3801543712615967, "learning_rate": 0.0002, "epoch": 7.936507936507937, "step": 7250}, {"loss": 0.2761, "grad_norm": 1.4068974256515503, "learning_rate": 0.0002, "epoch": 7.947454844006568, "step": 7260}, {"loss": 0.2455, "grad_norm": 1.770037055015564, "learning_rate": 0.0002, "epoch": 7.9584017515052, "step": 7270}, {"loss": 0.2863, "grad_norm": 1.473775029182434, "learning_rate": 0.0002, "epoch": 7.969348659003831, "step": 7280}, {"loss": 0.2722, "grad_norm": 1.4878343343734741, "learning_rate": 0.0002, "epoch": 7.980295566502463, "step": 7290}, {"loss": 0.2921, "grad_norm": 1.2178987264633179, "learning_rate": 0.0002, "epoch": 7.991242474001095, "step": 7300}]}