diff --git a/.gitattributes b/.gitattributes index 72b295fb431f2294183b2bacf217c52b0ce4a5e4..eb37209c874bce2f3e6e4d5e50aa1c4050416f37 100644 --- a/.gitattributes +++ b/.gitattributes @@ -397,3 +397,12 @@ Meta-Llama-3-8B-Instruct_int4_winogrande-routerbench-0shot_lr-0.0002_e-8_seq-512 Qwen2-7B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-89582-sd-1/checkpoint-11920/tokenizer.json filter=lfs diff=lfs merge=lfs -text Qwen2-7B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-89582-sd-1/checkpoint-23840/tokenizer.json filter=lfs diff=lfs merge=lfs -text Qwen2-7B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-89582-sd-1/checkpoint-35760/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e98db163734cc03f7a8f8b3f720d3a2befdf7453 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6736da95acf74ac199158de2bcd2f45e279c6a48 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38b9d4be16ebe4fd6dec61f83710d425e98161ba60be90a89bdaeb9c11beb3ef +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e98db163734cc03f7a8f8b3f720d3a2befdf7453 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..772f31cf2e416510d41cafab8c761d209cefcd08 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:deb96632b42907769e925b98a199f6fcdc0a7831f82dff2e4906d156ed0cb2a6 +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4fb2328c3d69925f2df5bd45759191492150b200 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c03c76ae0896a0881fe52bd3b7a6e82c8932690a2e4f43c1a9dbbe48aaa334d6 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..855aac6548d56b320fc19c4b5d0eb6086d47ad28 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:056b2e3553bda71d84a70bc08025bb619086e3c0102977dd22d36e089f6403b6 +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6f216c1dabc2a6822f7dd858f323c56da9bb201 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cba4916bdb6b8f76363dde603abe1017bf6994aa96373586eaba8b95fa562bfd +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..a4a305d1de4d8f47c0252b4d7fe65a10dd8e2c22 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7eee611703c5ce5d1eee32d9cdcfe465647b8aff0c1dfb3bed7ad7dbb05060 +size 34362873 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2be701027ddef62ed06450c07b5aa2ca2b4edf6a --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/trainer_state.json @@ -0,0 +1,7307 @@ +{ + "best_metric": 1.8068748712539673, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577", + "epoch": 7.996895615056267, + "eval_steps": 10, + "global_step": 10304, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007760962359332557, + "grad_norm": 1.0751162767410278, + "learning_rate": 0.0002, + "loss": 3.0855, + "step": 10 + }, + { + "epoch": 0.015521924718665115, + "grad_norm": 0.4697345793247223, + "learning_rate": 0.0002, + "loss": 2.4744, + "step": 20 + }, + { + "epoch": 0.023282887077997673, + "grad_norm": 0.5370839238166809, + "learning_rate": 0.0002, + "loss": 2.193, + "step": 30 + }, + { + "epoch": 0.03104384943733023, + "grad_norm": 0.46794816851615906, + "learning_rate": 0.0002, + "loss": 2.0599, + "step": 40 + }, + { + "epoch": 0.038804811796662786, + "grad_norm": 0.44624820351600647, + "learning_rate": 0.0002, + "loss": 1.9354, + "step": 50 + }, + { + "epoch": 0.046565774155995346, + "grad_norm": 0.3953201472759247, + "learning_rate": 0.0002, + "loss": 1.9319, + "step": 60 + }, + { + "epoch": 0.0543267365153279, + "grad_norm": 0.3935912549495697, + "learning_rate": 0.0002, + "loss": 1.9099, + "step": 70 + }, + { + "epoch": 0.06208769887466046, + "grad_norm": 0.4520699381828308, + "learning_rate": 0.0002, + "loss": 1.8795, + "step": 80 + }, + { + "epoch": 0.06984866123399301, + "grad_norm": 0.3801847994327545, + "learning_rate": 0.0002, + "loss": 1.8354, + "step": 90 + }, + { + "epoch": 0.07760962359332557, + "grad_norm": 0.4020165205001831, + "learning_rate": 0.0002, + "loss": 1.9053, + "step": 100 + }, + { + "epoch": 0.08537058595265813, + "grad_norm": 0.3860672116279602, + "learning_rate": 0.0002, + "loss": 1.8779, + "step": 110 + }, + { + "epoch": 0.09313154831199069, + "grad_norm": 0.3681113123893738, + "learning_rate": 0.0002, + "loss": 1.8731, + "step": 120 + }, + { + "epoch": 0.10089251067132324, + "grad_norm": 0.3594866991043091, + "learning_rate": 0.0002, + "loss": 1.8157, + "step": 130 + }, + { + "epoch": 0.1086534730306558, + "grad_norm": 0.3879193663597107, + "learning_rate": 0.0002, + "loss": 1.8266, + "step": 140 + }, + { + "epoch": 0.11641443538998836, + "grad_norm": 0.3270505666732788, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 150 + }, + { + "epoch": 0.12417539774932092, + "grad_norm": 0.36824458837509155, + "learning_rate": 0.0002, + "loss": 1.87, + "step": 160 + }, + { + "epoch": 0.13193636010865348, + "grad_norm": 0.383882075548172, + "learning_rate": 0.0002, + "loss": 1.8305, + "step": 170 + }, + { + "epoch": 0.13969732246798602, + "grad_norm": 0.3368665874004364, + "learning_rate": 0.0002, + "loss": 1.8584, + "step": 180 + }, + { + "epoch": 0.1474582848273186, + "grad_norm": 0.35961097478866577, + "learning_rate": 0.0002, + "loss": 1.7882, + "step": 190 + }, + { + "epoch": 0.15521924718665114, + "grad_norm": 0.3415963351726532, + "learning_rate": 0.0002, + "loss": 1.8467, + "step": 200 + }, + { + "epoch": 0.1629802095459837, + "grad_norm": 0.4100632071495056, + "learning_rate": 0.0002, + "loss": 1.8543, + "step": 210 + }, + { + "epoch": 0.17074117190531626, + "grad_norm": 0.3516307473182678, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 220 + }, + { + "epoch": 0.1785021342646488, + "grad_norm": 0.37919050455093384, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 230 + }, + { + "epoch": 0.18626309662398138, + "grad_norm": 0.33270683884620667, + "learning_rate": 0.0002, + "loss": 1.7937, + "step": 240 + }, + { + "epoch": 0.19402405898331393, + "grad_norm": 0.3348783254623413, + "learning_rate": 0.0002, + "loss": 1.7925, + "step": 250 + }, + { + "epoch": 0.20178502134264648, + "grad_norm": 0.3888475298881531, + "learning_rate": 0.0002, + "loss": 1.7774, + "step": 260 + }, + { + "epoch": 0.20954598370197905, + "grad_norm": 0.3554602861404419, + "learning_rate": 0.0002, + "loss": 1.8381, + "step": 270 + }, + { + "epoch": 0.2173069460613116, + "grad_norm": 0.33277708292007446, + "learning_rate": 0.0002, + "loss": 1.8359, + "step": 280 + }, + { + "epoch": 0.22506790842064417, + "grad_norm": 0.3281584680080414, + "learning_rate": 0.0002, + "loss": 1.7713, + "step": 290 + }, + { + "epoch": 0.23282887077997672, + "grad_norm": 0.3185969591140747, + "learning_rate": 0.0002, + "loss": 1.8181, + "step": 300 + }, + { + "epoch": 0.24058983313930926, + "grad_norm": 0.35335442423820496, + "learning_rate": 0.0002, + "loss": 1.8595, + "step": 310 + }, + { + "epoch": 0.24835079549864184, + "grad_norm": 0.3119595944881439, + "learning_rate": 0.0002, + "loss": 1.87, + "step": 320 + }, + { + "epoch": 0.2561117578579744, + "grad_norm": 0.36424458026885986, + "learning_rate": 0.0002, + "loss": 1.8357, + "step": 330 + }, + { + "epoch": 0.26387272021730696, + "grad_norm": 0.3618951141834259, + "learning_rate": 0.0002, + "loss": 1.8003, + "step": 340 + }, + { + "epoch": 0.2716336825766395, + "grad_norm": 0.312757670879364, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 350 + }, + { + "epoch": 0.27939464493597205, + "grad_norm": 0.326016366481781, + "learning_rate": 0.0002, + "loss": 1.9031, + "step": 360 + }, + { + "epoch": 0.2871556072953046, + "grad_norm": 0.34093883633613586, + "learning_rate": 0.0002, + "loss": 1.8214, + "step": 370 + }, + { + "epoch": 0.2949165696546372, + "grad_norm": 0.32325029373168945, + "learning_rate": 0.0002, + "loss": 1.7733, + "step": 380 + }, + { + "epoch": 0.30267753201396974, + "grad_norm": 0.34105437994003296, + "learning_rate": 0.0002, + "loss": 1.842, + "step": 390 + }, + { + "epoch": 0.3104384943733023, + "grad_norm": 0.32565295696258545, + "learning_rate": 0.0002, + "loss": 1.7926, + "step": 400 + }, + { + "epoch": 0.31819945673263483, + "grad_norm": 0.32742050290107727, + "learning_rate": 0.0002, + "loss": 1.8031, + "step": 410 + }, + { + "epoch": 0.3259604190919674, + "grad_norm": 0.30233046412467957, + "learning_rate": 0.0002, + "loss": 1.907, + "step": 420 + }, + { + "epoch": 0.3337213814513, + "grad_norm": 0.32419222593307495, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 430 + }, + { + "epoch": 0.3414823438106325, + "grad_norm": 0.3653007745742798, + "learning_rate": 0.0002, + "loss": 1.865, + "step": 440 + }, + { + "epoch": 0.3492433061699651, + "grad_norm": 0.31617099046707153, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 450 + }, + { + "epoch": 0.3570042685292976, + "grad_norm": 0.3305962085723877, + "learning_rate": 0.0002, + "loss": 1.7677, + "step": 460 + }, + { + "epoch": 0.36476523088863017, + "grad_norm": 0.3178933262825012, + "learning_rate": 0.0002, + "loss": 1.8155, + "step": 470 + }, + { + "epoch": 0.37252619324796277, + "grad_norm": 0.37163782119750977, + "learning_rate": 0.0002, + "loss": 1.7485, + "step": 480 + }, + { + "epoch": 0.3802871556072953, + "grad_norm": 0.469844788312912, + "learning_rate": 0.0002, + "loss": 1.8804, + "step": 490 + }, + { + "epoch": 0.38804811796662786, + "grad_norm": 0.3409338593482971, + "learning_rate": 0.0002, + "loss": 1.8343, + "step": 500 + }, + { + "epoch": 0.3958090803259604, + "grad_norm": 0.31943467259407043, + "learning_rate": 0.0002, + "loss": 1.8433, + "step": 510 + }, + { + "epoch": 0.40357004268529295, + "grad_norm": 0.32293614745140076, + "learning_rate": 0.0002, + "loss": 1.7873, + "step": 520 + }, + { + "epoch": 0.41133100504462555, + "grad_norm": 0.2994382977485657, + "learning_rate": 0.0002, + "loss": 1.8584, + "step": 530 + }, + { + "epoch": 0.4190919674039581, + "grad_norm": 0.3273141384124756, + "learning_rate": 0.0002, + "loss": 1.8153, + "step": 540 + }, + { + "epoch": 0.42685292976329064, + "grad_norm": 0.3020550012588501, + "learning_rate": 0.0002, + "loss": 1.8097, + "step": 550 + }, + { + "epoch": 0.4346138921226232, + "grad_norm": 0.30113112926483154, + "learning_rate": 0.0002, + "loss": 1.8551, + "step": 560 + }, + { + "epoch": 0.44237485448195574, + "grad_norm": 0.30274903774261475, + "learning_rate": 0.0002, + "loss": 1.8084, + "step": 570 + }, + { + "epoch": 0.45013581684128834, + "grad_norm": 0.3231128454208374, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 580 + }, + { + "epoch": 0.4578967792006209, + "grad_norm": 0.3255121409893036, + "learning_rate": 0.0002, + "loss": 1.7848, + "step": 590 + }, + { + "epoch": 0.46565774155995343, + "grad_norm": 0.30147507786750793, + "learning_rate": 0.0002, + "loss": 1.8227, + "step": 600 + }, + { + "epoch": 0.473418703919286, + "grad_norm": 0.29781386256217957, + "learning_rate": 0.0002, + "loss": 1.7572, + "step": 610 + }, + { + "epoch": 0.4811796662786185, + "grad_norm": 0.30914685130119324, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 620 + }, + { + "epoch": 0.4889406286379511, + "grad_norm": 0.3110593855381012, + "learning_rate": 0.0002, + "loss": 1.805, + "step": 630 + }, + { + "epoch": 0.49670159099728367, + "grad_norm": 0.3298132121562958, + "learning_rate": 0.0002, + "loss": 1.8228, + "step": 640 + }, + { + "epoch": 0.5044625533566163, + "grad_norm": 0.322122186422348, + "learning_rate": 0.0002, + "loss": 1.7816, + "step": 650 + }, + { + "epoch": 0.5122235157159488, + "grad_norm": 0.3504371643066406, + "learning_rate": 0.0002, + "loss": 1.8001, + "step": 660 + }, + { + "epoch": 0.5199844780752814, + "grad_norm": 0.3102182149887085, + "learning_rate": 0.0002, + "loss": 1.8682, + "step": 670 + }, + { + "epoch": 0.5277454404346139, + "grad_norm": 0.6113658547401428, + "learning_rate": 0.0002, + "loss": 1.7494, + "step": 680 + }, + { + "epoch": 0.5355064027939465, + "grad_norm": 0.31841862201690674, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 690 + }, + { + "epoch": 0.543267365153279, + "grad_norm": 0.2830526530742645, + "learning_rate": 0.0002, + "loss": 1.7587, + "step": 700 + }, + { + "epoch": 0.5510283275126115, + "grad_norm": 0.3048769533634186, + "learning_rate": 0.0002, + "loss": 1.7887, + "step": 710 + }, + { + "epoch": 0.5587892898719441, + "grad_norm": 0.2719033658504486, + "learning_rate": 0.0002, + "loss": 1.8416, + "step": 720 + }, + { + "epoch": 0.5665502522312766, + "grad_norm": 0.3176722526550293, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 730 + }, + { + "epoch": 0.5743112145906092, + "grad_norm": 0.32491734623908997, + "learning_rate": 0.0002, + "loss": 1.7127, + "step": 740 + }, + { + "epoch": 0.5820721769499418, + "grad_norm": 0.32746851444244385, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 750 + }, + { + "epoch": 0.5898331393092744, + "grad_norm": 0.3055773973464966, + "learning_rate": 0.0002, + "loss": 1.7811, + "step": 760 + }, + { + "epoch": 0.5975941016686069, + "grad_norm": 0.30671584606170654, + "learning_rate": 0.0002, + "loss": 1.8597, + "step": 770 + }, + { + "epoch": 0.6053550640279395, + "grad_norm": 0.28770264983177185, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 780 + }, + { + "epoch": 0.613116026387272, + "grad_norm": 0.2814285457134247, + "learning_rate": 0.0002, + "loss": 1.7025, + "step": 790 + }, + { + "epoch": 0.6208769887466046, + "grad_norm": 0.31554412841796875, + "learning_rate": 0.0002, + "loss": 1.819, + "step": 800 + }, + { + "epoch": 0.6286379511059371, + "grad_norm": 0.2984226942062378, + "learning_rate": 0.0002, + "loss": 1.8335, + "step": 810 + }, + { + "epoch": 0.6363989134652697, + "grad_norm": 0.2859906554222107, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 820 + }, + { + "epoch": 0.6441598758246022, + "grad_norm": 0.2887928783893585, + "learning_rate": 0.0002, + "loss": 1.7408, + "step": 830 + }, + { + "epoch": 0.6519208381839348, + "grad_norm": 0.31287339329719543, + "learning_rate": 0.0002, + "loss": 1.8071, + "step": 840 + }, + { + "epoch": 0.6596818005432674, + "grad_norm": 0.32064181566238403, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 850 + }, + { + "epoch": 0.6674427629026, + "grad_norm": 0.290981650352478, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 860 + }, + { + "epoch": 0.6752037252619325, + "grad_norm": 0.33060121536254883, + "learning_rate": 0.0002, + "loss": 1.8089, + "step": 870 + }, + { + "epoch": 0.682964687621265, + "grad_norm": 0.27032899856567383, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 880 + }, + { + "epoch": 0.6907256499805976, + "grad_norm": 0.29031234979629517, + "learning_rate": 0.0002, + "loss": 1.8423, + "step": 890 + }, + { + "epoch": 0.6984866123399301, + "grad_norm": 0.2845142185688019, + "learning_rate": 0.0002, + "loss": 1.835, + "step": 900 + }, + { + "epoch": 0.7062475746992627, + "grad_norm": 0.8638312816619873, + "learning_rate": 0.0002, + "loss": 1.77, + "step": 910 + }, + { + "epoch": 0.7140085370585952, + "grad_norm": 0.3086668848991394, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 920 + }, + { + "epoch": 0.7217694994179278, + "grad_norm": 0.2724177837371826, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 930 + }, + { + "epoch": 0.7295304617772603, + "grad_norm": 0.289559006690979, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 940 + }, + { + "epoch": 0.737291424136593, + "grad_norm": 0.3000658452510834, + "learning_rate": 0.0002, + "loss": 1.7654, + "step": 950 + }, + { + "epoch": 0.7450523864959255, + "grad_norm": 0.33544042706489563, + "learning_rate": 0.0002, + "loss": 1.7736, + "step": 960 + }, + { + "epoch": 0.7528133488552581, + "grad_norm": 0.28593236207962036, + "learning_rate": 0.0002, + "loss": 1.6979, + "step": 970 + }, + { + "epoch": 0.7605743112145906, + "grad_norm": 0.313634991645813, + "learning_rate": 0.0002, + "loss": 1.8583, + "step": 980 + }, + { + "epoch": 0.7683352735739232, + "grad_norm": 0.2949385941028595, + "learning_rate": 0.0002, + "loss": 1.7473, + "step": 990 + }, + { + "epoch": 0.7760962359332557, + "grad_norm": 0.2920108437538147, + "learning_rate": 0.0002, + "loss": 1.8689, + "step": 1000 + }, + { + "epoch": 0.7838571982925883, + "grad_norm": 0.3245100677013397, + "learning_rate": 0.0002, + "loss": 1.8401, + "step": 1010 + }, + { + "epoch": 0.7916181606519208, + "grad_norm": 0.3007619380950928, + "learning_rate": 0.0002, + "loss": 1.7109, + "step": 1020 + }, + { + "epoch": 0.7993791230112534, + "grad_norm": 0.3630852997303009, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 1030 + }, + { + "epoch": 0.8071400853705859, + "grad_norm": 0.2856379747390747, + "learning_rate": 0.0002, + "loss": 1.7655, + "step": 1040 + }, + { + "epoch": 0.8149010477299186, + "grad_norm": 0.32476478815078735, + "learning_rate": 0.0002, + "loss": 1.8371, + "step": 1050 + }, + { + "epoch": 0.8226620100892511, + "grad_norm": 0.5162565112113953, + "learning_rate": 0.0002, + "loss": 1.8039, + "step": 1060 + }, + { + "epoch": 0.8304229724485837, + "grad_norm": 0.316496342420578, + "learning_rate": 0.0002, + "loss": 1.8862, + "step": 1070 + }, + { + "epoch": 0.8381839348079162, + "grad_norm": 0.31977516412734985, + "learning_rate": 0.0002, + "loss": 1.8023, + "step": 1080 + }, + { + "epoch": 0.8459448971672487, + "grad_norm": 0.269509494304657, + "learning_rate": 0.0002, + "loss": 1.8547, + "step": 1090 + }, + { + "epoch": 0.8537058595265813, + "grad_norm": 0.31621453166007996, + "learning_rate": 0.0002, + "loss": 1.7811, + "step": 1100 + }, + { + "epoch": 0.8614668218859138, + "grad_norm": 0.2946535050868988, + "learning_rate": 0.0002, + "loss": 1.739, + "step": 1110 + }, + { + "epoch": 0.8692277842452464, + "grad_norm": 0.3088909983634949, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 1120 + }, + { + "epoch": 0.8769887466045789, + "grad_norm": 0.33033716678619385, + "learning_rate": 0.0002, + "loss": 1.8228, + "step": 1130 + }, + { + "epoch": 0.8847497089639115, + "grad_norm": 0.2954833507537842, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 1140 + }, + { + "epoch": 0.8925106713232441, + "grad_norm": 0.2950248122215271, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 1150 + }, + { + "epoch": 0.9002716336825767, + "grad_norm": 0.296661913394928, + "learning_rate": 0.0002, + "loss": 1.7068, + "step": 1160 + }, + { + "epoch": 0.9080325960419092, + "grad_norm": 0.35451310873031616, + "learning_rate": 0.0002, + "loss": 1.7967, + "step": 1170 + }, + { + "epoch": 0.9157935584012418, + "grad_norm": 0.32705947756767273, + "learning_rate": 0.0002, + "loss": 1.8202, + "step": 1180 + }, + { + "epoch": 0.9235545207605743, + "grad_norm": 0.3333960771560669, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 1190 + }, + { + "epoch": 0.9313154831199069, + "grad_norm": 0.3042232096195221, + "learning_rate": 0.0002, + "loss": 1.7801, + "step": 1200 + }, + { + "epoch": 0.9390764454792394, + "grad_norm": 0.281553715467453, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 1210 + }, + { + "epoch": 0.946837407838572, + "grad_norm": 0.3096391558647156, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 1220 + }, + { + "epoch": 0.9545983701979045, + "grad_norm": 0.2866271734237671, + "learning_rate": 0.0002, + "loss": 1.7401, + "step": 1230 + }, + { + "epoch": 0.962359332557237, + "grad_norm": 0.28394097089767456, + "learning_rate": 0.0002, + "loss": 1.7211, + "step": 1240 + }, + { + "epoch": 0.9701202949165697, + "grad_norm": 0.3249266743659973, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 1250 + }, + { + "epoch": 0.9778812572759022, + "grad_norm": 0.2896869480609894, + "learning_rate": 0.0002, + "loss": 1.7563, + "step": 1260 + }, + { + "epoch": 0.9856422196352348, + "grad_norm": 0.29224586486816406, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 1270 + }, + { + "epoch": 0.9934031819945673, + "grad_norm": 0.2820223569869995, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 1280 + }, + { + "epoch": 0.9996119518820333, + "eval_loss": 1.8081045150756836, + "eval_runtime": 102.3056, + "eval_samples_per_second": 4.956, + "eval_steps_per_second": 0.626, + "step": 1288 + }, + { + "epoch": 1.0011641443538999, + "grad_norm": 0.3282551169395447, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 1290 + }, + { + "epoch": 1.0089251067132325, + "grad_norm": 0.30217495560646057, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 1300 + }, + { + "epoch": 1.016686069072565, + "grad_norm": 0.30801767110824585, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 1310 + }, + { + "epoch": 1.0244470314318976, + "grad_norm": 0.31816792488098145, + "learning_rate": 0.0002, + "loss": 1.7756, + "step": 1320 + }, + { + "epoch": 1.03220799379123, + "grad_norm": 0.27794334292411804, + "learning_rate": 0.0002, + "loss": 1.6986, + "step": 1330 + }, + { + "epoch": 1.0399689561505627, + "grad_norm": 0.3018926680088043, + "learning_rate": 0.0002, + "loss": 1.6931, + "step": 1340 + }, + { + "epoch": 1.0477299185098952, + "grad_norm": 0.3552975356578827, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1350 + }, + { + "epoch": 1.0554908808692278, + "grad_norm": 0.32590144872665405, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 1360 + }, + { + "epoch": 1.0632518432285603, + "grad_norm": 0.3435460925102234, + "learning_rate": 0.0002, + "loss": 1.6479, + "step": 1370 + }, + { + "epoch": 1.071012805587893, + "grad_norm": 0.35037797689437866, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 1380 + }, + { + "epoch": 1.0787737679472253, + "grad_norm": 0.31398263573646545, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 1390 + }, + { + "epoch": 1.086534730306558, + "grad_norm": 0.3134010434150696, + "learning_rate": 0.0002, + "loss": 1.6729, + "step": 1400 + }, + { + "epoch": 1.0942956926658907, + "grad_norm": 0.4599704444408417, + "learning_rate": 0.0002, + "loss": 1.751, + "step": 1410 + }, + { + "epoch": 1.102056655025223, + "grad_norm": 0.35852891206741333, + "learning_rate": 0.0002, + "loss": 1.6871, + "step": 1420 + }, + { + "epoch": 1.1098176173845558, + "grad_norm": 0.35628634691238403, + "learning_rate": 0.0002, + "loss": 1.7083, + "step": 1430 + }, + { + "epoch": 1.1175785797438882, + "grad_norm": 0.3769161105155945, + "learning_rate": 0.0002, + "loss": 1.6166, + "step": 1440 + }, + { + "epoch": 1.1253395421032208, + "grad_norm": 1.3712416887283325, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1450 + }, + { + "epoch": 1.1331005044625533, + "grad_norm": 0.38406670093536377, + "learning_rate": 0.0002, + "loss": 1.6542, + "step": 1460 + }, + { + "epoch": 1.140861466821886, + "grad_norm": 0.3402116000652313, + "learning_rate": 0.0002, + "loss": 1.7104, + "step": 1470 + }, + { + "epoch": 1.1486224291812184, + "grad_norm": 0.341189444065094, + "learning_rate": 0.0002, + "loss": 1.7074, + "step": 1480 + }, + { + "epoch": 1.156383391540551, + "grad_norm": 0.36629995703697205, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 1490 + }, + { + "epoch": 1.1641443538998835, + "grad_norm": 0.3499569296836853, + "learning_rate": 0.0002, + "loss": 1.6952, + "step": 1500 + }, + { + "epoch": 1.1719053162592161, + "grad_norm": 0.3663063943386078, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 1510 + }, + { + "epoch": 1.1796662786185488, + "grad_norm": 0.34851500391960144, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 1520 + }, + { + "epoch": 1.1874272409778812, + "grad_norm": 0.35071656107902527, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 1530 + }, + { + "epoch": 1.1951882033372139, + "grad_norm": 0.42783796787261963, + "learning_rate": 0.0002, + "loss": 1.7206, + "step": 1540 + }, + { + "epoch": 1.2029491656965463, + "grad_norm": 0.31830692291259766, + "learning_rate": 0.0002, + "loss": 1.7499, + "step": 1550 + }, + { + "epoch": 1.210710128055879, + "grad_norm": 0.3597424626350403, + "learning_rate": 0.0002, + "loss": 1.7372, + "step": 1560 + }, + { + "epoch": 1.2184710904152114, + "grad_norm": 0.35233765840530396, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 1570 + }, + { + "epoch": 1.226232052774544, + "grad_norm": 0.35942912101745605, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 1580 + }, + { + "epoch": 1.2339930151338767, + "grad_norm": 0.36159393191337585, + "learning_rate": 0.0002, + "loss": 1.6598, + "step": 1590 + }, + { + "epoch": 1.2417539774932091, + "grad_norm": 0.3328469693660736, + "learning_rate": 0.0002, + "loss": 1.6697, + "step": 1600 + }, + { + "epoch": 1.2495149398525418, + "grad_norm": 0.3089476525783539, + "learning_rate": 0.0002, + "loss": 1.7594, + "step": 1610 + }, + { + "epoch": 1.2572759022118742, + "grad_norm": 0.30947765707969666, + "learning_rate": 0.0002, + "loss": 1.6805, + "step": 1620 + }, + { + "epoch": 1.265036864571207, + "grad_norm": 0.32154011726379395, + "learning_rate": 0.0002, + "loss": 1.6899, + "step": 1630 + }, + { + "epoch": 1.2727978269305393, + "grad_norm": 0.3480297923088074, + "learning_rate": 0.0002, + "loss": 1.6621, + "step": 1640 + }, + { + "epoch": 1.280558789289872, + "grad_norm": 0.39471694827079773, + "learning_rate": 0.0002, + "loss": 1.7087, + "step": 1650 + }, + { + "epoch": 1.2883197516492044, + "grad_norm": 0.35728853940963745, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 1660 + }, + { + "epoch": 1.296080714008537, + "grad_norm": 0.35223081707954407, + "learning_rate": 0.0002, + "loss": 1.7008, + "step": 1670 + }, + { + "epoch": 1.3038416763678695, + "grad_norm": 0.3588867485523224, + "learning_rate": 0.0002, + "loss": 1.7253, + "step": 1680 + }, + { + "epoch": 1.3116026387272022, + "grad_norm": 0.3528042733669281, + "learning_rate": 0.0002, + "loss": 1.6505, + "step": 1690 + }, + { + "epoch": 1.3193636010865348, + "grad_norm": 0.35975801944732666, + "learning_rate": 0.0002, + "loss": 1.6945, + "step": 1700 + }, + { + "epoch": 1.3271245634458673, + "grad_norm": 0.36691880226135254, + "learning_rate": 0.0002, + "loss": 1.6631, + "step": 1710 + }, + { + "epoch": 1.3348855258052, + "grad_norm": 0.3787977695465088, + "learning_rate": 0.0002, + "loss": 1.7593, + "step": 1720 + }, + { + "epoch": 1.3426464881645324, + "grad_norm": 0.36614933609962463, + "learning_rate": 0.0002, + "loss": 1.7697, + "step": 1730 + }, + { + "epoch": 1.350407450523865, + "grad_norm": 0.3484745919704437, + "learning_rate": 0.0002, + "loss": 1.6487, + "step": 1740 + }, + { + "epoch": 1.3581684128831975, + "grad_norm": 0.36905673146247864, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 1750 + }, + { + "epoch": 1.36592937524253, + "grad_norm": 0.41564738750457764, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 1760 + }, + { + "epoch": 1.3736903376018628, + "grad_norm": 0.3345205783843994, + "learning_rate": 0.0002, + "loss": 1.6634, + "step": 1770 + }, + { + "epoch": 1.3814512999611952, + "grad_norm": 0.34926071763038635, + "learning_rate": 0.0002, + "loss": 1.7275, + "step": 1780 + }, + { + "epoch": 1.3892122623205276, + "grad_norm": 0.42004233598709106, + "learning_rate": 0.0002, + "loss": 1.685, + "step": 1790 + }, + { + "epoch": 1.3969732246798603, + "grad_norm": 0.3576236963272095, + "learning_rate": 0.0002, + "loss": 1.666, + "step": 1800 + }, + { + "epoch": 1.404734187039193, + "grad_norm": 0.3586704432964325, + "learning_rate": 0.0002, + "loss": 1.8516, + "step": 1810 + }, + { + "epoch": 1.4124951493985254, + "grad_norm": 0.3943439722061157, + "learning_rate": 0.0002, + "loss": 1.6171, + "step": 1820 + }, + { + "epoch": 1.420256111757858, + "grad_norm": 0.3484877049922943, + "learning_rate": 0.0002, + "loss": 1.6865, + "step": 1830 + }, + { + "epoch": 1.4280170741171905, + "grad_norm": 0.3344518840312958, + "learning_rate": 0.0002, + "loss": 1.7205, + "step": 1840 + }, + { + "epoch": 1.4357780364765231, + "grad_norm": 0.4345698356628418, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1850 + }, + { + "epoch": 1.4435389988358556, + "grad_norm": 0.5525162220001221, + "learning_rate": 0.0002, + "loss": 1.6855, + "step": 1860 + }, + { + "epoch": 1.4512999611951882, + "grad_norm": 0.37194496393203735, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1870 + }, + { + "epoch": 1.4590609235545209, + "grad_norm": 0.34570157527923584, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 1880 + }, + { + "epoch": 1.4668218859138533, + "grad_norm": 0.3512282073497772, + "learning_rate": 0.0002, + "loss": 1.7, + "step": 1890 + }, + { + "epoch": 1.4745828482731858, + "grad_norm": 0.3443922996520996, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1900 + }, + { + "epoch": 1.4823438106325184, + "grad_norm": 0.3812018036842346, + "learning_rate": 0.0002, + "loss": 1.7393, + "step": 1910 + }, + { + "epoch": 1.490104772991851, + "grad_norm": 0.39263492822647095, + "learning_rate": 0.0002, + "loss": 1.7277, + "step": 1920 + }, + { + "epoch": 1.4978657353511835, + "grad_norm": 0.3146156072616577, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 1930 + }, + { + "epoch": 1.505626697710516, + "grad_norm": 0.3653988540172577, + "learning_rate": 0.0002, + "loss": 1.6881, + "step": 1940 + }, + { + "epoch": 1.5133876600698488, + "grad_norm": 0.3966596722602844, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 1950 + }, + { + "epoch": 1.5211486224291813, + "grad_norm": 0.3441697359085083, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 1960 + }, + { + "epoch": 1.5289095847885137, + "grad_norm": 0.3328564465045929, + "learning_rate": 0.0002, + "loss": 1.7175, + "step": 1970 + }, + { + "epoch": 1.5366705471478463, + "grad_norm": 0.34068772196769714, + "learning_rate": 0.0002, + "loss": 1.7394, + "step": 1980 + }, + { + "epoch": 1.544431509507179, + "grad_norm": 0.3559795916080475, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 1990 + }, + { + "epoch": 1.5521924718665114, + "grad_norm": 0.37888768315315247, + "learning_rate": 0.0002, + "loss": 1.7102, + "step": 2000 + }, + { + "epoch": 1.5599534342258439, + "grad_norm": 0.36128363013267517, + "learning_rate": 0.0002, + "loss": 1.7094, + "step": 2010 + }, + { + "epoch": 1.5677143965851765, + "grad_norm": 0.3643714487552643, + "learning_rate": 0.0002, + "loss": 1.6407, + "step": 2020 + }, + { + "epoch": 1.5754753589445092, + "grad_norm": 0.3863612115383148, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 2030 + }, + { + "epoch": 1.5832363213038416, + "grad_norm": 0.32831457257270813, + "learning_rate": 0.0002, + "loss": 1.6575, + "step": 2040 + }, + { + "epoch": 1.5909972836631743, + "grad_norm": 0.36098113656044006, + "learning_rate": 0.0002, + "loss": 1.7404, + "step": 2050 + }, + { + "epoch": 1.598758246022507, + "grad_norm": 1.1079334020614624, + "learning_rate": 0.0002, + "loss": 1.7065, + "step": 2060 + }, + { + "epoch": 1.6065192083818394, + "grad_norm": 0.35615381598472595, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 2070 + }, + { + "epoch": 1.6142801707411718, + "grad_norm": 0.369711309671402, + "learning_rate": 0.0002, + "loss": 1.7262, + "step": 2080 + }, + { + "epoch": 1.6220411331005045, + "grad_norm": 0.390658438205719, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 2090 + }, + { + "epoch": 1.6298020954598371, + "grad_norm": 0.3422999382019043, + "learning_rate": 0.0002, + "loss": 1.6996, + "step": 2100 + }, + { + "epoch": 1.6375630578191696, + "grad_norm": 0.372475266456604, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 2110 + }, + { + "epoch": 1.645324020178502, + "grad_norm": 0.35660576820373535, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 2120 + }, + { + "epoch": 1.6530849825378346, + "grad_norm": 0.35754942893981934, + "learning_rate": 0.0002, + "loss": 1.6991, + "step": 2130 + }, + { + "epoch": 1.6608459448971673, + "grad_norm": 0.34572410583496094, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 2140 + }, + { + "epoch": 1.6686069072564997, + "grad_norm": 0.42059701681137085, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 2150 + }, + { + "epoch": 1.6763678696158324, + "grad_norm": 0.35200759768486023, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 2160 + }, + { + "epoch": 1.684128831975165, + "grad_norm": 0.3704029321670532, + "learning_rate": 0.0002, + "loss": 1.6869, + "step": 2170 + }, + { + "epoch": 1.6918897943344975, + "grad_norm": 0.40450501441955566, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 2180 + }, + { + "epoch": 1.69965075669383, + "grad_norm": 0.362966924905777, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 2190 + }, + { + "epoch": 1.7074117190531626, + "grad_norm": 0.36586204171180725, + "learning_rate": 0.0002, + "loss": 1.6935, + "step": 2200 + }, + { + "epoch": 1.7151726814124952, + "grad_norm": 0.3295372426509857, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 2210 + }, + { + "epoch": 1.7229336437718277, + "grad_norm": 0.3892575800418854, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 2220 + }, + { + "epoch": 1.73069460613116, + "grad_norm": 0.34712135791778564, + "learning_rate": 0.0002, + "loss": 1.7805, + "step": 2230 + }, + { + "epoch": 1.738455568490493, + "grad_norm": 0.34801796078681946, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 2240 + }, + { + "epoch": 1.7462165308498254, + "grad_norm": 0.3822397291660309, + "learning_rate": 0.0002, + "loss": 1.7009, + "step": 2250 + }, + { + "epoch": 1.7539774932091579, + "grad_norm": 0.38933250308036804, + "learning_rate": 0.0002, + "loss": 1.6546, + "step": 2260 + }, + { + "epoch": 1.7617384555684905, + "grad_norm": 0.3798373341560364, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 2270 + }, + { + "epoch": 1.7694994179278232, + "grad_norm": 0.35151317715644836, + "learning_rate": 0.0002, + "loss": 1.6508, + "step": 2280 + }, + { + "epoch": 1.7772603802871556, + "grad_norm": 0.44981494545936584, + "learning_rate": 0.0002, + "loss": 1.6894, + "step": 2290 + }, + { + "epoch": 1.785021342646488, + "grad_norm": 0.3992624580860138, + "learning_rate": 0.0002, + "loss": 1.7271, + "step": 2300 + }, + { + "epoch": 1.7927823050058207, + "grad_norm": 0.3772512376308441, + "learning_rate": 0.0002, + "loss": 1.7252, + "step": 2310 + }, + { + "epoch": 1.8005432673651534, + "grad_norm": 0.3511589467525482, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 2320 + }, + { + "epoch": 1.8083042297244858, + "grad_norm": 0.3805285394191742, + "learning_rate": 0.0002, + "loss": 1.764, + "step": 2330 + }, + { + "epoch": 1.8160651920838184, + "grad_norm": 0.3792071044445038, + "learning_rate": 0.0002, + "loss": 1.6986, + "step": 2340 + }, + { + "epoch": 1.823826154443151, + "grad_norm": 0.36430829763412476, + "learning_rate": 0.0002, + "loss": 1.7759, + "step": 2350 + }, + { + "epoch": 1.8315871168024835, + "grad_norm": 0.36502477526664734, + "learning_rate": 0.0002, + "loss": 1.6773, + "step": 2360 + }, + { + "epoch": 1.839348079161816, + "grad_norm": 0.35015153884887695, + "learning_rate": 0.0002, + "loss": 1.8072, + "step": 2370 + }, + { + "epoch": 1.8471090415211486, + "grad_norm": 0.3710903823375702, + "learning_rate": 0.0002, + "loss": 1.7734, + "step": 2380 + }, + { + "epoch": 1.8548700038804813, + "grad_norm": 0.3542828857898712, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 2390 + }, + { + "epoch": 1.8626309662398137, + "grad_norm": 0.35467568039894104, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 2400 + }, + { + "epoch": 1.8703919285991462, + "grad_norm": 0.3638560473918915, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 2410 + }, + { + "epoch": 1.8781528909584788, + "grad_norm": 0.3823298215866089, + "learning_rate": 0.0002, + "loss": 1.7019, + "step": 2420 + }, + { + "epoch": 1.8859138533178115, + "grad_norm": 0.3926416337490082, + "learning_rate": 0.0002, + "loss": 1.6935, + "step": 2430 + }, + { + "epoch": 1.893674815677144, + "grad_norm": 0.3608079254627228, + "learning_rate": 0.0002, + "loss": 1.71, + "step": 2440 + }, + { + "epoch": 1.9014357780364766, + "grad_norm": 0.3426613509654999, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 2450 + }, + { + "epoch": 1.9091967403958092, + "grad_norm": 0.3522338569164276, + "learning_rate": 0.0002, + "loss": 1.6892, + "step": 2460 + }, + { + "epoch": 1.9169577027551417, + "grad_norm": 0.3608049154281616, + "learning_rate": 0.0002, + "loss": 1.7307, + "step": 2470 + }, + { + "epoch": 1.924718665114474, + "grad_norm": 0.3849755525588989, + "learning_rate": 0.0002, + "loss": 1.6823, + "step": 2480 + }, + { + "epoch": 1.9324796274738067, + "grad_norm": 0.4154011011123657, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 2490 + }, + { + "epoch": 1.9402405898331394, + "grad_norm": 0.3602796792984009, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 2500 + }, + { + "epoch": 1.9480015521924718, + "grad_norm": 0.3702992796897888, + "learning_rate": 0.0002, + "loss": 1.7843, + "step": 2510 + }, + { + "epoch": 1.9557625145518043, + "grad_norm": 0.3657735288143158, + "learning_rate": 0.0002, + "loss": 1.6669, + "step": 2520 + }, + { + "epoch": 1.963523476911137, + "grad_norm": 0.41031739115715027, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 2530 + }, + { + "epoch": 1.9712844392704696, + "grad_norm": 0.34578680992126465, + "learning_rate": 0.0002, + "loss": 1.6745, + "step": 2540 + }, + { + "epoch": 1.979045401629802, + "grad_norm": 0.3361521065235138, + "learning_rate": 0.0002, + "loss": 1.723, + "step": 2550 + }, + { + "epoch": 1.9868063639891347, + "grad_norm": 0.34342363476753235, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 2560 + }, + { + "epoch": 1.9945673263484673, + "grad_norm": 0.32954007387161255, + "learning_rate": 0.0002, + "loss": 1.6577, + "step": 2570 + }, + { + "epoch": 2.0, + "eval_loss": 1.8068748712539673, + "eval_runtime": 105.5885, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 0.606, + "step": 2577 + }, + { + "epoch": 2.0023282887077998, + "grad_norm": 0.336302250623703, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 2580 + }, + { + "epoch": 2.010089251067132, + "grad_norm": 0.3627048432826996, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 2590 + }, + { + "epoch": 2.017850213426465, + "grad_norm": 0.38406702876091003, + "learning_rate": 0.0002, + "loss": 1.4908, + "step": 2600 + }, + { + "epoch": 2.0256111757857975, + "grad_norm": 0.5326781272888184, + "learning_rate": 0.0002, + "loss": 1.5368, + "step": 2610 + }, + { + "epoch": 2.03337213814513, + "grad_norm": 0.4774554967880249, + "learning_rate": 0.0002, + "loss": 1.5727, + "step": 2620 + }, + { + "epoch": 2.0411331005044624, + "grad_norm": 0.4251810312271118, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 2630 + }, + { + "epoch": 2.0488940628637953, + "grad_norm": 0.4693007171154022, + "learning_rate": 0.0002, + "loss": 1.5152, + "step": 2640 + }, + { + "epoch": 2.0566550252231277, + "grad_norm": 0.46371519565582275, + "learning_rate": 0.0002, + "loss": 1.6137, + "step": 2650 + }, + { + "epoch": 2.06441598758246, + "grad_norm": 0.46652570366859436, + "learning_rate": 0.0002, + "loss": 1.6304, + "step": 2660 + }, + { + "epoch": 2.0721769499417926, + "grad_norm": 0.45200315117836, + "learning_rate": 0.0002, + "loss": 1.6022, + "step": 2670 + }, + { + "epoch": 2.0799379123011255, + "grad_norm": 0.42905205488204956, + "learning_rate": 0.0002, + "loss": 1.5358, + "step": 2680 + }, + { + "epoch": 2.087698874660458, + "grad_norm": 0.44509148597717285, + "learning_rate": 0.0002, + "loss": 1.5401, + "step": 2690 + }, + { + "epoch": 2.0954598370197903, + "grad_norm": 0.4445319175720215, + "learning_rate": 0.0002, + "loss": 1.5303, + "step": 2700 + }, + { + "epoch": 2.103220799379123, + "grad_norm": 0.46825504302978516, + "learning_rate": 0.0002, + "loss": 1.5701, + "step": 2710 + }, + { + "epoch": 2.1109817617384556, + "grad_norm": 0.4623856842517853, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 2720 + }, + { + "epoch": 2.118742724097788, + "grad_norm": 0.4833452105522156, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 2730 + }, + { + "epoch": 2.1265036864571205, + "grad_norm": 0.4582686722278595, + "learning_rate": 0.0002, + "loss": 1.5997, + "step": 2740 + }, + { + "epoch": 2.1342646488164534, + "grad_norm": 0.47587934136390686, + "learning_rate": 0.0002, + "loss": 1.5801, + "step": 2750 + }, + { + "epoch": 2.142025611175786, + "grad_norm": 0.4602217972278595, + "learning_rate": 0.0002, + "loss": 1.594, + "step": 2760 + }, + { + "epoch": 2.1497865735351183, + "grad_norm": 0.47501352429389954, + "learning_rate": 0.0002, + "loss": 1.5271, + "step": 2770 + }, + { + "epoch": 2.1575475358944507, + "grad_norm": 0.5078499913215637, + "learning_rate": 0.0002, + "loss": 1.4862, + "step": 2780 + }, + { + "epoch": 2.1653084982537836, + "grad_norm": 0.497704416513443, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 2790 + }, + { + "epoch": 2.173069460613116, + "grad_norm": 0.5435971617698669, + "learning_rate": 0.0002, + "loss": 1.5597, + "step": 2800 + }, + { + "epoch": 2.1808304229724484, + "grad_norm": 0.5172356367111206, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 2810 + }, + { + "epoch": 2.1885913853317813, + "grad_norm": 0.44063422083854675, + "learning_rate": 0.0002, + "loss": 1.5202, + "step": 2820 + }, + { + "epoch": 2.1963523476911138, + "grad_norm": 0.5079569220542908, + "learning_rate": 0.0002, + "loss": 1.6041, + "step": 2830 + }, + { + "epoch": 2.204113310050446, + "grad_norm": 0.45658132433891296, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 2840 + }, + { + "epoch": 2.2118742724097786, + "grad_norm": 0.5103023648262024, + "learning_rate": 0.0002, + "loss": 1.5546, + "step": 2850 + }, + { + "epoch": 2.2196352347691115, + "grad_norm": 0.4882226288318634, + "learning_rate": 0.0002, + "loss": 1.6197, + "step": 2860 + }, + { + "epoch": 2.227396197128444, + "grad_norm": 0.5087296962738037, + "learning_rate": 0.0002, + "loss": 1.5996, + "step": 2870 + }, + { + "epoch": 2.2351571594877764, + "grad_norm": 0.45293712615966797, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 2880 + }, + { + "epoch": 2.242918121847109, + "grad_norm": 0.5120379328727722, + "learning_rate": 0.0002, + "loss": 1.6214, + "step": 2890 + }, + { + "epoch": 2.2506790842064417, + "grad_norm": 0.47126415371894836, + "learning_rate": 0.0002, + "loss": 1.5273, + "step": 2900 + }, + { + "epoch": 2.258440046565774, + "grad_norm": 0.44005846977233887, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 2910 + }, + { + "epoch": 2.2662010089251066, + "grad_norm": 0.46476176381111145, + "learning_rate": 0.0002, + "loss": 1.6023, + "step": 2920 + }, + { + "epoch": 2.2739619712844394, + "grad_norm": 0.48051515221595764, + "learning_rate": 0.0002, + "loss": 1.6417, + "step": 2930 + }, + { + "epoch": 2.281722933643772, + "grad_norm": 0.480069637298584, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 2940 + }, + { + "epoch": 2.2894838960031043, + "grad_norm": 0.5122102499008179, + "learning_rate": 0.0002, + "loss": 1.5747, + "step": 2950 + }, + { + "epoch": 2.2972448583624367, + "grad_norm": 0.48879891633987427, + "learning_rate": 0.0002, + "loss": 1.5183, + "step": 2960 + }, + { + "epoch": 2.3050058207217696, + "grad_norm": 0.4973136782646179, + "learning_rate": 0.0002, + "loss": 1.5483, + "step": 2970 + }, + { + "epoch": 2.312766783081102, + "grad_norm": 0.5522695183753967, + "learning_rate": 0.0002, + "loss": 1.677, + "step": 2980 + }, + { + "epoch": 2.3205277454404345, + "grad_norm": 0.5220217704772949, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 2990 + }, + { + "epoch": 2.328288707799767, + "grad_norm": 0.4978662431240082, + "learning_rate": 0.0002, + "loss": 1.6299, + "step": 3000 + }, + { + "epoch": 2.3360496701591, + "grad_norm": 0.554053544998169, + "learning_rate": 0.0002, + "loss": 1.5498, + "step": 3010 + }, + { + "epoch": 2.3438106325184322, + "grad_norm": 0.4703886806964874, + "learning_rate": 0.0002, + "loss": 1.5356, + "step": 3020 + }, + { + "epoch": 2.3515715948777647, + "grad_norm": 0.5074123740196228, + "learning_rate": 0.0002, + "loss": 1.5418, + "step": 3030 + }, + { + "epoch": 2.3593325572370976, + "grad_norm": 0.5088278651237488, + "learning_rate": 0.0002, + "loss": 1.6873, + "step": 3040 + }, + { + "epoch": 2.36709351959643, + "grad_norm": 0.4752114415168762, + "learning_rate": 0.0002, + "loss": 1.5249, + "step": 3050 + }, + { + "epoch": 2.3748544819557624, + "grad_norm": 0.5121659636497498, + "learning_rate": 0.0002, + "loss": 1.5353, + "step": 3060 + }, + { + "epoch": 2.3826154443150953, + "grad_norm": 0.48649218678474426, + "learning_rate": 0.0002, + "loss": 1.6426, + "step": 3070 + }, + { + "epoch": 2.3903764066744277, + "grad_norm": 0.5209488868713379, + "learning_rate": 0.0002, + "loss": 1.6136, + "step": 3080 + }, + { + "epoch": 2.39813736903376, + "grad_norm": 0.5110517740249634, + "learning_rate": 0.0002, + "loss": 1.597, + "step": 3090 + }, + { + "epoch": 2.4058983313930926, + "grad_norm": 0.5609337091445923, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 3100 + }, + { + "epoch": 2.4136592937524255, + "grad_norm": 0.5191826224327087, + "learning_rate": 0.0002, + "loss": 1.5438, + "step": 3110 + }, + { + "epoch": 2.421420256111758, + "grad_norm": 0.4876069724559784, + "learning_rate": 0.0002, + "loss": 1.6347, + "step": 3120 + }, + { + "epoch": 2.4291812184710904, + "grad_norm": 0.4713933765888214, + "learning_rate": 0.0002, + "loss": 1.5565, + "step": 3130 + }, + { + "epoch": 2.436942180830423, + "grad_norm": 0.5102227330207825, + "learning_rate": 0.0002, + "loss": 1.6388, + "step": 3140 + }, + { + "epoch": 2.4447031431897557, + "grad_norm": 0.44546666741371155, + "learning_rate": 0.0002, + "loss": 1.5667, + "step": 3150 + }, + { + "epoch": 2.452464105549088, + "grad_norm": 0.5167558193206787, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 3160 + }, + { + "epoch": 2.4602250679084205, + "grad_norm": 0.5226958990097046, + "learning_rate": 0.0002, + "loss": 1.5673, + "step": 3170 + }, + { + "epoch": 2.4679860302677534, + "grad_norm": 0.4751799702644348, + "learning_rate": 0.0002, + "loss": 1.5758, + "step": 3180 + }, + { + "epoch": 2.475746992627086, + "grad_norm": 0.4744729697704315, + "learning_rate": 0.0002, + "loss": 1.6234, + "step": 3190 + }, + { + "epoch": 2.4835079549864183, + "grad_norm": 0.5203230381011963, + "learning_rate": 0.0002, + "loss": 1.5661, + "step": 3200 + }, + { + "epoch": 2.4912689173457507, + "grad_norm": 0.47209781408309937, + "learning_rate": 0.0002, + "loss": 1.493, + "step": 3210 + }, + { + "epoch": 2.4990298797050836, + "grad_norm": 0.5241674780845642, + "learning_rate": 0.0002, + "loss": 1.6415, + "step": 3220 + }, + { + "epoch": 2.506790842064416, + "grad_norm": 0.5152244567871094, + "learning_rate": 0.0002, + "loss": 1.6324, + "step": 3230 + }, + { + "epoch": 2.5145518044237485, + "grad_norm": 0.5216741561889648, + "learning_rate": 0.0002, + "loss": 1.6248, + "step": 3240 + }, + { + "epoch": 2.522312766783081, + "grad_norm": 0.4953259527683258, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 3250 + }, + { + "epoch": 2.530073729142414, + "grad_norm": 0.5973829030990601, + "learning_rate": 0.0002, + "loss": 1.666, + "step": 3260 + }, + { + "epoch": 2.5378346915017462, + "grad_norm": 0.48804202675819397, + "learning_rate": 0.0002, + "loss": 1.5295, + "step": 3270 + }, + { + "epoch": 2.5455956538610787, + "grad_norm": 0.5334644317626953, + "learning_rate": 0.0002, + "loss": 1.4954, + "step": 3280 + }, + { + "epoch": 2.5533566162204115, + "grad_norm": 0.46873313188552856, + "learning_rate": 0.0002, + "loss": 1.5814, + "step": 3290 + }, + { + "epoch": 2.561117578579744, + "grad_norm": 0.4282589554786682, + "learning_rate": 0.0002, + "loss": 1.5362, + "step": 3300 + }, + { + "epoch": 2.5688785409390764, + "grad_norm": 0.4848293960094452, + "learning_rate": 0.0002, + "loss": 1.6278, + "step": 3310 + }, + { + "epoch": 2.576639503298409, + "grad_norm": 0.5093745589256287, + "learning_rate": 0.0002, + "loss": 1.6308, + "step": 3320 + }, + { + "epoch": 2.5844004656577413, + "grad_norm": 0.5084842443466187, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 3330 + }, + { + "epoch": 2.592161428017074, + "grad_norm": 0.4696281850337982, + "learning_rate": 0.0002, + "loss": 1.6168, + "step": 3340 + }, + { + "epoch": 2.5999223903764066, + "grad_norm": 0.5767765641212463, + "learning_rate": 0.0002, + "loss": 1.5359, + "step": 3350 + }, + { + "epoch": 2.607683352735739, + "grad_norm": 0.47300875186920166, + "learning_rate": 0.0002, + "loss": 1.6097, + "step": 3360 + }, + { + "epoch": 2.615444315095072, + "grad_norm": 0.4809158146381378, + "learning_rate": 0.0002, + "loss": 1.6138, + "step": 3370 + }, + { + "epoch": 2.6232052774544043, + "grad_norm": 0.5141063928604126, + "learning_rate": 0.0002, + "loss": 1.4952, + "step": 3380 + }, + { + "epoch": 2.630966239813737, + "grad_norm": 0.4832935035228729, + "learning_rate": 0.0002, + "loss": 1.5784, + "step": 3390 + }, + { + "epoch": 2.6387272021730697, + "grad_norm": 0.5044625401496887, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 3400 + }, + { + "epoch": 2.646488164532402, + "grad_norm": 0.5287680625915527, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 3410 + }, + { + "epoch": 2.6542491268917345, + "grad_norm": 0.5306379795074463, + "learning_rate": 0.0002, + "loss": 1.5423, + "step": 3420 + }, + { + "epoch": 2.662010089251067, + "grad_norm": 0.5849291682243347, + "learning_rate": 0.0002, + "loss": 1.5264, + "step": 3430 + }, + { + "epoch": 2.6697710516104, + "grad_norm": 0.7951080799102783, + "learning_rate": 0.0002, + "loss": 1.5937, + "step": 3440 + }, + { + "epoch": 2.6775320139697323, + "grad_norm": 0.48087653517723083, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 3450 + }, + { + "epoch": 2.6852929763290647, + "grad_norm": 0.5396431684494019, + "learning_rate": 0.0002, + "loss": 1.6769, + "step": 3460 + }, + { + "epoch": 2.693053938688397, + "grad_norm": 0.5481634736061096, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 3470 + }, + { + "epoch": 2.70081490104773, + "grad_norm": 0.5068731307983398, + "learning_rate": 0.0002, + "loss": 1.6436, + "step": 3480 + }, + { + "epoch": 2.7085758634070625, + "grad_norm": 0.5759826898574829, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 3490 + }, + { + "epoch": 2.716336825766395, + "grad_norm": 0.7253932952880859, + "learning_rate": 0.0002, + "loss": 1.596, + "step": 3500 + }, + { + "epoch": 2.724097788125728, + "grad_norm": 0.527745246887207, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 3510 + }, + { + "epoch": 2.73185875048506, + "grad_norm": 0.5279242396354675, + "learning_rate": 0.0002, + "loss": 1.5874, + "step": 3520 + }, + { + "epoch": 2.7396197128443927, + "grad_norm": 0.5047839283943176, + "learning_rate": 0.0002, + "loss": 1.6768, + "step": 3530 + }, + { + "epoch": 2.7473806752037255, + "grad_norm": 0.5430883169174194, + "learning_rate": 0.0002, + "loss": 1.5517, + "step": 3540 + }, + { + "epoch": 2.755141637563058, + "grad_norm": 0.4496723711490631, + "learning_rate": 0.0002, + "loss": 1.5624, + "step": 3550 + }, + { + "epoch": 2.7629025999223904, + "grad_norm": 0.5063338875770569, + "learning_rate": 0.0002, + "loss": 1.5789, + "step": 3560 + }, + { + "epoch": 2.770663562281723, + "grad_norm": 0.4619026780128479, + "learning_rate": 0.0002, + "loss": 1.52, + "step": 3570 + }, + { + "epoch": 2.7784245246410553, + "grad_norm": 0.4753304123878479, + "learning_rate": 0.0002, + "loss": 1.5793, + "step": 3580 + }, + { + "epoch": 2.786185487000388, + "grad_norm": 0.5422708988189697, + "learning_rate": 0.0002, + "loss": 1.5715, + "step": 3590 + }, + { + "epoch": 2.7939464493597206, + "grad_norm": 0.4756578803062439, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 3600 + }, + { + "epoch": 2.801707411719053, + "grad_norm": 0.5057567358016968, + "learning_rate": 0.0002, + "loss": 1.5358, + "step": 3610 + }, + { + "epoch": 2.809468374078386, + "grad_norm": 0.5410919785499573, + "learning_rate": 0.0002, + "loss": 1.6131, + "step": 3620 + }, + { + "epoch": 2.8172293364377183, + "grad_norm": 0.4958136975765228, + "learning_rate": 0.0002, + "loss": 1.5573, + "step": 3630 + }, + { + "epoch": 2.8249902987970508, + "grad_norm": 0.454527348279953, + "learning_rate": 0.0002, + "loss": 1.6324, + "step": 3640 + }, + { + "epoch": 2.8327512611563836, + "grad_norm": 0.5092706084251404, + "learning_rate": 0.0002, + "loss": 1.5582, + "step": 3650 + }, + { + "epoch": 2.840512223515716, + "grad_norm": 0.5314022302627563, + "learning_rate": 0.0002, + "loss": 1.5893, + "step": 3660 + }, + { + "epoch": 2.8482731858750485, + "grad_norm": 0.5028239488601685, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 3670 + }, + { + "epoch": 2.856034148234381, + "grad_norm": 0.5127444863319397, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 3680 + }, + { + "epoch": 2.8637951105937134, + "grad_norm": 0.5045645236968994, + "learning_rate": 0.0002, + "loss": 1.6018, + "step": 3690 + }, + { + "epoch": 2.8715560729530463, + "grad_norm": 0.5560781955718994, + "learning_rate": 0.0002, + "loss": 1.5788, + "step": 3700 + }, + { + "epoch": 2.8793170353123787, + "grad_norm": 0.5177600383758545, + "learning_rate": 0.0002, + "loss": 1.5988, + "step": 3710 + }, + { + "epoch": 2.887077997671711, + "grad_norm": 0.45830899477005005, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 3720 + }, + { + "epoch": 2.894838960031044, + "grad_norm": 0.4828629195690155, + "learning_rate": 0.0002, + "loss": 1.6344, + "step": 3730 + }, + { + "epoch": 2.9025999223903765, + "grad_norm": 0.48241183161735535, + "learning_rate": 0.0002, + "loss": 1.6758, + "step": 3740 + }, + { + "epoch": 2.910360884749709, + "grad_norm": 0.4909592568874359, + "learning_rate": 0.0002, + "loss": 1.5649, + "step": 3750 + }, + { + "epoch": 2.9181218471090418, + "grad_norm": 0.44677025079727173, + "learning_rate": 0.0002, + "loss": 1.4927, + "step": 3760 + }, + { + "epoch": 2.925882809468374, + "grad_norm": 0.4928834140300751, + "learning_rate": 0.0002, + "loss": 1.5067, + "step": 3770 + }, + { + "epoch": 2.9336437718277066, + "grad_norm": 0.5673553347587585, + "learning_rate": 0.0002, + "loss": 1.5843, + "step": 3780 + }, + { + "epoch": 2.941404734187039, + "grad_norm": 0.548190712928772, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 3790 + }, + { + "epoch": 2.9491656965463715, + "grad_norm": 0.48979803919792175, + "learning_rate": 0.0002, + "loss": 1.5892, + "step": 3800 + }, + { + "epoch": 2.9569266589057044, + "grad_norm": 0.533191978931427, + "learning_rate": 0.0002, + "loss": 1.5589, + "step": 3810 + }, + { + "epoch": 2.964687621265037, + "grad_norm": 0.5362946391105652, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 3820 + }, + { + "epoch": 2.9724485836243693, + "grad_norm": 0.4724906384944916, + "learning_rate": 0.0002, + "loss": 1.6602, + "step": 3830 + }, + { + "epoch": 2.980209545983702, + "grad_norm": 0.5468461513519287, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 3840 + }, + { + "epoch": 2.9879705083430346, + "grad_norm": 0.4697108864784241, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 3850 + }, + { + "epoch": 2.995731470702367, + "grad_norm": 0.4780906140804291, + "learning_rate": 0.0002, + "loss": 1.6312, + "step": 3860 + }, + { + "epoch": 2.9996119518820334, + "eval_loss": 1.8472607135772705, + "eval_runtime": 106.5541, + "eval_samples_per_second": 4.758, + "eval_steps_per_second": 0.601, + "step": 3865 + }, + { + "epoch": 3.0034924330616994, + "grad_norm": 0.5645653605461121, + "learning_rate": 0.0002, + "loss": 1.4983, + "step": 3870 + }, + { + "epoch": 3.0112533954210323, + "grad_norm": 0.6457151174545288, + "learning_rate": 0.0002, + "loss": 1.4334, + "step": 3880 + }, + { + "epoch": 3.0190143577803648, + "grad_norm": 0.583838164806366, + "learning_rate": 0.0002, + "loss": 1.3899, + "step": 3890 + }, + { + "epoch": 3.026775320139697, + "grad_norm": 0.6819260120391846, + "learning_rate": 0.0002, + "loss": 1.3258, + "step": 3900 + }, + { + "epoch": 3.03453628249903, + "grad_norm": 0.6692903637886047, + "learning_rate": 0.0002, + "loss": 1.3458, + "step": 3910 + }, + { + "epoch": 3.0422972448583625, + "grad_norm": 0.6101024746894836, + "learning_rate": 0.0002, + "loss": 1.4356, + "step": 3920 + }, + { + "epoch": 3.050058207217695, + "grad_norm": 0.7014093399047852, + "learning_rate": 0.0002, + "loss": 1.394, + "step": 3930 + }, + { + "epoch": 3.0578191695770274, + "grad_norm": 0.7380381226539612, + "learning_rate": 0.0002, + "loss": 1.3885, + "step": 3940 + }, + { + "epoch": 3.0655801319363603, + "grad_norm": 0.6607900857925415, + "learning_rate": 0.0002, + "loss": 1.4206, + "step": 3950 + }, + { + "epoch": 3.0733410942956927, + "grad_norm": 0.735263466835022, + "learning_rate": 0.0002, + "loss": 1.4293, + "step": 3960 + }, + { + "epoch": 3.081102056655025, + "grad_norm": 0.6788513660430908, + "learning_rate": 0.0002, + "loss": 1.3966, + "step": 3970 + }, + { + "epoch": 3.088863019014358, + "grad_norm": 0.6347652673721313, + "learning_rate": 0.0002, + "loss": 1.3435, + "step": 3980 + }, + { + "epoch": 3.0966239813736904, + "grad_norm": 0.7056642770767212, + "learning_rate": 0.0002, + "loss": 1.4518, + "step": 3990 + }, + { + "epoch": 3.104384943733023, + "grad_norm": 0.6387075185775757, + "learning_rate": 0.0002, + "loss": 1.4474, + "step": 4000 + }, + { + "epoch": 3.1121459060923553, + "grad_norm": 0.6701116561889648, + "learning_rate": 0.0002, + "loss": 1.3833, + "step": 4010 + }, + { + "epoch": 3.119906868451688, + "grad_norm": 0.7558449506759644, + "learning_rate": 0.0002, + "loss": 1.404, + "step": 4020 + }, + { + "epoch": 3.1276678308110206, + "grad_norm": 0.6612881422042847, + "learning_rate": 0.0002, + "loss": 1.3294, + "step": 4030 + }, + { + "epoch": 3.135428793170353, + "grad_norm": 0.7474587559700012, + "learning_rate": 0.0002, + "loss": 1.439, + "step": 4040 + }, + { + "epoch": 3.1431897555296855, + "grad_norm": 0.7292373776435852, + "learning_rate": 0.0002, + "loss": 1.4616, + "step": 4050 + }, + { + "epoch": 3.1509507178890184, + "grad_norm": 0.7432886958122253, + "learning_rate": 0.0002, + "loss": 1.3908, + "step": 4060 + }, + { + "epoch": 3.158711680248351, + "grad_norm": 0.6366098523139954, + "learning_rate": 0.0002, + "loss": 1.4214, + "step": 4070 + }, + { + "epoch": 3.1664726426076832, + "grad_norm": 0.6837611794471741, + "learning_rate": 0.0002, + "loss": 1.5044, + "step": 4080 + }, + { + "epoch": 3.174233604967016, + "grad_norm": 0.7194393277168274, + "learning_rate": 0.0002, + "loss": 1.4332, + "step": 4090 + }, + { + "epoch": 3.1819945673263486, + "grad_norm": 0.6963607668876648, + "learning_rate": 0.0002, + "loss": 1.3628, + "step": 4100 + }, + { + "epoch": 3.189755529685681, + "grad_norm": 0.6404902935028076, + "learning_rate": 0.0002, + "loss": 1.4127, + "step": 4110 + }, + { + "epoch": 3.1975164920450134, + "grad_norm": 0.7172070741653442, + "learning_rate": 0.0002, + "loss": 1.4394, + "step": 4120 + }, + { + "epoch": 3.2052774544043463, + "grad_norm": 0.6577759385108948, + "learning_rate": 0.0002, + "loss": 1.4658, + "step": 4130 + }, + { + "epoch": 3.2130384167636787, + "grad_norm": 0.6658480167388916, + "learning_rate": 0.0002, + "loss": 1.4019, + "step": 4140 + }, + { + "epoch": 3.220799379123011, + "grad_norm": 0.6771699786186218, + "learning_rate": 0.0002, + "loss": 1.4348, + "step": 4150 + }, + { + "epoch": 3.2285603414823436, + "grad_norm": 0.699035108089447, + "learning_rate": 0.0002, + "loss": 1.4736, + "step": 4160 + }, + { + "epoch": 3.2363213038416765, + "grad_norm": 0.7218514680862427, + "learning_rate": 0.0002, + "loss": 1.4096, + "step": 4170 + }, + { + "epoch": 3.244082266201009, + "grad_norm": 0.6270631551742554, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 4180 + }, + { + "epoch": 3.2518432285603414, + "grad_norm": 0.6828921437263489, + "learning_rate": 0.0002, + "loss": 1.4076, + "step": 4190 + }, + { + "epoch": 3.2596041909196742, + "grad_norm": 0.6005498170852661, + "learning_rate": 0.0002, + "loss": 1.4663, + "step": 4200 + }, + { + "epoch": 3.2673651532790067, + "grad_norm": 0.6974790692329407, + "learning_rate": 0.0002, + "loss": 1.4798, + "step": 4210 + }, + { + "epoch": 3.275126115638339, + "grad_norm": 0.7269543409347534, + "learning_rate": 0.0002, + "loss": 1.5012, + "step": 4220 + }, + { + "epoch": 3.2828870779976715, + "grad_norm": 0.6728787422180176, + "learning_rate": 0.0002, + "loss": 1.3848, + "step": 4230 + }, + { + "epoch": 3.2906480403570044, + "grad_norm": 0.676972508430481, + "learning_rate": 0.0002, + "loss": 1.4112, + "step": 4240 + }, + { + "epoch": 3.298409002716337, + "grad_norm": 0.748309314250946, + "learning_rate": 0.0002, + "loss": 1.4206, + "step": 4250 + }, + { + "epoch": 3.3061699650756693, + "grad_norm": 0.6976589560508728, + "learning_rate": 0.0002, + "loss": 1.4973, + "step": 4260 + }, + { + "epoch": 3.3139309274350017, + "grad_norm": 0.649780809879303, + "learning_rate": 0.0002, + "loss": 1.3967, + "step": 4270 + }, + { + "epoch": 3.3216918897943346, + "grad_norm": 0.6529902815818787, + "learning_rate": 0.0002, + "loss": 1.327, + "step": 4280 + }, + { + "epoch": 3.329452852153667, + "grad_norm": 0.9273163676261902, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 4290 + }, + { + "epoch": 3.3372138145129995, + "grad_norm": 0.717024028301239, + "learning_rate": 0.0002, + "loss": 1.4859, + "step": 4300 + }, + { + "epoch": 3.3449747768723324, + "grad_norm": 0.7914950251579285, + "learning_rate": 0.0002, + "loss": 1.4441, + "step": 4310 + }, + { + "epoch": 3.352735739231665, + "grad_norm": 0.7133203148841858, + "learning_rate": 0.0002, + "loss": 1.432, + "step": 4320 + }, + { + "epoch": 3.3604967015909972, + "grad_norm": 0.7409568428993225, + "learning_rate": 0.0002, + "loss": 1.4662, + "step": 4330 + }, + { + "epoch": 3.3682576639503297, + "grad_norm": 0.6993981003761292, + "learning_rate": 0.0002, + "loss": 1.3992, + "step": 4340 + }, + { + "epoch": 3.3760186263096625, + "grad_norm": 0.7114535570144653, + "learning_rate": 0.0002, + "loss": 1.4261, + "step": 4350 + }, + { + "epoch": 3.383779588668995, + "grad_norm": 0.6790860295295715, + "learning_rate": 0.0002, + "loss": 1.4227, + "step": 4360 + }, + { + "epoch": 3.3915405510283274, + "grad_norm": 0.6507849097251892, + "learning_rate": 0.0002, + "loss": 1.4128, + "step": 4370 + }, + { + "epoch": 3.39930151338766, + "grad_norm": 0.5967804193496704, + "learning_rate": 0.0002, + "loss": 1.4559, + "step": 4380 + }, + { + "epoch": 3.4070624757469927, + "grad_norm": 0.6625847816467285, + "learning_rate": 0.0002, + "loss": 1.3687, + "step": 4390 + }, + { + "epoch": 3.414823438106325, + "grad_norm": 0.6736508011817932, + "learning_rate": 0.0002, + "loss": 1.4193, + "step": 4400 + }, + { + "epoch": 3.4225844004656576, + "grad_norm": 0.7870860695838928, + "learning_rate": 0.0002, + "loss": 1.4363, + "step": 4410 + }, + { + "epoch": 3.4303453628249905, + "grad_norm": 0.7205295562744141, + "learning_rate": 0.0002, + "loss": 1.4114, + "step": 4420 + }, + { + "epoch": 3.438106325184323, + "grad_norm": 0.6634634137153625, + "learning_rate": 0.0002, + "loss": 1.4131, + "step": 4430 + }, + { + "epoch": 3.4458672875436553, + "grad_norm": 0.7562733292579651, + "learning_rate": 0.0002, + "loss": 1.4683, + "step": 4440 + }, + { + "epoch": 3.453628249902988, + "grad_norm": 0.6585879921913147, + "learning_rate": 0.0002, + "loss": 1.3486, + "step": 4450 + }, + { + "epoch": 3.4613892122623207, + "grad_norm": 0.6896792054176331, + "learning_rate": 0.0002, + "loss": 1.4283, + "step": 4460 + }, + { + "epoch": 3.469150174621653, + "grad_norm": 0.6520342230796814, + "learning_rate": 0.0002, + "loss": 1.4208, + "step": 4470 + }, + { + "epoch": 3.4769111369809855, + "grad_norm": 0.6760806441307068, + "learning_rate": 0.0002, + "loss": 1.3423, + "step": 4480 + }, + { + "epoch": 3.484672099340318, + "grad_norm": 0.7539774179458618, + "learning_rate": 0.0002, + "loss": 1.4398, + "step": 4490 + }, + { + "epoch": 3.492433061699651, + "grad_norm": 0.7409411668777466, + "learning_rate": 0.0002, + "loss": 1.4534, + "step": 4500 + }, + { + "epoch": 3.5001940240589833, + "grad_norm": 0.6876253485679626, + "learning_rate": 0.0002, + "loss": 1.4069, + "step": 4510 + }, + { + "epoch": 3.5079549864183157, + "grad_norm": 0.7028461694717407, + "learning_rate": 0.0002, + "loss": 1.4228, + "step": 4520 + }, + { + "epoch": 3.5157159487776486, + "grad_norm": 0.8056529760360718, + "learning_rate": 0.0002, + "loss": 1.4723, + "step": 4530 + }, + { + "epoch": 3.523476911136981, + "grad_norm": 0.711338996887207, + "learning_rate": 0.0002, + "loss": 1.4148, + "step": 4540 + }, + { + "epoch": 3.5312378734963135, + "grad_norm": 0.7343552708625793, + "learning_rate": 0.0002, + "loss": 1.5247, + "step": 4550 + }, + { + "epoch": 3.5389988358556463, + "grad_norm": 0.745479941368103, + "learning_rate": 0.0002, + "loss": 1.4308, + "step": 4560 + }, + { + "epoch": 3.5467597982149788, + "grad_norm": 0.7582294940948486, + "learning_rate": 0.0002, + "loss": 1.4229, + "step": 4570 + }, + { + "epoch": 3.554520760574311, + "grad_norm": 0.6717444658279419, + "learning_rate": 0.0002, + "loss": 1.4127, + "step": 4580 + }, + { + "epoch": 3.5622817229336436, + "grad_norm": 0.7417883276939392, + "learning_rate": 0.0002, + "loss": 1.4368, + "step": 4590 + }, + { + "epoch": 3.570042685292976, + "grad_norm": 0.6385737061500549, + "learning_rate": 0.0002, + "loss": 1.4176, + "step": 4600 + }, + { + "epoch": 3.577803647652309, + "grad_norm": 0.716704249382019, + "learning_rate": 0.0002, + "loss": 1.3981, + "step": 4610 + }, + { + "epoch": 3.5855646100116414, + "grad_norm": 0.6948980093002319, + "learning_rate": 0.0002, + "loss": 1.3889, + "step": 4620 + }, + { + "epoch": 3.593325572370974, + "grad_norm": 0.6961140036582947, + "learning_rate": 0.0002, + "loss": 1.5177, + "step": 4630 + }, + { + "epoch": 3.6010865347303067, + "grad_norm": 0.7493122220039368, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 4640 + }, + { + "epoch": 3.608847497089639, + "grad_norm": 0.7431658506393433, + "learning_rate": 0.0002, + "loss": 1.3987, + "step": 4650 + }, + { + "epoch": 3.6166084594489716, + "grad_norm": 0.8353387713432312, + "learning_rate": 0.0002, + "loss": 1.4551, + "step": 4660 + }, + { + "epoch": 3.6243694218083045, + "grad_norm": 0.7095612287521362, + "learning_rate": 0.0002, + "loss": 1.4533, + "step": 4670 + }, + { + "epoch": 3.632130384167637, + "grad_norm": 0.776620090007782, + "learning_rate": 0.0002, + "loss": 1.4003, + "step": 4680 + }, + { + "epoch": 3.6398913465269693, + "grad_norm": 0.7198925018310547, + "learning_rate": 0.0002, + "loss": 1.4361, + "step": 4690 + }, + { + "epoch": 3.6476523088863018, + "grad_norm": 0.8238834738731384, + "learning_rate": 0.0002, + "loss": 1.4543, + "step": 4700 + }, + { + "epoch": 3.655413271245634, + "grad_norm": 0.6804245710372925, + "learning_rate": 0.0002, + "loss": 1.3958, + "step": 4710 + }, + { + "epoch": 3.663174233604967, + "grad_norm": 0.8444845676422119, + "learning_rate": 0.0002, + "loss": 1.4158, + "step": 4720 + }, + { + "epoch": 3.6709351959642995, + "grad_norm": 0.743797779083252, + "learning_rate": 0.0002, + "loss": 1.3825, + "step": 4730 + }, + { + "epoch": 3.678696158323632, + "grad_norm": 0.8994188904762268, + "learning_rate": 0.0002, + "loss": 1.4213, + "step": 4740 + }, + { + "epoch": 3.686457120682965, + "grad_norm": 0.75416100025177, + "learning_rate": 0.0002, + "loss": 1.4281, + "step": 4750 + }, + { + "epoch": 3.6942180830422973, + "grad_norm": 0.6499266028404236, + "learning_rate": 0.0002, + "loss": 1.4154, + "step": 4760 + }, + { + "epoch": 3.7019790454016297, + "grad_norm": 0.7246791124343872, + "learning_rate": 0.0002, + "loss": 1.4005, + "step": 4770 + }, + { + "epoch": 3.7097400077609626, + "grad_norm": 0.7831124067306519, + "learning_rate": 0.0002, + "loss": 1.426, + "step": 4780 + }, + { + "epoch": 3.717500970120295, + "grad_norm": 0.7130028009414673, + "learning_rate": 0.0002, + "loss": 1.3933, + "step": 4790 + }, + { + "epoch": 3.7252619324796274, + "grad_norm": 0.7501602172851562, + "learning_rate": 0.0002, + "loss": 1.4632, + "step": 4800 + }, + { + "epoch": 3.73302289483896, + "grad_norm": 0.6980932950973511, + "learning_rate": 0.0002, + "loss": 1.4985, + "step": 4810 + }, + { + "epoch": 3.7407838571982923, + "grad_norm": 0.8050530552864075, + "learning_rate": 0.0002, + "loss": 1.4517, + "step": 4820 + }, + { + "epoch": 3.748544819557625, + "grad_norm": 0.6385579705238342, + "learning_rate": 0.0002, + "loss": 1.4703, + "step": 4830 + }, + { + "epoch": 3.7563057819169576, + "grad_norm": 0.6664714813232422, + "learning_rate": 0.0002, + "loss": 1.5281, + "step": 4840 + }, + { + "epoch": 3.76406674427629, + "grad_norm": 0.7125676274299622, + "learning_rate": 0.0002, + "loss": 1.4443, + "step": 4850 + }, + { + "epoch": 3.771827706635623, + "grad_norm": 0.7231866717338562, + "learning_rate": 0.0002, + "loss": 1.3958, + "step": 4860 + }, + { + "epoch": 3.7795886689949554, + "grad_norm": 0.6917183995246887, + "learning_rate": 0.0002, + "loss": 1.4446, + "step": 4870 + }, + { + "epoch": 3.787349631354288, + "grad_norm": 0.665037989616394, + "learning_rate": 0.0002, + "loss": 1.4369, + "step": 4880 + }, + { + "epoch": 3.7951105937136207, + "grad_norm": 0.5837726593017578, + "learning_rate": 0.0002, + "loss": 1.4193, + "step": 4890 + }, + { + "epoch": 3.802871556072953, + "grad_norm": 0.6366701722145081, + "learning_rate": 0.0002, + "loss": 1.4176, + "step": 4900 + }, + { + "epoch": 3.8106325184322856, + "grad_norm": 0.7082223892211914, + "learning_rate": 0.0002, + "loss": 1.46, + "step": 4910 + }, + { + "epoch": 3.818393480791618, + "grad_norm": 0.8101672530174255, + "learning_rate": 0.0002, + "loss": 1.5139, + "step": 4920 + }, + { + "epoch": 3.826154443150951, + "grad_norm": 0.7516148090362549, + "learning_rate": 0.0002, + "loss": 1.3659, + "step": 4930 + }, + { + "epoch": 3.8339154055102833, + "grad_norm": 0.7928489446640015, + "learning_rate": 0.0002, + "loss": 1.3909, + "step": 4940 + }, + { + "epoch": 3.8416763678696157, + "grad_norm": 0.6892234683036804, + "learning_rate": 0.0002, + "loss": 1.4255, + "step": 4950 + }, + { + "epoch": 3.849437330228948, + "grad_norm": 0.6381304264068604, + "learning_rate": 0.0002, + "loss": 1.5024, + "step": 4960 + }, + { + "epoch": 3.857198292588281, + "grad_norm": 0.8068831562995911, + "learning_rate": 0.0002, + "loss": 1.4873, + "step": 4970 + }, + { + "epoch": 3.8649592549476135, + "grad_norm": 0.7289869785308838, + "learning_rate": 0.0002, + "loss": 1.45, + "step": 4980 + }, + { + "epoch": 3.872720217306946, + "grad_norm": 0.7278549075126648, + "learning_rate": 0.0002, + "loss": 1.398, + "step": 4990 + }, + { + "epoch": 3.880481179666279, + "grad_norm": 0.7324236631393433, + "learning_rate": 0.0002, + "loss": 1.4442, + "step": 5000 + }, + { + "epoch": 3.8882421420256112, + "grad_norm": 0.6759871244430542, + "learning_rate": 0.0002, + "loss": 1.4511, + "step": 5010 + }, + { + "epoch": 3.8960031043849437, + "grad_norm": 0.8159207701683044, + "learning_rate": 0.0002, + "loss": 1.4705, + "step": 5020 + }, + { + "epoch": 3.9037640667442766, + "grad_norm": 0.6536211967468262, + "learning_rate": 0.0002, + "loss": 1.4685, + "step": 5030 + }, + { + "epoch": 3.911525029103609, + "grad_norm": 0.6827932000160217, + "learning_rate": 0.0002, + "loss": 1.4335, + "step": 5040 + }, + { + "epoch": 3.9192859914629414, + "grad_norm": 0.6688340306282043, + "learning_rate": 0.0002, + "loss": 1.433, + "step": 5050 + }, + { + "epoch": 3.927046953822274, + "grad_norm": 0.6385695934295654, + "learning_rate": 0.0002, + "loss": 1.4099, + "step": 5060 + }, + { + "epoch": 3.9348079161816063, + "grad_norm": 0.6975107192993164, + "learning_rate": 0.0002, + "loss": 1.4767, + "step": 5070 + }, + { + "epoch": 3.942568878540939, + "grad_norm": 0.6684112548828125, + "learning_rate": 0.0002, + "loss": 1.4893, + "step": 5080 + }, + { + "epoch": 3.9503298409002716, + "grad_norm": 0.8349628448486328, + "learning_rate": 0.0002, + "loss": 1.4732, + "step": 5090 + }, + { + "epoch": 3.958090803259604, + "grad_norm": 0.7146425843238831, + "learning_rate": 0.0002, + "loss": 1.5131, + "step": 5100 + }, + { + "epoch": 3.965851765618937, + "grad_norm": 0.6555036902427673, + "learning_rate": 0.0002, + "loss": 1.4149, + "step": 5110 + }, + { + "epoch": 3.9736127279782694, + "grad_norm": 0.7037415504455566, + "learning_rate": 0.0002, + "loss": 1.4274, + "step": 5120 + }, + { + "epoch": 3.981373690337602, + "grad_norm": 0.7235575914382935, + "learning_rate": 0.0002, + "loss": 1.4292, + "step": 5130 + }, + { + "epoch": 3.9891346526969347, + "grad_norm": 0.7092325687408447, + "learning_rate": 0.0002, + "loss": 1.4455, + "step": 5140 + }, + { + "epoch": 3.996895615056267, + "grad_norm": 0.7490319609642029, + "learning_rate": 0.0002, + "loss": 1.4512, + "step": 5150 + }, + { + "epoch": 4.0, + "eval_loss": 1.9131355285644531, + "eval_runtime": 105.5778, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 0.606, + "step": 5154 + }, + { + "epoch": 4.0046565774155995, + "grad_norm": 0.7075854539871216, + "learning_rate": 0.0002, + "loss": 1.2643, + "step": 5160 + }, + { + "epoch": 4.012417539774932, + "grad_norm": 0.9466007351875305, + "learning_rate": 0.0002, + "loss": 1.209, + "step": 5170 + }, + { + "epoch": 4.020178502134264, + "grad_norm": 1.0297044515609741, + "learning_rate": 0.0002, + "loss": 1.2567, + "step": 5180 + }, + { + "epoch": 4.027939464493597, + "grad_norm": 0.7765059471130371, + "learning_rate": 0.0002, + "loss": 1.1796, + "step": 5190 + }, + { + "epoch": 4.03570042685293, + "grad_norm": 0.995760977268219, + "learning_rate": 0.0002, + "loss": 1.2356, + "step": 5200 + }, + { + "epoch": 4.043461389212262, + "grad_norm": 0.8663829565048218, + "learning_rate": 0.0002, + "loss": 1.1792, + "step": 5210 + }, + { + "epoch": 4.051222351571595, + "grad_norm": 1.0660825967788696, + "learning_rate": 0.0002, + "loss": 1.2471, + "step": 5220 + }, + { + "epoch": 4.058983313930927, + "grad_norm": 0.9858174920082092, + "learning_rate": 0.0002, + "loss": 1.1676, + "step": 5230 + }, + { + "epoch": 4.06674427629026, + "grad_norm": 0.8911338448524475, + "learning_rate": 0.0002, + "loss": 1.2448, + "step": 5240 + }, + { + "epoch": 4.074505238649593, + "grad_norm": 1.0848394632339478, + "learning_rate": 0.0002, + "loss": 1.1858, + "step": 5250 + }, + { + "epoch": 4.082266201008925, + "grad_norm": 1.0849905014038086, + "learning_rate": 0.0002, + "loss": 1.1684, + "step": 5260 + }, + { + "epoch": 4.090027163368258, + "grad_norm": 1.0497841835021973, + "learning_rate": 0.0002, + "loss": 1.2007, + "step": 5270 + }, + { + "epoch": 4.0977881257275905, + "grad_norm": 0.8943053483963013, + "learning_rate": 0.0002, + "loss": 1.2552, + "step": 5280 + }, + { + "epoch": 4.1055490880869225, + "grad_norm": 0.8432527184486389, + "learning_rate": 0.0002, + "loss": 1.1923, + "step": 5290 + }, + { + "epoch": 4.113310050446255, + "grad_norm": 0.9690414667129517, + "learning_rate": 0.0002, + "loss": 1.1634, + "step": 5300 + }, + { + "epoch": 4.121071012805588, + "grad_norm": 0.7790773510932922, + "learning_rate": 0.0002, + "loss": 1.3019, + "step": 5310 + }, + { + "epoch": 4.12883197516492, + "grad_norm": 0.9289211630821228, + "learning_rate": 0.0002, + "loss": 1.1806, + "step": 5320 + }, + { + "epoch": 4.136592937524253, + "grad_norm": 1.0785125494003296, + "learning_rate": 0.0002, + "loss": 1.1458, + "step": 5330 + }, + { + "epoch": 4.144353899883585, + "grad_norm": 0.8559591770172119, + "learning_rate": 0.0002, + "loss": 1.2086, + "step": 5340 + }, + { + "epoch": 4.152114862242918, + "grad_norm": 0.9405956268310547, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 5350 + }, + { + "epoch": 4.159875824602251, + "grad_norm": 0.9942827820777893, + "learning_rate": 0.0002, + "loss": 1.1793, + "step": 5360 + }, + { + "epoch": 4.167636786961583, + "grad_norm": 0.9141933917999268, + "learning_rate": 0.0002, + "loss": 1.1659, + "step": 5370 + }, + { + "epoch": 4.175397749320916, + "grad_norm": 0.8206015229225159, + "learning_rate": 0.0002, + "loss": 1.1647, + "step": 5380 + }, + { + "epoch": 4.183158711680249, + "grad_norm": 0.9340888857841492, + "learning_rate": 0.0002, + "loss": 1.2778, + "step": 5390 + }, + { + "epoch": 4.190919674039581, + "grad_norm": 1.2122114896774292, + "learning_rate": 0.0002, + "loss": 1.2459, + "step": 5400 + }, + { + "epoch": 4.1986806363989135, + "grad_norm": 1.0661298036575317, + "learning_rate": 0.0002, + "loss": 1.2371, + "step": 5410 + }, + { + "epoch": 4.206441598758246, + "grad_norm": 0.9372861385345459, + "learning_rate": 0.0002, + "loss": 1.1978, + "step": 5420 + }, + { + "epoch": 4.214202561117578, + "grad_norm": 0.894012987613678, + "learning_rate": 0.0002, + "loss": 1.2653, + "step": 5430 + }, + { + "epoch": 4.221963523476911, + "grad_norm": 1.0647753477096558, + "learning_rate": 0.0002, + "loss": 1.387, + "step": 5440 + }, + { + "epoch": 4.229724485836243, + "grad_norm": 0.989179790019989, + "learning_rate": 0.0002, + "loss": 1.2231, + "step": 5450 + }, + { + "epoch": 4.237485448195576, + "grad_norm": 1.1601181030273438, + "learning_rate": 0.0002, + "loss": 1.2715, + "step": 5460 + }, + { + "epoch": 4.245246410554909, + "grad_norm": 0.9395585656166077, + "learning_rate": 0.0002, + "loss": 1.2406, + "step": 5470 + }, + { + "epoch": 4.253007372914241, + "grad_norm": 0.9527766108512878, + "learning_rate": 0.0002, + "loss": 1.2779, + "step": 5480 + }, + { + "epoch": 4.260768335273574, + "grad_norm": 1.0319520235061646, + "learning_rate": 0.0002, + "loss": 1.267, + "step": 5490 + }, + { + "epoch": 4.268529297632907, + "grad_norm": 0.8659824728965759, + "learning_rate": 0.0002, + "loss": 1.2633, + "step": 5500 + }, + { + "epoch": 4.276290259992239, + "grad_norm": 1.099211573600769, + "learning_rate": 0.0002, + "loss": 1.1475, + "step": 5510 + }, + { + "epoch": 4.284051222351572, + "grad_norm": 0.9363361597061157, + "learning_rate": 0.0002, + "loss": 1.2508, + "step": 5520 + }, + { + "epoch": 4.2918121847109045, + "grad_norm": 0.8437647223472595, + "learning_rate": 0.0002, + "loss": 1.189, + "step": 5530 + }, + { + "epoch": 4.2995731470702365, + "grad_norm": 0.9181258678436279, + "learning_rate": 0.0002, + "loss": 1.2212, + "step": 5540 + }, + { + "epoch": 4.307334109429569, + "grad_norm": 0.9059357643127441, + "learning_rate": 0.0002, + "loss": 1.2092, + "step": 5550 + }, + { + "epoch": 4.315095071788901, + "grad_norm": 0.9337241649627686, + "learning_rate": 0.0002, + "loss": 1.2189, + "step": 5560 + }, + { + "epoch": 4.322856034148234, + "grad_norm": 0.9428889155387878, + "learning_rate": 0.0002, + "loss": 1.2462, + "step": 5570 + }, + { + "epoch": 4.330616996507567, + "grad_norm": 1.003589153289795, + "learning_rate": 0.0002, + "loss": 1.2675, + "step": 5580 + }, + { + "epoch": 4.338377958866899, + "grad_norm": 1.1249268054962158, + "learning_rate": 0.0002, + "loss": 1.2703, + "step": 5590 + }, + { + "epoch": 4.346138921226232, + "grad_norm": 0.8623469471931458, + "learning_rate": 0.0002, + "loss": 1.2501, + "step": 5600 + }, + { + "epoch": 4.353899883585565, + "grad_norm": 1.1389174461364746, + "learning_rate": 0.0002, + "loss": 1.2404, + "step": 5610 + }, + { + "epoch": 4.361660845944897, + "grad_norm": 1.0136264562606812, + "learning_rate": 0.0002, + "loss": 1.2245, + "step": 5620 + }, + { + "epoch": 4.36942180830423, + "grad_norm": 0.9567070603370667, + "learning_rate": 0.0002, + "loss": 1.3473, + "step": 5630 + }, + { + "epoch": 4.377182770663563, + "grad_norm": 1.0592148303985596, + "learning_rate": 0.0002, + "loss": 1.2988, + "step": 5640 + }, + { + "epoch": 4.384943733022895, + "grad_norm": 1.0110485553741455, + "learning_rate": 0.0002, + "loss": 1.212, + "step": 5650 + }, + { + "epoch": 4.3927046953822275, + "grad_norm": 0.9914907217025757, + "learning_rate": 0.0002, + "loss": 1.2086, + "step": 5660 + }, + { + "epoch": 4.4004656577415595, + "grad_norm": 0.9447247982025146, + "learning_rate": 0.0002, + "loss": 1.2363, + "step": 5670 + }, + { + "epoch": 4.408226620100892, + "grad_norm": 0.9644378423690796, + "learning_rate": 0.0002, + "loss": 1.2617, + "step": 5680 + }, + { + "epoch": 4.415987582460225, + "grad_norm": 0.920676589012146, + "learning_rate": 0.0002, + "loss": 1.2773, + "step": 5690 + }, + { + "epoch": 4.423748544819557, + "grad_norm": 1.060570478439331, + "learning_rate": 0.0002, + "loss": 1.2792, + "step": 5700 + }, + { + "epoch": 4.43150950717889, + "grad_norm": 0.8857738971710205, + "learning_rate": 0.0002, + "loss": 1.2374, + "step": 5710 + }, + { + "epoch": 4.439270469538223, + "grad_norm": 1.0536398887634277, + "learning_rate": 0.0002, + "loss": 1.2588, + "step": 5720 + }, + { + "epoch": 4.447031431897555, + "grad_norm": 0.990847110748291, + "learning_rate": 0.0002, + "loss": 1.2051, + "step": 5730 + }, + { + "epoch": 4.454792394256888, + "grad_norm": 0.9692499041557312, + "learning_rate": 0.0002, + "loss": 1.2469, + "step": 5740 + }, + { + "epoch": 4.462553356616221, + "grad_norm": 1.0376402139663696, + "learning_rate": 0.0002, + "loss": 1.2269, + "step": 5750 + }, + { + "epoch": 4.470314318975553, + "grad_norm": 1.3863259553909302, + "learning_rate": 0.0002, + "loss": 1.1701, + "step": 5760 + }, + { + "epoch": 4.478075281334886, + "grad_norm": 0.978379487991333, + "learning_rate": 0.0002, + "loss": 1.2591, + "step": 5770 + }, + { + "epoch": 4.485836243694218, + "grad_norm": 1.0973085165023804, + "learning_rate": 0.0002, + "loss": 1.2729, + "step": 5780 + }, + { + "epoch": 4.4935972060535505, + "grad_norm": 1.057006597518921, + "learning_rate": 0.0002, + "loss": 1.2404, + "step": 5790 + }, + { + "epoch": 4.501358168412883, + "grad_norm": 0.9247729182243347, + "learning_rate": 0.0002, + "loss": 1.2476, + "step": 5800 + }, + { + "epoch": 4.509119130772215, + "grad_norm": 1.0447787046432495, + "learning_rate": 0.0002, + "loss": 1.2369, + "step": 5810 + }, + { + "epoch": 4.516880093131548, + "grad_norm": 1.1930429935455322, + "learning_rate": 0.0002, + "loss": 1.211, + "step": 5820 + }, + { + "epoch": 4.524641055490881, + "grad_norm": 0.9867590069770813, + "learning_rate": 0.0002, + "loss": 1.2596, + "step": 5830 + }, + { + "epoch": 4.532402017850213, + "grad_norm": 0.9591100215911865, + "learning_rate": 0.0002, + "loss": 1.2766, + "step": 5840 + }, + { + "epoch": 4.540162980209546, + "grad_norm": 0.9950753450393677, + "learning_rate": 0.0002, + "loss": 1.2154, + "step": 5850 + }, + { + "epoch": 4.547923942568879, + "grad_norm": 1.0087506771087646, + "learning_rate": 0.0002, + "loss": 1.2149, + "step": 5860 + }, + { + "epoch": 4.555684904928211, + "grad_norm": 1.0934417247772217, + "learning_rate": 0.0002, + "loss": 1.3165, + "step": 5870 + }, + { + "epoch": 4.563445867287544, + "grad_norm": 1.107987403869629, + "learning_rate": 0.0002, + "loss": 1.3059, + "step": 5880 + }, + { + "epoch": 4.571206829646876, + "grad_norm": 0.9147276878356934, + "learning_rate": 0.0002, + "loss": 1.2184, + "step": 5890 + }, + { + "epoch": 4.578967792006209, + "grad_norm": 1.036780595779419, + "learning_rate": 0.0002, + "loss": 1.24, + "step": 5900 + }, + { + "epoch": 4.5867287543655415, + "grad_norm": 0.9284719824790955, + "learning_rate": 0.0002, + "loss": 1.2209, + "step": 5910 + }, + { + "epoch": 4.5944897167248735, + "grad_norm": 0.9141898155212402, + "learning_rate": 0.0002, + "loss": 1.3693, + "step": 5920 + }, + { + "epoch": 4.602250679084206, + "grad_norm": 1.0447357892990112, + "learning_rate": 0.0002, + "loss": 1.2319, + "step": 5930 + }, + { + "epoch": 4.610011641443539, + "grad_norm": 0.9309114217758179, + "learning_rate": 0.0002, + "loss": 1.2667, + "step": 5940 + }, + { + "epoch": 4.617772603802871, + "grad_norm": 1.2986129522323608, + "learning_rate": 0.0002, + "loss": 1.2827, + "step": 5950 + }, + { + "epoch": 4.625533566162204, + "grad_norm": 0.9221704602241516, + "learning_rate": 0.0002, + "loss": 1.312, + "step": 5960 + }, + { + "epoch": 4.633294528521537, + "grad_norm": 0.9228187799453735, + "learning_rate": 0.0002, + "loss": 1.2769, + "step": 5970 + }, + { + "epoch": 4.641055490880869, + "grad_norm": 0.9483116269111633, + "learning_rate": 0.0002, + "loss": 1.2953, + "step": 5980 + }, + { + "epoch": 4.648816453240202, + "grad_norm": 1.0218974351882935, + "learning_rate": 0.0002, + "loss": 1.3437, + "step": 5990 + }, + { + "epoch": 4.656577415599534, + "grad_norm": 0.9764600396156311, + "learning_rate": 0.0002, + "loss": 1.3085, + "step": 6000 + }, + { + "epoch": 4.664338377958867, + "grad_norm": 0.9115710258483887, + "learning_rate": 0.0002, + "loss": 1.197, + "step": 6010 + }, + { + "epoch": 4.6720993403182, + "grad_norm": 0.9245651364326477, + "learning_rate": 0.0002, + "loss": 1.1917, + "step": 6020 + }, + { + "epoch": 4.6798603026775325, + "grad_norm": 0.9686311483383179, + "learning_rate": 0.0002, + "loss": 1.2969, + "step": 6030 + }, + { + "epoch": 4.6876212650368645, + "grad_norm": 1.1807392835617065, + "learning_rate": 0.0002, + "loss": 1.2702, + "step": 6040 + }, + { + "epoch": 4.695382227396197, + "grad_norm": 1.0358641147613525, + "learning_rate": 0.0002, + "loss": 1.328, + "step": 6050 + }, + { + "epoch": 4.703143189755529, + "grad_norm": 0.987332284450531, + "learning_rate": 0.0002, + "loss": 1.3281, + "step": 6060 + }, + { + "epoch": 4.710904152114862, + "grad_norm": 1.0526494979858398, + "learning_rate": 0.0002, + "loss": 1.2514, + "step": 6070 + }, + { + "epoch": 4.718665114474195, + "grad_norm": 1.0276758670806885, + "learning_rate": 0.0002, + "loss": 1.2246, + "step": 6080 + }, + { + "epoch": 4.726426076833527, + "grad_norm": 0.9904406666755676, + "learning_rate": 0.0002, + "loss": 1.3367, + "step": 6090 + }, + { + "epoch": 4.73418703919286, + "grad_norm": 1.0084882974624634, + "learning_rate": 0.0002, + "loss": 1.2797, + "step": 6100 + }, + { + "epoch": 4.741948001552192, + "grad_norm": 0.8646450638771057, + "learning_rate": 0.0002, + "loss": 1.2656, + "step": 6110 + }, + { + "epoch": 4.749708963911525, + "grad_norm": 0.9233377575874329, + "learning_rate": 0.0002, + "loss": 1.3063, + "step": 6120 + }, + { + "epoch": 4.757469926270858, + "grad_norm": 0.9675140976905823, + "learning_rate": 0.0002, + "loss": 1.2642, + "step": 6130 + }, + { + "epoch": 4.765230888630191, + "grad_norm": 0.9639796018600464, + "learning_rate": 0.0002, + "loss": 1.3367, + "step": 6140 + }, + { + "epoch": 4.772991850989523, + "grad_norm": 0.925199568271637, + "learning_rate": 0.0002, + "loss": 1.276, + "step": 6150 + }, + { + "epoch": 4.7807528133488555, + "grad_norm": 1.050901174545288, + "learning_rate": 0.0002, + "loss": 1.2441, + "step": 6160 + }, + { + "epoch": 4.7885137757081875, + "grad_norm": 0.8920623660087585, + "learning_rate": 0.0002, + "loss": 1.301, + "step": 6170 + }, + { + "epoch": 4.79627473806752, + "grad_norm": 0.8964757919311523, + "learning_rate": 0.0002, + "loss": 1.263, + "step": 6180 + }, + { + "epoch": 4.804035700426853, + "grad_norm": 1.0839070081710815, + "learning_rate": 0.0002, + "loss": 1.2787, + "step": 6190 + }, + { + "epoch": 4.811796662786185, + "grad_norm": 0.8809942007064819, + "learning_rate": 0.0002, + "loss": 1.2664, + "step": 6200 + }, + { + "epoch": 4.819557625145518, + "grad_norm": 1.0216195583343506, + "learning_rate": 0.0002, + "loss": 1.321, + "step": 6210 + }, + { + "epoch": 4.827318587504851, + "grad_norm": 0.892005980014801, + "learning_rate": 0.0002, + "loss": 1.3033, + "step": 6220 + }, + { + "epoch": 4.835079549864183, + "grad_norm": 0.9957166910171509, + "learning_rate": 0.0002, + "loss": 1.2602, + "step": 6230 + }, + { + "epoch": 4.842840512223516, + "grad_norm": 0.9720533490180969, + "learning_rate": 0.0002, + "loss": 1.3562, + "step": 6240 + }, + { + "epoch": 4.850601474582849, + "grad_norm": 0.9336182475090027, + "learning_rate": 0.0002, + "loss": 1.2651, + "step": 6250 + }, + { + "epoch": 4.858362436942181, + "grad_norm": 1.2611457109451294, + "learning_rate": 0.0002, + "loss": 1.3136, + "step": 6260 + }, + { + "epoch": 4.866123399301514, + "grad_norm": 0.8927203416824341, + "learning_rate": 0.0002, + "loss": 1.2234, + "step": 6270 + }, + { + "epoch": 4.873884361660846, + "grad_norm": 0.9706710577011108, + "learning_rate": 0.0002, + "loss": 1.3463, + "step": 6280 + }, + { + "epoch": 4.8816453240201785, + "grad_norm": 1.1461690664291382, + "learning_rate": 0.0002, + "loss": 1.3209, + "step": 6290 + }, + { + "epoch": 4.889406286379511, + "grad_norm": 0.9930381178855896, + "learning_rate": 0.0002, + "loss": 1.2566, + "step": 6300 + }, + { + "epoch": 4.897167248738843, + "grad_norm": 0.91451096534729, + "learning_rate": 0.0002, + "loss": 1.2568, + "step": 6310 + }, + { + "epoch": 4.904928211098176, + "grad_norm": 1.0319571495056152, + "learning_rate": 0.0002, + "loss": 1.2836, + "step": 6320 + }, + { + "epoch": 4.912689173457509, + "grad_norm": 0.990140438079834, + "learning_rate": 0.0002, + "loss": 1.2908, + "step": 6330 + }, + { + "epoch": 4.920450135816841, + "grad_norm": 1.2466117143630981, + "learning_rate": 0.0002, + "loss": 1.3299, + "step": 6340 + }, + { + "epoch": 4.928211098176174, + "grad_norm": 1.0316979885101318, + "learning_rate": 0.0002, + "loss": 1.2659, + "step": 6350 + }, + { + "epoch": 4.935972060535507, + "grad_norm": 1.0643759965896606, + "learning_rate": 0.0002, + "loss": 1.3292, + "step": 6360 + }, + { + "epoch": 4.943733022894839, + "grad_norm": 0.9703279733657837, + "learning_rate": 0.0002, + "loss": 1.2559, + "step": 6370 + }, + { + "epoch": 4.951493985254172, + "grad_norm": 0.9767927527427673, + "learning_rate": 0.0002, + "loss": 1.2155, + "step": 6380 + }, + { + "epoch": 4.959254947613504, + "grad_norm": 0.960854172706604, + "learning_rate": 0.0002, + "loss": 1.2437, + "step": 6390 + }, + { + "epoch": 4.967015909972837, + "grad_norm": 0.9922910332679749, + "learning_rate": 0.0002, + "loss": 1.3314, + "step": 6400 + }, + { + "epoch": 4.9747768723321695, + "grad_norm": 0.956470787525177, + "learning_rate": 0.0002, + "loss": 1.3018, + "step": 6410 + }, + { + "epoch": 4.9825378346915015, + "grad_norm": 0.9637242555618286, + "learning_rate": 0.0002, + "loss": 1.2794, + "step": 6420 + }, + { + "epoch": 4.990298797050834, + "grad_norm": 1.0855202674865723, + "learning_rate": 0.0002, + "loss": 1.3236, + "step": 6430 + }, + { + "epoch": 4.998059759410167, + "grad_norm": 0.9655316472053528, + "learning_rate": 0.0002, + "loss": 1.3015, + "step": 6440 + }, + { + "epoch": 4.9996119518820334, + "eval_loss": 2.0410802364349365, + "eval_runtime": 113.04, + "eval_samples_per_second": 4.485, + "eval_steps_per_second": 0.566, + "step": 6442 + }, + { + "epoch": 5.005820721769499, + "grad_norm": 1.1676199436187744, + "learning_rate": 0.0002, + "loss": 1.0846, + "step": 6450 + }, + { + "epoch": 5.013581684128832, + "grad_norm": 1.4317965507507324, + "learning_rate": 0.0002, + "loss": 1.041, + "step": 6460 + }, + { + "epoch": 5.021342646488165, + "grad_norm": 1.460443377494812, + "learning_rate": 0.0002, + "loss": 0.9546, + "step": 6470 + }, + { + "epoch": 5.029103608847497, + "grad_norm": 1.2299214601516724, + "learning_rate": 0.0002, + "loss": 1.0014, + "step": 6480 + }, + { + "epoch": 5.03686457120683, + "grad_norm": 1.3125724792480469, + "learning_rate": 0.0002, + "loss": 1.0397, + "step": 6490 + }, + { + "epoch": 5.044625533566162, + "grad_norm": 1.1252319812774658, + "learning_rate": 0.0002, + "loss": 1.0134, + "step": 6500 + }, + { + "epoch": 5.052386495925495, + "grad_norm": 0.9970866441726685, + "learning_rate": 0.0002, + "loss": 0.976, + "step": 6510 + }, + { + "epoch": 5.060147458284828, + "grad_norm": 1.229069709777832, + "learning_rate": 0.0002, + "loss": 0.9731, + "step": 6520 + }, + { + "epoch": 5.06790842064416, + "grad_norm": 1.2430938482284546, + "learning_rate": 0.0002, + "loss": 1.0498, + "step": 6530 + }, + { + "epoch": 5.0756693830034925, + "grad_norm": 1.0522737503051758, + "learning_rate": 0.0002, + "loss": 1.0236, + "step": 6540 + }, + { + "epoch": 5.083430345362825, + "grad_norm": 1.108890175819397, + "learning_rate": 0.0002, + "loss": 1.0221, + "step": 6550 + }, + { + "epoch": 5.091191307722157, + "grad_norm": 1.156912922859192, + "learning_rate": 0.0002, + "loss": 1.0177, + "step": 6560 + }, + { + "epoch": 5.09895227008149, + "grad_norm": 1.405895709991455, + "learning_rate": 0.0002, + "loss": 1.0415, + "step": 6570 + }, + { + "epoch": 5.106713232440823, + "grad_norm": 1.2005155086517334, + "learning_rate": 0.0002, + "loss": 0.9811, + "step": 6580 + }, + { + "epoch": 5.114474194800155, + "grad_norm": 1.181443452835083, + "learning_rate": 0.0002, + "loss": 0.9862, + "step": 6590 + }, + { + "epoch": 5.122235157159488, + "grad_norm": 2.3444771766662598, + "learning_rate": 0.0002, + "loss": 1.0291, + "step": 6600 + }, + { + "epoch": 5.12999611951882, + "grad_norm": 1.216988444328308, + "learning_rate": 0.0002, + "loss": 1.0455, + "step": 6610 + }, + { + "epoch": 5.137757081878153, + "grad_norm": 1.369553565979004, + "learning_rate": 0.0002, + "loss": 1.0549, + "step": 6620 + }, + { + "epoch": 5.145518044237486, + "grad_norm": 1.177964687347412, + "learning_rate": 0.0002, + "loss": 1.0056, + "step": 6630 + }, + { + "epoch": 5.153279006596818, + "grad_norm": 1.1397041082382202, + "learning_rate": 0.0002, + "loss": 1.1025, + "step": 6640 + }, + { + "epoch": 5.161039968956151, + "grad_norm": 1.3976861238479614, + "learning_rate": 0.0002, + "loss": 1.0437, + "step": 6650 + }, + { + "epoch": 5.1688009313154835, + "grad_norm": 1.4824495315551758, + "learning_rate": 0.0002, + "loss": 1.0454, + "step": 6660 + }, + { + "epoch": 5.1765618936748155, + "grad_norm": 1.2653018236160278, + "learning_rate": 0.0002, + "loss": 1.0356, + "step": 6670 + }, + { + "epoch": 5.184322856034148, + "grad_norm": 1.3106069564819336, + "learning_rate": 0.0002, + "loss": 0.9971, + "step": 6680 + }, + { + "epoch": 5.192083818393481, + "grad_norm": 1.3140279054641724, + "learning_rate": 0.0002, + "loss": 1.0561, + "step": 6690 + }, + { + "epoch": 5.199844780752813, + "grad_norm": 1.3900256156921387, + "learning_rate": 0.0002, + "loss": 1.0618, + "step": 6700 + }, + { + "epoch": 5.207605743112146, + "grad_norm": 1.3191124200820923, + "learning_rate": 0.0002, + "loss": 1.0285, + "step": 6710 + }, + { + "epoch": 5.215366705471478, + "grad_norm": 1.176107406616211, + "learning_rate": 0.0002, + "loss": 0.9921, + "step": 6720 + }, + { + "epoch": 5.223127667830811, + "grad_norm": 1.2364883422851562, + "learning_rate": 0.0002, + "loss": 1.064, + "step": 6730 + }, + { + "epoch": 5.230888630190144, + "grad_norm": 1.343022108078003, + "learning_rate": 0.0002, + "loss": 0.9599, + "step": 6740 + }, + { + "epoch": 5.238649592549476, + "grad_norm": 1.2826898097991943, + "learning_rate": 0.0002, + "loss": 1.0342, + "step": 6750 + }, + { + "epoch": 5.246410554908809, + "grad_norm": 1.500257134437561, + "learning_rate": 0.0002, + "loss": 1.0703, + "step": 6760 + }, + { + "epoch": 5.254171517268142, + "grad_norm": 1.2605743408203125, + "learning_rate": 0.0002, + "loss": 1.0114, + "step": 6770 + }, + { + "epoch": 5.261932479627474, + "grad_norm": 1.2355525493621826, + "learning_rate": 0.0002, + "loss": 1.0825, + "step": 6780 + }, + { + "epoch": 5.2696934419868064, + "grad_norm": 1.2845789194107056, + "learning_rate": 0.0002, + "loss": 1.0436, + "step": 6790 + }, + { + "epoch": 5.277454404346139, + "grad_norm": 1.3696625232696533, + "learning_rate": 0.0002, + "loss": 0.989, + "step": 6800 + }, + { + "epoch": 5.285215366705471, + "grad_norm": 1.4051260948181152, + "learning_rate": 0.0002, + "loss": 1.0991, + "step": 6810 + }, + { + "epoch": 5.292976329064804, + "grad_norm": 1.266725778579712, + "learning_rate": 0.0002, + "loss": 1.0987, + "step": 6820 + }, + { + "epoch": 5.300737291424136, + "grad_norm": 1.3475236892700195, + "learning_rate": 0.0002, + "loss": 1.0489, + "step": 6830 + }, + { + "epoch": 5.308498253783469, + "grad_norm": 1.54409921169281, + "learning_rate": 0.0002, + "loss": 1.0264, + "step": 6840 + }, + { + "epoch": 5.316259216142802, + "grad_norm": 1.2391985654830933, + "learning_rate": 0.0002, + "loss": 1.033, + "step": 6850 + }, + { + "epoch": 5.324020178502134, + "grad_norm": 1.2435699701309204, + "learning_rate": 0.0002, + "loss": 1.1058, + "step": 6860 + }, + { + "epoch": 5.331781140861467, + "grad_norm": 1.8803037405014038, + "learning_rate": 0.0002, + "loss": 1.0179, + "step": 6870 + }, + { + "epoch": 5.3395421032208, + "grad_norm": 1.4195542335510254, + "learning_rate": 0.0002, + "loss": 0.997, + "step": 6880 + }, + { + "epoch": 5.347303065580132, + "grad_norm": 1.1853394508361816, + "learning_rate": 0.0002, + "loss": 1.0273, + "step": 6890 + }, + { + "epoch": 5.355064027939465, + "grad_norm": 1.4016530513763428, + "learning_rate": 0.0002, + "loss": 1.0668, + "step": 6900 + }, + { + "epoch": 5.3628249902987974, + "grad_norm": 1.294339895248413, + "learning_rate": 0.0002, + "loss": 1.1099, + "step": 6910 + }, + { + "epoch": 5.370585952658129, + "grad_norm": 1.2952708005905151, + "learning_rate": 0.0002, + "loss": 1.0724, + "step": 6920 + }, + { + "epoch": 5.378346915017462, + "grad_norm": 1.1361510753631592, + "learning_rate": 0.0002, + "loss": 1.0098, + "step": 6930 + }, + { + "epoch": 5.386107877376794, + "grad_norm": 1.125805377960205, + "learning_rate": 0.0002, + "loss": 1.0796, + "step": 6940 + }, + { + "epoch": 5.393868839736127, + "grad_norm": 1.1453300714492798, + "learning_rate": 0.0002, + "loss": 1.122, + "step": 6950 + }, + { + "epoch": 5.40162980209546, + "grad_norm": 1.4542768001556396, + "learning_rate": 0.0002, + "loss": 1.0977, + "step": 6960 + }, + { + "epoch": 5.409390764454792, + "grad_norm": 1.2360988855361938, + "learning_rate": 0.0002, + "loss": 1.0825, + "step": 6970 + }, + { + "epoch": 5.417151726814125, + "grad_norm": 1.2182754278182983, + "learning_rate": 0.0002, + "loss": 1.0631, + "step": 6980 + }, + { + "epoch": 5.424912689173458, + "grad_norm": 1.2018693685531616, + "learning_rate": 0.0002, + "loss": 1.0471, + "step": 6990 + }, + { + "epoch": 5.43267365153279, + "grad_norm": 1.346124291419983, + "learning_rate": 0.0002, + "loss": 1.108, + "step": 7000 + }, + { + "epoch": 5.440434613892123, + "grad_norm": 1.2534189224243164, + "learning_rate": 0.0002, + "loss": 1.0534, + "step": 7010 + }, + { + "epoch": 5.448195576251456, + "grad_norm": 1.2033339738845825, + "learning_rate": 0.0002, + "loss": 1.0696, + "step": 7020 + }, + { + "epoch": 5.4559565386107876, + "grad_norm": 1.2788134813308716, + "learning_rate": 0.0002, + "loss": 1.0714, + "step": 7030 + }, + { + "epoch": 5.46371750097012, + "grad_norm": 1.2751542329788208, + "learning_rate": 0.0002, + "loss": 1.1274, + "step": 7040 + }, + { + "epoch": 5.471478463329452, + "grad_norm": 1.3237019777297974, + "learning_rate": 0.0002, + "loss": 1.0767, + "step": 7050 + }, + { + "epoch": 5.479239425688785, + "grad_norm": 1.4932852983474731, + "learning_rate": 0.0002, + "loss": 1.1081, + "step": 7060 + }, + { + "epoch": 5.487000388048118, + "grad_norm": 1.4003876447677612, + "learning_rate": 0.0002, + "loss": 1.0197, + "step": 7070 + }, + { + "epoch": 5.49476135040745, + "grad_norm": 1.404799461364746, + "learning_rate": 0.0002, + "loss": 1.0662, + "step": 7080 + }, + { + "epoch": 5.502522312766783, + "grad_norm": 1.4486982822418213, + "learning_rate": 0.0002, + "loss": 1.0354, + "step": 7090 + }, + { + "epoch": 5.510283275126116, + "grad_norm": 1.1713480949401855, + "learning_rate": 0.0002, + "loss": 1.0645, + "step": 7100 + }, + { + "epoch": 5.518044237485448, + "grad_norm": 1.4062601327896118, + "learning_rate": 0.0002, + "loss": 1.006, + "step": 7110 + }, + { + "epoch": 5.525805199844781, + "grad_norm": 1.211629867553711, + "learning_rate": 0.0002, + "loss": 1.0459, + "step": 7120 + }, + { + "epoch": 5.533566162204114, + "grad_norm": 1.2523176670074463, + "learning_rate": 0.0002, + "loss": 1.102, + "step": 7130 + }, + { + "epoch": 5.541327124563446, + "grad_norm": 1.4467198848724365, + "learning_rate": 0.0002, + "loss": 1.1132, + "step": 7140 + }, + { + "epoch": 5.5490880869227786, + "grad_norm": 1.5961614847183228, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 7150 + }, + { + "epoch": 5.5568490492821105, + "grad_norm": 1.320656418800354, + "learning_rate": 0.0002, + "loss": 1.0859, + "step": 7160 + }, + { + "epoch": 5.564610011641443, + "grad_norm": 1.2423332929611206, + "learning_rate": 0.0002, + "loss": 1.109, + "step": 7170 + }, + { + "epoch": 5.572370974000776, + "grad_norm": 1.2919669151306152, + "learning_rate": 0.0002, + "loss": 1.0046, + "step": 7180 + }, + { + "epoch": 5.580131936360108, + "grad_norm": 1.1678385734558105, + "learning_rate": 0.0002, + "loss": 1.046, + "step": 7190 + }, + { + "epoch": 5.587892898719441, + "grad_norm": 1.4250764846801758, + "learning_rate": 0.0002, + "loss": 1.1011, + "step": 7200 + }, + { + "epoch": 5.595653861078774, + "grad_norm": 1.5308716297149658, + "learning_rate": 0.0002, + "loss": 1.1254, + "step": 7210 + }, + { + "epoch": 5.603414823438106, + "grad_norm": 1.2678815126419067, + "learning_rate": 0.0002, + "loss": 1.121, + "step": 7220 + }, + { + "epoch": 5.611175785797439, + "grad_norm": 1.127856969833374, + "learning_rate": 0.0002, + "loss": 1.0846, + "step": 7230 + }, + { + "epoch": 5.618936748156772, + "grad_norm": 1.3832560777664185, + "learning_rate": 0.0002, + "loss": 1.0647, + "step": 7240 + }, + { + "epoch": 5.626697710516104, + "grad_norm": 1.3226919174194336, + "learning_rate": 0.0002, + "loss": 1.0658, + "step": 7250 + }, + { + "epoch": 5.634458672875437, + "grad_norm": 1.3418006896972656, + "learning_rate": 0.0002, + "loss": 1.1175, + "step": 7260 + }, + { + "epoch": 5.642219635234769, + "grad_norm": 1.2625300884246826, + "learning_rate": 0.0002, + "loss": 1.0956, + "step": 7270 + }, + { + "epoch": 5.6499805975941015, + "grad_norm": 1.1579464673995972, + "learning_rate": 0.0002, + "loss": 1.067, + "step": 7280 + }, + { + "epoch": 5.657741559953434, + "grad_norm": 1.4998650550842285, + "learning_rate": 0.0002, + "loss": 1.0447, + "step": 7290 + }, + { + "epoch": 5.665502522312766, + "grad_norm": 1.2670758962631226, + "learning_rate": 0.0002, + "loss": 1.1256, + "step": 7300 + }, + { + "epoch": 5.673263484672099, + "grad_norm": 1.2959760427474976, + "learning_rate": 0.0002, + "loss": 1.1267, + "step": 7310 + }, + { + "epoch": 5.681024447031432, + "grad_norm": 1.2460671663284302, + "learning_rate": 0.0002, + "loss": 1.1387, + "step": 7320 + }, + { + "epoch": 5.688785409390764, + "grad_norm": 1.1313989162445068, + "learning_rate": 0.0002, + "loss": 1.0756, + "step": 7330 + }, + { + "epoch": 5.696546371750097, + "grad_norm": 1.282527208328247, + "learning_rate": 0.0002, + "loss": 1.0618, + "step": 7340 + }, + { + "epoch": 5.70430733410943, + "grad_norm": 1.3380206823349, + "learning_rate": 0.0002, + "loss": 1.1315, + "step": 7350 + }, + { + "epoch": 5.712068296468762, + "grad_norm": 1.1648279428482056, + "learning_rate": 0.0002, + "loss": 1.0949, + "step": 7360 + }, + { + "epoch": 5.719829258828095, + "grad_norm": 1.3059816360473633, + "learning_rate": 0.0002, + "loss": 1.1705, + "step": 7370 + }, + { + "epoch": 5.727590221187427, + "grad_norm": 1.1905046701431274, + "learning_rate": 0.0002, + "loss": 1.1496, + "step": 7380 + }, + { + "epoch": 5.73535118354676, + "grad_norm": 1.4089630842208862, + "learning_rate": 0.0002, + "loss": 1.1356, + "step": 7390 + }, + { + "epoch": 5.7431121459060925, + "grad_norm": 1.256721019744873, + "learning_rate": 0.0002, + "loss": 1.1349, + "step": 7400 + }, + { + "epoch": 5.7508731082654245, + "grad_norm": 1.1915162801742554, + "learning_rate": 0.0002, + "loss": 1.0682, + "step": 7410 + }, + { + "epoch": 5.758634070624757, + "grad_norm": 1.1935480833053589, + "learning_rate": 0.0002, + "loss": 1.1257, + "step": 7420 + }, + { + "epoch": 5.76639503298409, + "grad_norm": 1.1761008501052856, + "learning_rate": 0.0002, + "loss": 1.1348, + "step": 7430 + }, + { + "epoch": 5.774155995343422, + "grad_norm": 1.2540549039840698, + "learning_rate": 0.0002, + "loss": 1.0837, + "step": 7440 + }, + { + "epoch": 5.781916957702755, + "grad_norm": 1.5295120477676392, + "learning_rate": 0.0002, + "loss": 1.1527, + "step": 7450 + }, + { + "epoch": 5.789677920062088, + "grad_norm": 1.1081160306930542, + "learning_rate": 0.0002, + "loss": 1.1146, + "step": 7460 + }, + { + "epoch": 5.79743888242142, + "grad_norm": 1.4381253719329834, + "learning_rate": 0.0002, + "loss": 1.1304, + "step": 7470 + }, + { + "epoch": 5.805199844780753, + "grad_norm": 1.3079341650009155, + "learning_rate": 0.0002, + "loss": 1.0684, + "step": 7480 + }, + { + "epoch": 5.812960807140085, + "grad_norm": 1.1372792720794678, + "learning_rate": 0.0002, + "loss": 1.0544, + "step": 7490 + }, + { + "epoch": 5.820721769499418, + "grad_norm": 1.3221744298934937, + "learning_rate": 0.0002, + "loss": 1.1622, + "step": 7500 + }, + { + "epoch": 5.828482731858751, + "grad_norm": 1.3436939716339111, + "learning_rate": 0.0002, + "loss": 1.1515, + "step": 7510 + }, + { + "epoch": 5.8362436942180835, + "grad_norm": 1.3916879892349243, + "learning_rate": 0.0002, + "loss": 1.1154, + "step": 7520 + }, + { + "epoch": 5.8440046565774155, + "grad_norm": 1.2463704347610474, + "learning_rate": 0.0002, + "loss": 1.0816, + "step": 7530 + }, + { + "epoch": 5.851765618936748, + "grad_norm": 1.097051739692688, + "learning_rate": 0.0002, + "loss": 1.0745, + "step": 7540 + }, + { + "epoch": 5.85952658129608, + "grad_norm": 1.1554739475250244, + "learning_rate": 0.0002, + "loss": 1.1454, + "step": 7550 + }, + { + "epoch": 5.867287543655413, + "grad_norm": 1.2384694814682007, + "learning_rate": 0.0002, + "loss": 1.0953, + "step": 7560 + }, + { + "epoch": 5.875048506014746, + "grad_norm": 1.142815351486206, + "learning_rate": 0.0002, + "loss": 1.1734, + "step": 7570 + }, + { + "epoch": 5.882809468374078, + "grad_norm": 1.3637062311172485, + "learning_rate": 0.0002, + "loss": 1.162, + "step": 7580 + }, + { + "epoch": 5.890570430733411, + "grad_norm": 1.2449073791503906, + "learning_rate": 0.0002, + "loss": 1.0781, + "step": 7590 + }, + { + "epoch": 5.898331393092743, + "grad_norm": 1.358058214187622, + "learning_rate": 0.0002, + "loss": 1.1191, + "step": 7600 + }, + { + "epoch": 5.906092355452076, + "grad_norm": 1.264655351638794, + "learning_rate": 0.0002, + "loss": 1.0779, + "step": 7610 + }, + { + "epoch": 5.913853317811409, + "grad_norm": 1.3186019659042358, + "learning_rate": 0.0002, + "loss": 1.1538, + "step": 7620 + }, + { + "epoch": 5.921614280170742, + "grad_norm": 1.4111460447311401, + "learning_rate": 0.0002, + "loss": 1.1076, + "step": 7630 + }, + { + "epoch": 5.929375242530074, + "grad_norm": 1.1078972816467285, + "learning_rate": 0.0002, + "loss": 1.1765, + "step": 7640 + }, + { + "epoch": 5.9371362048894065, + "grad_norm": 1.2742213010787964, + "learning_rate": 0.0002, + "loss": 1.1305, + "step": 7650 + }, + { + "epoch": 5.9448971672487385, + "grad_norm": 1.3412781953811646, + "learning_rate": 0.0002, + "loss": 1.144, + "step": 7660 + }, + { + "epoch": 5.952658129608071, + "grad_norm": 1.123005986213684, + "learning_rate": 0.0002, + "loss": 1.1642, + "step": 7670 + }, + { + "epoch": 5.960419091967404, + "grad_norm": 1.2203444242477417, + "learning_rate": 0.0002, + "loss": 1.0732, + "step": 7680 + }, + { + "epoch": 5.968180054326736, + "grad_norm": 1.341011643409729, + "learning_rate": 0.0002, + "loss": 1.158, + "step": 7690 + }, + { + "epoch": 5.975941016686069, + "grad_norm": 1.2689454555511475, + "learning_rate": 0.0002, + "loss": 1.1144, + "step": 7700 + }, + { + "epoch": 5.983701979045401, + "grad_norm": 1.1518112421035767, + "learning_rate": 0.0002, + "loss": 1.2051, + "step": 7710 + }, + { + "epoch": 5.991462941404734, + "grad_norm": 1.3698320388793945, + "learning_rate": 0.0002, + "loss": 1.1868, + "step": 7720 + }, + { + "epoch": 5.999223903764067, + "grad_norm": 1.2812788486480713, + "learning_rate": 0.0002, + "loss": 1.0651, + "step": 7730 + }, + { + "epoch": 6.0, + "eval_loss": 2.252762794494629, + "eval_runtime": 114.8471, + "eval_samples_per_second": 4.415, + "eval_steps_per_second": 0.557, + "step": 7731 + }, + { + "epoch": 6.006984866123399, + "grad_norm": 1.8642009496688843, + "learning_rate": 0.0002, + "loss": 0.8629, + "step": 7740 + }, + { + "epoch": 6.014745828482732, + "grad_norm": 1.7081232070922852, + "learning_rate": 0.0002, + "loss": 0.8435, + "step": 7750 + }, + { + "epoch": 6.022506790842065, + "grad_norm": 1.6233899593353271, + "learning_rate": 0.0002, + "loss": 0.7729, + "step": 7760 + }, + { + "epoch": 6.030267753201397, + "grad_norm": 1.5111888647079468, + "learning_rate": 0.0002, + "loss": 0.7907, + "step": 7770 + }, + { + "epoch": 6.0380287155607295, + "grad_norm": 1.5278418064117432, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 7780 + }, + { + "epoch": 6.045789677920062, + "grad_norm": 1.5932185649871826, + "learning_rate": 0.0002, + "loss": 0.835, + "step": 7790 + }, + { + "epoch": 6.053550640279394, + "grad_norm": 1.5990597009658813, + "learning_rate": 0.0002, + "loss": 0.7682, + "step": 7800 + }, + { + "epoch": 6.061311602638727, + "grad_norm": 1.7498669624328613, + "learning_rate": 0.0002, + "loss": 0.8559, + "step": 7810 + }, + { + "epoch": 6.06907256499806, + "grad_norm": 1.6105555295944214, + "learning_rate": 0.0002, + "loss": 0.8069, + "step": 7820 + }, + { + "epoch": 6.076833527357392, + "grad_norm": 1.5214293003082275, + "learning_rate": 0.0002, + "loss": 0.8473, + "step": 7830 + }, + { + "epoch": 6.084594489716725, + "grad_norm": 1.6586973667144775, + "learning_rate": 0.0002, + "loss": 0.8328, + "step": 7840 + }, + { + "epoch": 6.092355452076057, + "grad_norm": 1.467391848564148, + "learning_rate": 0.0002, + "loss": 0.8415, + "step": 7850 + }, + { + "epoch": 6.10011641443539, + "grad_norm": 1.537361741065979, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 7860 + }, + { + "epoch": 6.107877376794723, + "grad_norm": 1.621764898300171, + "learning_rate": 0.0002, + "loss": 0.8011, + "step": 7870 + }, + { + "epoch": 6.115638339154055, + "grad_norm": 1.583751916885376, + "learning_rate": 0.0002, + "loss": 0.8556, + "step": 7880 + }, + { + "epoch": 6.123399301513388, + "grad_norm": 1.6199619770050049, + "learning_rate": 0.0002, + "loss": 0.8829, + "step": 7890 + }, + { + "epoch": 6.1311602638727205, + "grad_norm": 1.6163095235824585, + "learning_rate": 0.0002, + "loss": 0.8226, + "step": 7900 + }, + { + "epoch": 6.1389212262320525, + "grad_norm": 1.6120976209640503, + "learning_rate": 0.0002, + "loss": 0.8203, + "step": 7910 + }, + { + "epoch": 6.146682188591385, + "grad_norm": 1.7886850833892822, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 7920 + }, + { + "epoch": 6.154443150950718, + "grad_norm": 1.408303141593933, + "learning_rate": 0.0002, + "loss": 0.7808, + "step": 7930 + }, + { + "epoch": 6.16220411331005, + "grad_norm": 1.6048113107681274, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 7940 + }, + { + "epoch": 6.169965075669383, + "grad_norm": 1.424306869506836, + "learning_rate": 0.0002, + "loss": 0.8705, + "step": 7950 + }, + { + "epoch": 6.177726038028716, + "grad_norm": 1.4453672170639038, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 7960 + }, + { + "epoch": 6.185487000388048, + "grad_norm": 1.3157061338424683, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 7970 + }, + { + "epoch": 6.193247962747381, + "grad_norm": 1.330541729927063, + "learning_rate": 0.0002, + "loss": 0.891, + "step": 7980 + }, + { + "epoch": 6.201008925106713, + "grad_norm": 1.6306229829788208, + "learning_rate": 0.0002, + "loss": 0.8599, + "step": 7990 + }, + { + "epoch": 6.208769887466046, + "grad_norm": 1.6332136392593384, + "learning_rate": 0.0002, + "loss": 0.9069, + "step": 8000 + }, + { + "epoch": 6.216530849825379, + "grad_norm": 1.708613395690918, + "learning_rate": 0.0002, + "loss": 0.83, + "step": 8010 + }, + { + "epoch": 6.224291812184711, + "grad_norm": 1.6637346744537354, + "learning_rate": 0.0002, + "loss": 0.8509, + "step": 8020 + }, + { + "epoch": 6.2320527745440435, + "grad_norm": 1.5675315856933594, + "learning_rate": 0.0002, + "loss": 0.84, + "step": 8030 + }, + { + "epoch": 6.239813736903376, + "grad_norm": 1.5826327800750732, + "learning_rate": 0.0002, + "loss": 0.8491, + "step": 8040 + }, + { + "epoch": 6.247574699262708, + "grad_norm": 1.7382984161376953, + "learning_rate": 0.0002, + "loss": 0.8374, + "step": 8050 + }, + { + "epoch": 6.255335661622041, + "grad_norm": 1.5272295475006104, + "learning_rate": 0.0002, + "loss": 0.8795, + "step": 8060 + }, + { + "epoch": 6.263096623981374, + "grad_norm": 1.8195022344589233, + "learning_rate": 0.0002, + "loss": 0.8745, + "step": 8070 + }, + { + "epoch": 6.270857586340706, + "grad_norm": 1.679901361465454, + "learning_rate": 0.0002, + "loss": 0.8743, + "step": 8080 + }, + { + "epoch": 6.278618548700039, + "grad_norm": 1.4921348094940186, + "learning_rate": 0.0002, + "loss": 0.9006, + "step": 8090 + }, + { + "epoch": 6.286379511059371, + "grad_norm": 1.4627857208251953, + "learning_rate": 0.0002, + "loss": 0.899, + "step": 8100 + }, + { + "epoch": 6.294140473418704, + "grad_norm": 1.3528631925582886, + "learning_rate": 0.0002, + "loss": 0.8944, + "step": 8110 + }, + { + "epoch": 6.301901435778037, + "grad_norm": 1.6863102912902832, + "learning_rate": 0.0002, + "loss": 0.9355, + "step": 8120 + }, + { + "epoch": 6.309662398137369, + "grad_norm": 1.6178052425384521, + "learning_rate": 0.0002, + "loss": 0.8764, + "step": 8130 + }, + { + "epoch": 6.317423360496702, + "grad_norm": 1.7626280784606934, + "learning_rate": 0.0002, + "loss": 0.9182, + "step": 8140 + }, + { + "epoch": 6.3251843228560345, + "grad_norm": 1.7188845872879028, + "learning_rate": 0.0002, + "loss": 0.8886, + "step": 8150 + }, + { + "epoch": 6.3329452852153665, + "grad_norm": 1.5777133703231812, + "learning_rate": 0.0002, + "loss": 0.895, + "step": 8160 + }, + { + "epoch": 6.340706247574699, + "grad_norm": 1.7653207778930664, + "learning_rate": 0.0002, + "loss": 0.9247, + "step": 8170 + }, + { + "epoch": 6.348467209934032, + "grad_norm": 1.6861237287521362, + "learning_rate": 0.0002, + "loss": 0.8003, + "step": 8180 + }, + { + "epoch": 6.356228172293364, + "grad_norm": 1.6318124532699585, + "learning_rate": 0.0002, + "loss": 0.884, + "step": 8190 + }, + { + "epoch": 6.363989134652697, + "grad_norm": 1.6192939281463623, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 8200 + }, + { + "epoch": 6.371750097012029, + "grad_norm": 1.7641773223876953, + "learning_rate": 0.0002, + "loss": 0.8939, + "step": 8210 + }, + { + "epoch": 6.379511059371362, + "grad_norm": 1.6470493078231812, + "learning_rate": 0.0002, + "loss": 0.8582, + "step": 8220 + }, + { + "epoch": 6.387272021730695, + "grad_norm": 1.5898468494415283, + "learning_rate": 0.0002, + "loss": 0.9351, + "step": 8230 + }, + { + "epoch": 6.395032984090027, + "grad_norm": 1.8025981187820435, + "learning_rate": 0.0002, + "loss": 0.9658, + "step": 8240 + }, + { + "epoch": 6.40279394644936, + "grad_norm": 1.7035106420516968, + "learning_rate": 0.0002, + "loss": 0.8953, + "step": 8250 + }, + { + "epoch": 6.410554908808693, + "grad_norm": 1.5968799591064453, + "learning_rate": 0.0002, + "loss": 0.9193, + "step": 8260 + }, + { + "epoch": 6.418315871168025, + "grad_norm": 1.7492800951004028, + "learning_rate": 0.0002, + "loss": 0.929, + "step": 8270 + }, + { + "epoch": 6.4260768335273575, + "grad_norm": 1.6914138793945312, + "learning_rate": 0.0002, + "loss": 0.9297, + "step": 8280 + }, + { + "epoch": 6.43383779588669, + "grad_norm": 1.5761380195617676, + "learning_rate": 0.0002, + "loss": 0.8878, + "step": 8290 + }, + { + "epoch": 6.441598758246022, + "grad_norm": 1.5164411067962646, + "learning_rate": 0.0002, + "loss": 0.8761, + "step": 8300 + }, + { + "epoch": 6.449359720605355, + "grad_norm": 1.6600215435028076, + "learning_rate": 0.0002, + "loss": 0.88, + "step": 8310 + }, + { + "epoch": 6.457120682964687, + "grad_norm": 1.2477679252624512, + "learning_rate": 0.0002, + "loss": 0.9113, + "step": 8320 + }, + { + "epoch": 6.46488164532402, + "grad_norm": 1.3698599338531494, + "learning_rate": 0.0002, + "loss": 0.8822, + "step": 8330 + }, + { + "epoch": 6.472642607683353, + "grad_norm": 1.4847341775894165, + "learning_rate": 0.0002, + "loss": 0.9295, + "step": 8340 + }, + { + "epoch": 6.480403570042685, + "grad_norm": 1.4713412523269653, + "learning_rate": 0.0002, + "loss": 0.9243, + "step": 8350 + }, + { + "epoch": 6.488164532402018, + "grad_norm": 1.334523320198059, + "learning_rate": 0.0002, + "loss": 0.9102, + "step": 8360 + }, + { + "epoch": 6.495925494761351, + "grad_norm": 2.0054359436035156, + "learning_rate": 0.0002, + "loss": 0.8563, + "step": 8370 + }, + { + "epoch": 6.503686457120683, + "grad_norm": 1.560014247894287, + "learning_rate": 0.0002, + "loss": 0.9759, + "step": 8380 + }, + { + "epoch": 6.511447419480016, + "grad_norm": 1.518526554107666, + "learning_rate": 0.0002, + "loss": 0.8542, + "step": 8390 + }, + { + "epoch": 6.5192083818393485, + "grad_norm": 1.3841272592544556, + "learning_rate": 0.0002, + "loss": 0.937, + "step": 8400 + }, + { + "epoch": 6.5269693441986805, + "grad_norm": 1.5191527605056763, + "learning_rate": 0.0002, + "loss": 0.9576, + "step": 8410 + }, + { + "epoch": 6.534730306558013, + "grad_norm": 1.5275579690933228, + "learning_rate": 0.0002, + "loss": 0.8899, + "step": 8420 + }, + { + "epoch": 6.542491268917345, + "grad_norm": 1.621590256690979, + "learning_rate": 0.0002, + "loss": 0.9291, + "step": 8430 + }, + { + "epoch": 6.550252231276678, + "grad_norm": 1.7939082384109497, + "learning_rate": 0.0002, + "loss": 0.9011, + "step": 8440 + }, + { + "epoch": 6.558013193636011, + "grad_norm": 1.4542964696884155, + "learning_rate": 0.0002, + "loss": 0.8896, + "step": 8450 + }, + { + "epoch": 6.565774155995343, + "grad_norm": 1.5458455085754395, + "learning_rate": 0.0002, + "loss": 0.9393, + "step": 8460 + }, + { + "epoch": 6.573535118354676, + "grad_norm": 1.550359845161438, + "learning_rate": 0.0002, + "loss": 0.9028, + "step": 8470 + }, + { + "epoch": 6.581296080714009, + "grad_norm": 1.527757167816162, + "learning_rate": 0.0002, + "loss": 0.9271, + "step": 8480 + }, + { + "epoch": 6.589057043073341, + "grad_norm": 1.4683486223220825, + "learning_rate": 0.0002, + "loss": 0.966, + "step": 8490 + }, + { + "epoch": 6.596818005432674, + "grad_norm": 1.5057084560394287, + "learning_rate": 0.0002, + "loss": 0.9079, + "step": 8500 + }, + { + "epoch": 6.604578967792007, + "grad_norm": 1.648289442062378, + "learning_rate": 0.0002, + "loss": 0.9235, + "step": 8510 + }, + { + "epoch": 6.612339930151339, + "grad_norm": 1.578914761543274, + "learning_rate": 0.0002, + "loss": 0.9113, + "step": 8520 + }, + { + "epoch": 6.6201008925106715, + "grad_norm": 1.5064080953598022, + "learning_rate": 0.0002, + "loss": 0.8894, + "step": 8530 + }, + { + "epoch": 6.6278618548700035, + "grad_norm": 1.5717744827270508, + "learning_rate": 0.0002, + "loss": 0.8981, + "step": 8540 + }, + { + "epoch": 6.635622817229336, + "grad_norm": 1.7954767942428589, + "learning_rate": 0.0002, + "loss": 0.887, + "step": 8550 + }, + { + "epoch": 6.643383779588669, + "grad_norm": 1.6172343492507935, + "learning_rate": 0.0002, + "loss": 0.927, + "step": 8560 + }, + { + "epoch": 6.651144741948001, + "grad_norm": 1.6627886295318604, + "learning_rate": 0.0002, + "loss": 0.9384, + "step": 8570 + }, + { + "epoch": 6.658905704307334, + "grad_norm": 1.5264919996261597, + "learning_rate": 0.0002, + "loss": 0.959, + "step": 8580 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.609248161315918, + "learning_rate": 0.0002, + "loss": 0.9103, + "step": 8590 + }, + { + "epoch": 6.674427629025999, + "grad_norm": 1.5474581718444824, + "learning_rate": 0.0002, + "loss": 0.9395, + "step": 8600 + }, + { + "epoch": 6.682188591385332, + "grad_norm": 1.6294898986816406, + "learning_rate": 0.0002, + "loss": 0.9018, + "step": 8610 + }, + { + "epoch": 6.689949553744665, + "grad_norm": 1.612615942955017, + "learning_rate": 0.0002, + "loss": 0.9323, + "step": 8620 + }, + { + "epoch": 6.697710516103997, + "grad_norm": 1.741325855255127, + "learning_rate": 0.0002, + "loss": 0.9218, + "step": 8630 + }, + { + "epoch": 6.70547147846333, + "grad_norm": 1.5089004039764404, + "learning_rate": 0.0002, + "loss": 1.0475, + "step": 8640 + }, + { + "epoch": 6.713232440822662, + "grad_norm": 1.4725582599639893, + "learning_rate": 0.0002, + "loss": 1.0009, + "step": 8650 + }, + { + "epoch": 6.7209934031819945, + "grad_norm": 1.6992095708847046, + "learning_rate": 0.0002, + "loss": 0.9818, + "step": 8660 + }, + { + "epoch": 6.728754365541327, + "grad_norm": 1.5938470363616943, + "learning_rate": 0.0002, + "loss": 0.9229, + "step": 8670 + }, + { + "epoch": 6.736515327900659, + "grad_norm": 1.58723783493042, + "learning_rate": 0.0002, + "loss": 0.9411, + "step": 8680 + }, + { + "epoch": 6.744276290259992, + "grad_norm": 1.514389991760254, + "learning_rate": 0.0002, + "loss": 0.9738, + "step": 8690 + }, + { + "epoch": 6.752037252619325, + "grad_norm": 1.6799157857894897, + "learning_rate": 0.0002, + "loss": 0.9283, + "step": 8700 + }, + { + "epoch": 6.759798214978657, + "grad_norm": 1.5436359643936157, + "learning_rate": 0.0002, + "loss": 0.9138, + "step": 8710 + }, + { + "epoch": 6.76755917733799, + "grad_norm": 1.477137565612793, + "learning_rate": 0.0002, + "loss": 0.9678, + "step": 8720 + }, + { + "epoch": 6.775320139697323, + "grad_norm": 1.7383503913879395, + "learning_rate": 0.0002, + "loss": 1.0044, + "step": 8730 + }, + { + "epoch": 6.783081102056655, + "grad_norm": 1.8000324964523315, + "learning_rate": 0.0002, + "loss": 0.9492, + "step": 8740 + }, + { + "epoch": 6.790842064415988, + "grad_norm": 1.3099453449249268, + "learning_rate": 0.0002, + "loss": 0.8943, + "step": 8750 + }, + { + "epoch": 6.79860302677532, + "grad_norm": 1.8775172233581543, + "learning_rate": 0.0002, + "loss": 0.9709, + "step": 8760 + }, + { + "epoch": 6.806363989134653, + "grad_norm": 1.5832085609436035, + "learning_rate": 0.0002, + "loss": 0.9356, + "step": 8770 + }, + { + "epoch": 6.8141249514939854, + "grad_norm": 1.4903252124786377, + "learning_rate": 0.0002, + "loss": 0.9397, + "step": 8780 + }, + { + "epoch": 6.821885913853317, + "grad_norm": 1.6360470056533813, + "learning_rate": 0.0002, + "loss": 0.9602, + "step": 8790 + }, + { + "epoch": 6.82964687621265, + "grad_norm": 1.5457707643508911, + "learning_rate": 0.0002, + "loss": 0.957, + "step": 8800 + }, + { + "epoch": 6.837407838571983, + "grad_norm": 1.5449066162109375, + "learning_rate": 0.0002, + "loss": 0.943, + "step": 8810 + }, + { + "epoch": 6.845168800931315, + "grad_norm": 1.618337631225586, + "learning_rate": 0.0002, + "loss": 1.0007, + "step": 8820 + }, + { + "epoch": 6.852929763290648, + "grad_norm": 1.38296639919281, + "learning_rate": 0.0002, + "loss": 0.9314, + "step": 8830 + }, + { + "epoch": 6.860690725649981, + "grad_norm": 1.6427991390228271, + "learning_rate": 0.0002, + "loss": 0.9349, + "step": 8840 + }, + { + "epoch": 6.868451688009313, + "grad_norm": 1.4980270862579346, + "learning_rate": 0.0002, + "loss": 1.0194, + "step": 8850 + }, + { + "epoch": 6.876212650368646, + "grad_norm": 1.3800020217895508, + "learning_rate": 0.0002, + "loss": 0.9541, + "step": 8860 + }, + { + "epoch": 6.883973612727978, + "grad_norm": 1.5971838235855103, + "learning_rate": 0.0002, + "loss": 1.0102, + "step": 8870 + }, + { + "epoch": 6.891734575087311, + "grad_norm": 1.4429489374160767, + "learning_rate": 0.0002, + "loss": 1.0105, + "step": 8880 + }, + { + "epoch": 6.899495537446644, + "grad_norm": 1.4959166049957275, + "learning_rate": 0.0002, + "loss": 0.9143, + "step": 8890 + }, + { + "epoch": 6.907256499805976, + "grad_norm": 1.5776222944259644, + "learning_rate": 0.0002, + "loss": 0.9403, + "step": 8900 + }, + { + "epoch": 6.915017462165308, + "grad_norm": 1.510412573814392, + "learning_rate": 0.0002, + "loss": 0.9256, + "step": 8910 + }, + { + "epoch": 6.922778424524641, + "grad_norm": 1.7216295003890991, + "learning_rate": 0.0002, + "loss": 1.0095, + "step": 8920 + }, + { + "epoch": 6.930539386883973, + "grad_norm": 1.830762505531311, + "learning_rate": 0.0002, + "loss": 0.9464, + "step": 8930 + }, + { + "epoch": 6.938300349243306, + "grad_norm": 1.3472434282302856, + "learning_rate": 0.0002, + "loss": 0.9704, + "step": 8940 + }, + { + "epoch": 6.946061311602639, + "grad_norm": 1.5748040676116943, + "learning_rate": 0.0002, + "loss": 0.9718, + "step": 8950 + }, + { + "epoch": 6.953822273961971, + "grad_norm": 1.5317506790161133, + "learning_rate": 0.0002, + "loss": 0.9891, + "step": 8960 + }, + { + "epoch": 6.961583236321304, + "grad_norm": 1.5565721988677979, + "learning_rate": 0.0002, + "loss": 0.9513, + "step": 8970 + }, + { + "epoch": 6.969344198680636, + "grad_norm": 1.5288970470428467, + "learning_rate": 0.0002, + "loss": 0.9118, + "step": 8980 + }, + { + "epoch": 6.977105161039969, + "grad_norm": 1.562624454498291, + "learning_rate": 0.0002, + "loss": 0.9789, + "step": 8990 + }, + { + "epoch": 6.984866123399302, + "grad_norm": 1.3777633905410767, + "learning_rate": 0.0002, + "loss": 0.9929, + "step": 9000 + }, + { + "epoch": 6.992627085758635, + "grad_norm": 1.5868972539901733, + "learning_rate": 0.0002, + "loss": 0.9713, + "step": 9010 + }, + { + "epoch": 6.9996119518820334, + "eval_loss": 2.4372169971466064, + "eval_runtime": 113.8966, + "eval_samples_per_second": 4.451, + "eval_steps_per_second": 0.562, + "step": 9019 + }, + { + "epoch": 7.0003880481179666, + "grad_norm": 1.3035310506820679, + "learning_rate": 0.0002, + "loss": 0.9911, + "step": 9020 + }, + { + "epoch": 7.008149010477299, + "grad_norm": 1.5640218257904053, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 9030 + }, + { + "epoch": 7.015909972836631, + "grad_norm": 1.9529098272323608, + "learning_rate": 0.0002, + "loss": 0.6507, + "step": 9040 + }, + { + "epoch": 7.023670935195964, + "grad_norm": 1.6257457733154297, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 9050 + }, + { + "epoch": 7.031431897555297, + "grad_norm": 1.8028602600097656, + "learning_rate": 0.0002, + "loss": 0.6752, + "step": 9060 + }, + { + "epoch": 7.039192859914629, + "grad_norm": 1.4882043600082397, + "learning_rate": 0.0002, + "loss": 0.66, + "step": 9070 + }, + { + "epoch": 7.046953822273962, + "grad_norm": 2.10062837600708, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 9080 + }, + { + "epoch": 7.054714784633295, + "grad_norm": 1.6754050254821777, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 9090 + }, + { + "epoch": 7.062475746992627, + "grad_norm": 1.9425220489501953, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 9100 + }, + { + "epoch": 7.07023670935196, + "grad_norm": 1.9451842308044434, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 9110 + }, + { + "epoch": 7.077997671711292, + "grad_norm": 2.203806161880493, + "learning_rate": 0.0002, + "loss": 0.6872, + "step": 9120 + }, + { + "epoch": 7.085758634070625, + "grad_norm": 1.7248806953430176, + "learning_rate": 0.0002, + "loss": 0.6768, + "step": 9130 + }, + { + "epoch": 7.0935195964299576, + "grad_norm": 1.5713436603546143, + "learning_rate": 0.0002, + "loss": 0.6911, + "step": 9140 + }, + { + "epoch": 7.1012805587892895, + "grad_norm": 2.0308637619018555, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 9150 + }, + { + "epoch": 7.109041521148622, + "grad_norm": 1.7522761821746826, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 9160 + }, + { + "epoch": 7.116802483507955, + "grad_norm": 1.9685192108154297, + "learning_rate": 0.0002, + "loss": 0.669, + "step": 9170 + }, + { + "epoch": 7.124563445867287, + "grad_norm": 2.069120407104492, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 9180 + }, + { + "epoch": 7.13232440822662, + "grad_norm": 1.7211129665374756, + "learning_rate": 0.0002, + "loss": 0.6553, + "step": 9190 + }, + { + "epoch": 7.140085370585953, + "grad_norm": 1.7535923719406128, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 9200 + }, + { + "epoch": 7.147846332945285, + "grad_norm": 1.7181583642959595, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 9210 + }, + { + "epoch": 7.155607295304618, + "grad_norm": 1.7778624296188354, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 9220 + }, + { + "epoch": 7.16336825766395, + "grad_norm": 1.8051576614379883, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 9230 + }, + { + "epoch": 7.171129220023283, + "grad_norm": 1.9704501628875732, + "learning_rate": 0.0002, + "loss": 0.6858, + "step": 9240 + }, + { + "epoch": 7.178890182382616, + "grad_norm": 1.8750483989715576, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 9250 + }, + { + "epoch": 7.186651144741948, + "grad_norm": 1.790107011795044, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 9260 + }, + { + "epoch": 7.1944121071012805, + "grad_norm": 1.9681477546691895, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 9270 + }, + { + "epoch": 7.202173069460613, + "grad_norm": 1.7811331748962402, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 9280 + }, + { + "epoch": 7.209934031819945, + "grad_norm": 1.9861894845962524, + "learning_rate": 0.0002, + "loss": 0.6959, + "step": 9290 + }, + { + "epoch": 7.217694994179278, + "grad_norm": 1.8615444898605347, + "learning_rate": 0.0002, + "loss": 0.6463, + "step": 9300 + }, + { + "epoch": 7.225455956538611, + "grad_norm": 1.899372935295105, + "learning_rate": 0.0002, + "loss": 0.7071, + "step": 9310 + }, + { + "epoch": 7.233216918897943, + "grad_norm": 1.8131160736083984, + "learning_rate": 0.0002, + "loss": 0.6883, + "step": 9320 + }, + { + "epoch": 7.240977881257276, + "grad_norm": 1.5020049810409546, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 9330 + }, + { + "epoch": 7.248738843616608, + "grad_norm": 1.7210577726364136, + "learning_rate": 0.0002, + "loss": 0.7197, + "step": 9340 + }, + { + "epoch": 7.256499805975941, + "grad_norm": 2.003021001815796, + "learning_rate": 0.0002, + "loss": 0.6769, + "step": 9350 + }, + { + "epoch": 7.264260768335274, + "grad_norm": 1.6632959842681885, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 9360 + }, + { + "epoch": 7.272021730694606, + "grad_norm": 1.7717185020446777, + "learning_rate": 0.0002, + "loss": 0.7072, + "step": 9370 + }, + { + "epoch": 7.279782693053939, + "grad_norm": 1.8554900884628296, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 9380 + }, + { + "epoch": 7.2875436554132715, + "grad_norm": 1.889708399772644, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 9390 + }, + { + "epoch": 7.2953046177726035, + "grad_norm": 1.8426263332366943, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 9400 + }, + { + "epoch": 7.303065580131936, + "grad_norm": 1.9665130376815796, + "learning_rate": 0.0002, + "loss": 0.6854, + "step": 9410 + }, + { + "epoch": 7.310826542491269, + "grad_norm": 1.9337282180786133, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 9420 + }, + { + "epoch": 7.318587504850601, + "grad_norm": 1.8582539558410645, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 9430 + }, + { + "epoch": 7.326348467209934, + "grad_norm": 1.8865947723388672, + "learning_rate": 0.0002, + "loss": 0.7366, + "step": 9440 + }, + { + "epoch": 7.334109429569267, + "grad_norm": 1.8144744634628296, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 9450 + }, + { + "epoch": 7.341870391928599, + "grad_norm": 1.6930001974105835, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 9460 + }, + { + "epoch": 7.349631354287932, + "grad_norm": 1.7389107942581177, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 9470 + }, + { + "epoch": 7.357392316647264, + "grad_norm": 1.6860785484313965, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 9480 + }, + { + "epoch": 7.365153279006597, + "grad_norm": 2.2142202854156494, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 9490 + }, + { + "epoch": 7.37291424136593, + "grad_norm": 1.9988185167312622, + "learning_rate": 0.0002, + "loss": 0.7936, + "step": 9500 + }, + { + "epoch": 7.380675203725262, + "grad_norm": 1.7517266273498535, + "learning_rate": 0.0002, + "loss": 0.7348, + "step": 9510 + }, + { + "epoch": 7.3884361660845945, + "grad_norm": 1.7426788806915283, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 9520 + }, + { + "epoch": 7.396197128443927, + "grad_norm": 1.8157157897949219, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 9530 + }, + { + "epoch": 7.403958090803259, + "grad_norm": 1.9826514720916748, + "learning_rate": 0.0002, + "loss": 0.716, + "step": 9540 + }, + { + "epoch": 7.411719053162592, + "grad_norm": 1.9057488441467285, + "learning_rate": 0.0002, + "loss": 0.7685, + "step": 9550 + }, + { + "epoch": 7.419480015521925, + "grad_norm": 2.0701088905334473, + "learning_rate": 0.0002, + "loss": 0.7709, + "step": 9560 + }, + { + "epoch": 7.427240977881257, + "grad_norm": 1.8623783588409424, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 9570 + }, + { + "epoch": 7.43500194024059, + "grad_norm": 1.780672550201416, + "learning_rate": 0.0002, + "loss": 0.7719, + "step": 9580 + }, + { + "epoch": 7.442762902599922, + "grad_norm": 1.8437316417694092, + "learning_rate": 0.0002, + "loss": 0.7579, + "step": 9590 + }, + { + "epoch": 7.450523864959255, + "grad_norm": 1.9327329397201538, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 9600 + }, + { + "epoch": 7.458284827318588, + "grad_norm": 2.011535167694092, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 9610 + }, + { + "epoch": 7.46604578967792, + "grad_norm": 1.7557756900787354, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 9620 + }, + { + "epoch": 7.473806752037253, + "grad_norm": 1.8508951663970947, + "learning_rate": 0.0002, + "loss": 0.7461, + "step": 9630 + }, + { + "epoch": 7.4815677143965855, + "grad_norm": 1.8236663341522217, + "learning_rate": 0.0002, + "loss": 0.7425, + "step": 9640 + }, + { + "epoch": 7.4893286767559175, + "grad_norm": 1.9308991432189941, + "learning_rate": 0.0002, + "loss": 0.7107, + "step": 9650 + }, + { + "epoch": 7.49708963911525, + "grad_norm": 1.8095420598983765, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 9660 + }, + { + "epoch": 7.504850601474583, + "grad_norm": 1.8216804265975952, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 9670 + }, + { + "epoch": 7.512611563833915, + "grad_norm": 1.8275913000106812, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 9680 + }, + { + "epoch": 7.520372526193248, + "grad_norm": 1.8217864036560059, + "learning_rate": 0.0002, + "loss": 0.7178, + "step": 9690 + }, + { + "epoch": 7.52813348855258, + "grad_norm": 1.461728572845459, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 9700 + }, + { + "epoch": 7.535894450911913, + "grad_norm": 1.6569337844848633, + "learning_rate": 0.0002, + "loss": 0.7597, + "step": 9710 + }, + { + "epoch": 7.543655413271246, + "grad_norm": 2.332101821899414, + "learning_rate": 0.0002, + "loss": 0.8117, + "step": 9720 + }, + { + "epoch": 7.551416375630578, + "grad_norm": 1.760307788848877, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 9730 + }, + { + "epoch": 7.559177337989911, + "grad_norm": 2.2455198764801025, + "learning_rate": 0.0002, + "loss": 0.7213, + "step": 9740 + }, + { + "epoch": 7.566938300349244, + "grad_norm": 1.819676399230957, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 9750 + }, + { + "epoch": 7.574699262708576, + "grad_norm": 1.579603910446167, + "learning_rate": 0.0002, + "loss": 0.754, + "step": 9760 + }, + { + "epoch": 7.5824602250679085, + "grad_norm": 1.9687446355819702, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 9770 + }, + { + "epoch": 7.590221187427241, + "grad_norm": 1.859029769897461, + "learning_rate": 0.0002, + "loss": 0.813, + "step": 9780 + }, + { + "epoch": 7.597982149786573, + "grad_norm": 1.652137279510498, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 9790 + }, + { + "epoch": 7.605743112145906, + "grad_norm": 1.6381458044052124, + "learning_rate": 0.0002, + "loss": 0.7388, + "step": 9800 + }, + { + "epoch": 7.613504074505238, + "grad_norm": 1.8706856966018677, + "learning_rate": 0.0002, + "loss": 0.8265, + "step": 9810 + }, + { + "epoch": 7.621265036864571, + "grad_norm": 1.8709853887557983, + "learning_rate": 0.0002, + "loss": 0.7223, + "step": 9820 + }, + { + "epoch": 7.629025999223904, + "grad_norm": 2.1183695793151855, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 9830 + }, + { + "epoch": 7.636786961583236, + "grad_norm": 1.6770579814910889, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 9840 + }, + { + "epoch": 7.644547923942569, + "grad_norm": 2.0952677726745605, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 9850 + }, + { + "epoch": 7.652308886301902, + "grad_norm": 1.7378171682357788, + "learning_rate": 0.0002, + "loss": 0.8483, + "step": 9860 + }, + { + "epoch": 7.660069848661234, + "grad_norm": 2.0860157012939453, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 9870 + }, + { + "epoch": 7.667830811020567, + "grad_norm": 1.6986967325210571, + "learning_rate": 0.0002, + "loss": 0.769, + "step": 9880 + }, + { + "epoch": 7.6755917733798995, + "grad_norm": 1.666702151298523, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 9890 + }, + { + "epoch": 7.6833527357392315, + "grad_norm": 2.446931838989258, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 9900 + }, + { + "epoch": 7.691113698098564, + "grad_norm": 1.6984577178955078, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 9910 + }, + { + "epoch": 7.698874660457896, + "grad_norm": 2.0732316970825195, + "learning_rate": 0.0002, + "loss": 0.7822, + "step": 9920 + }, + { + "epoch": 7.706635622817229, + "grad_norm": 1.9884073734283447, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 9930 + }, + { + "epoch": 7.714396585176562, + "grad_norm": 1.8669427633285522, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 9940 + }, + { + "epoch": 7.722157547535894, + "grad_norm": 1.6163996458053589, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 9950 + }, + { + "epoch": 7.729918509895227, + "grad_norm": 2.4492127895355225, + "learning_rate": 0.0002, + "loss": 0.8069, + "step": 9960 + }, + { + "epoch": 7.73767947225456, + "grad_norm": 1.5625537633895874, + "learning_rate": 0.0002, + "loss": 0.7924, + "step": 9970 + }, + { + "epoch": 7.745440434613892, + "grad_norm": 1.8205251693725586, + "learning_rate": 0.0002, + "loss": 0.8273, + "step": 9980 + }, + { + "epoch": 7.753201396973225, + "grad_norm": 1.9902361631393433, + "learning_rate": 0.0002, + "loss": 0.7892, + "step": 9990 + }, + { + "epoch": 7.760962359332558, + "grad_norm": 1.8294041156768799, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 10000 + }, + { + "epoch": 7.76872332169189, + "grad_norm": 1.8938281536102295, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 10010 + }, + { + "epoch": 7.7764842840512225, + "grad_norm": 1.9367564916610718, + "learning_rate": 0.0002, + "loss": 0.8304, + "step": 10020 + }, + { + "epoch": 7.7842452464105545, + "grad_norm": 1.9130750894546509, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 10030 + }, + { + "epoch": 7.792006208769887, + "grad_norm": 1.931505799293518, + "learning_rate": 0.0002, + "loss": 0.8359, + "step": 10040 + }, + { + "epoch": 7.79976717112922, + "grad_norm": 2.107954740524292, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 10050 + }, + { + "epoch": 7.807528133488552, + "grad_norm": 1.8736399412155151, + "learning_rate": 0.0002, + "loss": 0.7922, + "step": 10060 + }, + { + "epoch": 7.815289095847885, + "grad_norm": 1.6943566799163818, + "learning_rate": 0.0002, + "loss": 0.8143, + "step": 10070 + }, + { + "epoch": 7.823050058207218, + "grad_norm": 2.365346908569336, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 10080 + }, + { + "epoch": 7.83081102056655, + "grad_norm": 2.0044665336608887, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 10090 + }, + { + "epoch": 7.838571982925883, + "grad_norm": 1.7680124044418335, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 10100 + }, + { + "epoch": 7.846332945285216, + "grad_norm": 1.8494547605514526, + "learning_rate": 0.0002, + "loss": 0.7729, + "step": 10110 + }, + { + "epoch": 7.854093907644548, + "grad_norm": 1.822107195854187, + "learning_rate": 0.0002, + "loss": 0.7909, + "step": 10120 + }, + { + "epoch": 7.861854870003881, + "grad_norm": 1.8191527128219604, + "learning_rate": 0.0002, + "loss": 0.8149, + "step": 10130 + }, + { + "epoch": 7.869615832363213, + "grad_norm": 1.8011466264724731, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 10140 + }, + { + "epoch": 7.8773767947225455, + "grad_norm": 1.7087100744247437, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 10150 + }, + { + "epoch": 7.885137757081878, + "grad_norm": 1.7698721885681152, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 10160 + }, + { + "epoch": 7.89289871944121, + "grad_norm": 1.9578531980514526, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 10170 + }, + { + "epoch": 7.900659681800543, + "grad_norm": 1.7660179138183594, + "learning_rate": 0.0002, + "loss": 0.8497, + "step": 10180 + }, + { + "epoch": 7.908420644159876, + "grad_norm": 2.0122673511505127, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 10190 + }, + { + "epoch": 7.916181606519208, + "grad_norm": 1.737443208694458, + "learning_rate": 0.0002, + "loss": 0.7607, + "step": 10200 + }, + { + "epoch": 7.923942568878541, + "grad_norm": 1.6381052732467651, + "learning_rate": 0.0002, + "loss": 0.8365, + "step": 10210 + }, + { + "epoch": 7.931703531237874, + "grad_norm": 1.8845038414001465, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 10220 + }, + { + "epoch": 7.939464493597206, + "grad_norm": 1.952194333076477, + "learning_rate": 0.0002, + "loss": 0.8354, + "step": 10230 + }, + { + "epoch": 7.947225455956539, + "grad_norm": 1.7254410982131958, + "learning_rate": 0.0002, + "loss": 0.8428, + "step": 10240 + }, + { + "epoch": 7.954986418315871, + "grad_norm": 2.14776873588562, + "learning_rate": 0.0002, + "loss": 0.834, + "step": 10250 + }, + { + "epoch": 7.962747380675204, + "grad_norm": 1.7655725479125977, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 10260 + }, + { + "epoch": 7.9705083430345365, + "grad_norm": 1.7337331771850586, + "learning_rate": 0.0002, + "loss": 0.8176, + "step": 10270 + }, + { + "epoch": 7.9782693053938685, + "grad_norm": 1.7742228507995605, + "learning_rate": 0.0002, + "loss": 0.8652, + "step": 10280 + }, + { + "epoch": 7.986030267753201, + "grad_norm": 1.892137050628662, + "learning_rate": 0.0002, + "loss": 0.7788, + "step": 10290 + }, + { + "epoch": 7.993791230112534, + "grad_norm": 1.8636300563812256, + "learning_rate": 0.0002, + "loss": 0.8243, + "step": 10300 + }, + { + "epoch": 7.996895615056267, + "eval_loss": 2.7464993000030518, + "eval_runtime": 114.078, + "eval_samples_per_second": 4.444, + "eval_steps_per_second": 0.561, + "step": 10304 + } + ], + "logging_steps": 10, + "max_steps": 10304, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.2925203495256064e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..8ca6e2c3ac58fa2af9f99747566f932f41a5a4d5 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-10304/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f7feb06ff53d5bf79374054a25b662309e705a2ca08dfa3b0bce7b8b4632fae +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e98db163734cc03f7a8f8b3f720d3a2befdf7453 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5208229e29fb38c73343f46a72a64976ddf01127 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a18343e5c8a561cfe90c00254345da8a62839392f1f92154472ccfeac3f0492e +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f7e289d6e4092c63f1963d86dd2f62e0ff6e5487 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67873c50dcb6182b8741740688f5c1098ab9b75ddd7a223f632ceffef2b831c2 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..c0365d362b90a2c12afa3608502a350b8bec9b5a --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85fdc4271a5e20d6f1f0dd445a31121166a83dd6ab00011bf0901a2ca8c5c7a2 +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a59812c3593eca842c322aba5a497cd945702bc --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a17393a475a6079ffa2869cfabd4d1a282c203c4adf92772c840de5a59a54b6b +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..a4a305d1de4d8f47c0252b4d7fe65a10dd8e2c22 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7eee611703c5ce5d1eee32d9cdcfe465647b8aff0c1dfb3bed7ad7dbb05060 +size 34362873 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e3d851b24912a599aea296ec87d2e80723886f86 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/trainer_state.json @@ -0,0 +1,937 @@ +{ + "best_metric": 1.8081045150756836, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288", + "epoch": 0.9996119518820333, + "eval_steps": 10, + "global_step": 1288, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007760962359332557, + "grad_norm": 1.0751162767410278, + "learning_rate": 0.0002, + "loss": 3.0855, + "step": 10 + }, + { + "epoch": 0.015521924718665115, + "grad_norm": 0.4697345793247223, + "learning_rate": 0.0002, + "loss": 2.4744, + "step": 20 + }, + { + "epoch": 0.023282887077997673, + "grad_norm": 0.5370839238166809, + "learning_rate": 0.0002, + "loss": 2.193, + "step": 30 + }, + { + "epoch": 0.03104384943733023, + "grad_norm": 0.46794816851615906, + "learning_rate": 0.0002, + "loss": 2.0599, + "step": 40 + }, + { + "epoch": 0.038804811796662786, + "grad_norm": 0.44624820351600647, + "learning_rate": 0.0002, + "loss": 1.9354, + "step": 50 + }, + { + "epoch": 0.046565774155995346, + "grad_norm": 0.3953201472759247, + "learning_rate": 0.0002, + "loss": 1.9319, + "step": 60 + }, + { + "epoch": 0.0543267365153279, + "grad_norm": 0.3935912549495697, + "learning_rate": 0.0002, + "loss": 1.9099, + "step": 70 + }, + { + "epoch": 0.06208769887466046, + "grad_norm": 0.4520699381828308, + "learning_rate": 0.0002, + "loss": 1.8795, + "step": 80 + }, + { + "epoch": 0.06984866123399301, + "grad_norm": 0.3801847994327545, + "learning_rate": 0.0002, + "loss": 1.8354, + "step": 90 + }, + { + "epoch": 0.07760962359332557, + "grad_norm": 0.4020165205001831, + "learning_rate": 0.0002, + "loss": 1.9053, + "step": 100 + }, + { + "epoch": 0.08537058595265813, + "grad_norm": 0.3860672116279602, + "learning_rate": 0.0002, + "loss": 1.8779, + "step": 110 + }, + { + "epoch": 0.09313154831199069, + "grad_norm": 0.3681113123893738, + "learning_rate": 0.0002, + "loss": 1.8731, + "step": 120 + }, + { + "epoch": 0.10089251067132324, + "grad_norm": 0.3594866991043091, + "learning_rate": 0.0002, + "loss": 1.8157, + "step": 130 + }, + { + "epoch": 0.1086534730306558, + "grad_norm": 0.3879193663597107, + "learning_rate": 0.0002, + "loss": 1.8266, + "step": 140 + }, + { + "epoch": 0.11641443538998836, + "grad_norm": 0.3270505666732788, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 150 + }, + { + "epoch": 0.12417539774932092, + "grad_norm": 0.36824458837509155, + "learning_rate": 0.0002, + "loss": 1.87, + "step": 160 + }, + { + "epoch": 0.13193636010865348, + "grad_norm": 0.383882075548172, + "learning_rate": 0.0002, + "loss": 1.8305, + "step": 170 + }, + { + "epoch": 0.13969732246798602, + "grad_norm": 0.3368665874004364, + "learning_rate": 0.0002, + "loss": 1.8584, + "step": 180 + }, + { + "epoch": 0.1474582848273186, + "grad_norm": 0.35961097478866577, + "learning_rate": 0.0002, + "loss": 1.7882, + "step": 190 + }, + { + "epoch": 0.15521924718665114, + "grad_norm": 0.3415963351726532, + "learning_rate": 0.0002, + "loss": 1.8467, + "step": 200 + }, + { + "epoch": 0.1629802095459837, + "grad_norm": 0.4100632071495056, + "learning_rate": 0.0002, + "loss": 1.8543, + "step": 210 + }, + { + "epoch": 0.17074117190531626, + "grad_norm": 0.3516307473182678, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 220 + }, + { + "epoch": 0.1785021342646488, + "grad_norm": 0.37919050455093384, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 230 + }, + { + "epoch": 0.18626309662398138, + "grad_norm": 0.33270683884620667, + "learning_rate": 0.0002, + "loss": 1.7937, + "step": 240 + }, + { + "epoch": 0.19402405898331393, + "grad_norm": 0.3348783254623413, + "learning_rate": 0.0002, + "loss": 1.7925, + "step": 250 + }, + { + "epoch": 0.20178502134264648, + "grad_norm": 0.3888475298881531, + "learning_rate": 0.0002, + "loss": 1.7774, + "step": 260 + }, + { + "epoch": 0.20954598370197905, + "grad_norm": 0.3554602861404419, + "learning_rate": 0.0002, + "loss": 1.8381, + "step": 270 + }, + { + "epoch": 0.2173069460613116, + "grad_norm": 0.33277708292007446, + "learning_rate": 0.0002, + "loss": 1.8359, + "step": 280 + }, + { + "epoch": 0.22506790842064417, + "grad_norm": 0.3281584680080414, + "learning_rate": 0.0002, + "loss": 1.7713, + "step": 290 + }, + { + "epoch": 0.23282887077997672, + "grad_norm": 0.3185969591140747, + "learning_rate": 0.0002, + "loss": 1.8181, + "step": 300 + }, + { + "epoch": 0.24058983313930926, + "grad_norm": 0.35335442423820496, + "learning_rate": 0.0002, + "loss": 1.8595, + "step": 310 + }, + { + "epoch": 0.24835079549864184, + "grad_norm": 0.3119595944881439, + "learning_rate": 0.0002, + "loss": 1.87, + "step": 320 + }, + { + "epoch": 0.2561117578579744, + "grad_norm": 0.36424458026885986, + "learning_rate": 0.0002, + "loss": 1.8357, + "step": 330 + }, + { + "epoch": 0.26387272021730696, + "grad_norm": 0.3618951141834259, + "learning_rate": 0.0002, + "loss": 1.8003, + "step": 340 + }, + { + "epoch": 0.2716336825766395, + "grad_norm": 0.312757670879364, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 350 + }, + { + "epoch": 0.27939464493597205, + "grad_norm": 0.326016366481781, + "learning_rate": 0.0002, + "loss": 1.9031, + "step": 360 + }, + { + "epoch": 0.2871556072953046, + "grad_norm": 0.34093883633613586, + "learning_rate": 0.0002, + "loss": 1.8214, + "step": 370 + }, + { + "epoch": 0.2949165696546372, + "grad_norm": 0.32325029373168945, + "learning_rate": 0.0002, + "loss": 1.7733, + "step": 380 + }, + { + "epoch": 0.30267753201396974, + "grad_norm": 0.34105437994003296, + "learning_rate": 0.0002, + "loss": 1.842, + "step": 390 + }, + { + "epoch": 0.3104384943733023, + "grad_norm": 0.32565295696258545, + "learning_rate": 0.0002, + "loss": 1.7926, + "step": 400 + }, + { + "epoch": 0.31819945673263483, + "grad_norm": 0.32742050290107727, + "learning_rate": 0.0002, + "loss": 1.8031, + "step": 410 + }, + { + "epoch": 0.3259604190919674, + "grad_norm": 0.30233046412467957, + "learning_rate": 0.0002, + "loss": 1.907, + "step": 420 + }, + { + "epoch": 0.3337213814513, + "grad_norm": 0.32419222593307495, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 430 + }, + { + "epoch": 0.3414823438106325, + "grad_norm": 0.3653007745742798, + "learning_rate": 0.0002, + "loss": 1.865, + "step": 440 + }, + { + "epoch": 0.3492433061699651, + "grad_norm": 0.31617099046707153, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 450 + }, + { + "epoch": 0.3570042685292976, + "grad_norm": 0.3305962085723877, + "learning_rate": 0.0002, + "loss": 1.7677, + "step": 460 + }, + { + "epoch": 0.36476523088863017, + "grad_norm": 0.3178933262825012, + "learning_rate": 0.0002, + "loss": 1.8155, + "step": 470 + }, + { + "epoch": 0.37252619324796277, + "grad_norm": 0.37163782119750977, + "learning_rate": 0.0002, + "loss": 1.7485, + "step": 480 + }, + { + "epoch": 0.3802871556072953, + "grad_norm": 0.469844788312912, + "learning_rate": 0.0002, + "loss": 1.8804, + "step": 490 + }, + { + "epoch": 0.38804811796662786, + "grad_norm": 0.3409338593482971, + "learning_rate": 0.0002, + "loss": 1.8343, + "step": 500 + }, + { + "epoch": 0.3958090803259604, + "grad_norm": 0.31943467259407043, + "learning_rate": 0.0002, + "loss": 1.8433, + "step": 510 + }, + { + "epoch": 0.40357004268529295, + "grad_norm": 0.32293614745140076, + "learning_rate": 0.0002, + "loss": 1.7873, + "step": 520 + }, + { + "epoch": 0.41133100504462555, + "grad_norm": 0.2994382977485657, + "learning_rate": 0.0002, + "loss": 1.8584, + "step": 530 + }, + { + "epoch": 0.4190919674039581, + "grad_norm": 0.3273141384124756, + "learning_rate": 0.0002, + "loss": 1.8153, + "step": 540 + }, + { + "epoch": 0.42685292976329064, + "grad_norm": 0.3020550012588501, + "learning_rate": 0.0002, + "loss": 1.8097, + "step": 550 + }, + { + "epoch": 0.4346138921226232, + "grad_norm": 0.30113112926483154, + "learning_rate": 0.0002, + "loss": 1.8551, + "step": 560 + }, + { + "epoch": 0.44237485448195574, + "grad_norm": 0.30274903774261475, + "learning_rate": 0.0002, + "loss": 1.8084, + "step": 570 + }, + { + "epoch": 0.45013581684128834, + "grad_norm": 0.3231128454208374, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 580 + }, + { + "epoch": 0.4578967792006209, + "grad_norm": 0.3255121409893036, + "learning_rate": 0.0002, + "loss": 1.7848, + "step": 590 + }, + { + "epoch": 0.46565774155995343, + "grad_norm": 0.30147507786750793, + "learning_rate": 0.0002, + "loss": 1.8227, + "step": 600 + }, + { + "epoch": 0.473418703919286, + "grad_norm": 0.29781386256217957, + "learning_rate": 0.0002, + "loss": 1.7572, + "step": 610 + }, + { + "epoch": 0.4811796662786185, + "grad_norm": 0.30914685130119324, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 620 + }, + { + "epoch": 0.4889406286379511, + "grad_norm": 0.3110593855381012, + "learning_rate": 0.0002, + "loss": 1.805, + "step": 630 + }, + { + "epoch": 0.49670159099728367, + "grad_norm": 0.3298132121562958, + "learning_rate": 0.0002, + "loss": 1.8228, + "step": 640 + }, + { + "epoch": 0.5044625533566163, + "grad_norm": 0.322122186422348, + "learning_rate": 0.0002, + "loss": 1.7816, + "step": 650 + }, + { + "epoch": 0.5122235157159488, + "grad_norm": 0.3504371643066406, + "learning_rate": 0.0002, + "loss": 1.8001, + "step": 660 + }, + { + "epoch": 0.5199844780752814, + "grad_norm": 0.3102182149887085, + "learning_rate": 0.0002, + "loss": 1.8682, + "step": 670 + }, + { + "epoch": 0.5277454404346139, + "grad_norm": 0.6113658547401428, + "learning_rate": 0.0002, + "loss": 1.7494, + "step": 680 + }, + { + "epoch": 0.5355064027939465, + "grad_norm": 0.31841862201690674, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 690 + }, + { + "epoch": 0.543267365153279, + "grad_norm": 0.2830526530742645, + "learning_rate": 0.0002, + "loss": 1.7587, + "step": 700 + }, + { + "epoch": 0.5510283275126115, + "grad_norm": 0.3048769533634186, + "learning_rate": 0.0002, + "loss": 1.7887, + "step": 710 + }, + { + "epoch": 0.5587892898719441, + "grad_norm": 0.2719033658504486, + "learning_rate": 0.0002, + "loss": 1.8416, + "step": 720 + }, + { + "epoch": 0.5665502522312766, + "grad_norm": 0.3176722526550293, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 730 + }, + { + "epoch": 0.5743112145906092, + "grad_norm": 0.32491734623908997, + "learning_rate": 0.0002, + "loss": 1.7127, + "step": 740 + }, + { + "epoch": 0.5820721769499418, + "grad_norm": 0.32746851444244385, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 750 + }, + { + "epoch": 0.5898331393092744, + "grad_norm": 0.3055773973464966, + "learning_rate": 0.0002, + "loss": 1.7811, + "step": 760 + }, + { + "epoch": 0.5975941016686069, + "grad_norm": 0.30671584606170654, + "learning_rate": 0.0002, + "loss": 1.8597, + "step": 770 + }, + { + "epoch": 0.6053550640279395, + "grad_norm": 0.28770264983177185, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 780 + }, + { + "epoch": 0.613116026387272, + "grad_norm": 0.2814285457134247, + "learning_rate": 0.0002, + "loss": 1.7025, + "step": 790 + }, + { + "epoch": 0.6208769887466046, + "grad_norm": 0.31554412841796875, + "learning_rate": 0.0002, + "loss": 1.819, + "step": 800 + }, + { + "epoch": 0.6286379511059371, + "grad_norm": 0.2984226942062378, + "learning_rate": 0.0002, + "loss": 1.8335, + "step": 810 + }, + { + "epoch": 0.6363989134652697, + "grad_norm": 0.2859906554222107, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 820 + }, + { + "epoch": 0.6441598758246022, + "grad_norm": 0.2887928783893585, + "learning_rate": 0.0002, + "loss": 1.7408, + "step": 830 + }, + { + "epoch": 0.6519208381839348, + "grad_norm": 0.31287339329719543, + "learning_rate": 0.0002, + "loss": 1.8071, + "step": 840 + }, + { + "epoch": 0.6596818005432674, + "grad_norm": 0.32064181566238403, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 850 + }, + { + "epoch": 0.6674427629026, + "grad_norm": 0.290981650352478, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 860 + }, + { + "epoch": 0.6752037252619325, + "grad_norm": 0.33060121536254883, + "learning_rate": 0.0002, + "loss": 1.8089, + "step": 870 + }, + { + "epoch": 0.682964687621265, + "grad_norm": 0.27032899856567383, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 880 + }, + { + "epoch": 0.6907256499805976, + "grad_norm": 0.29031234979629517, + "learning_rate": 0.0002, + "loss": 1.8423, + "step": 890 + }, + { + "epoch": 0.6984866123399301, + "grad_norm": 0.2845142185688019, + "learning_rate": 0.0002, + "loss": 1.835, + "step": 900 + }, + { + "epoch": 0.7062475746992627, + "grad_norm": 0.8638312816619873, + "learning_rate": 0.0002, + "loss": 1.77, + "step": 910 + }, + { + "epoch": 0.7140085370585952, + "grad_norm": 0.3086668848991394, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 920 + }, + { + "epoch": 0.7217694994179278, + "grad_norm": 0.2724177837371826, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 930 + }, + { + "epoch": 0.7295304617772603, + "grad_norm": 0.289559006690979, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 940 + }, + { + "epoch": 0.737291424136593, + "grad_norm": 0.3000658452510834, + "learning_rate": 0.0002, + "loss": 1.7654, + "step": 950 + }, + { + "epoch": 0.7450523864959255, + "grad_norm": 0.33544042706489563, + "learning_rate": 0.0002, + "loss": 1.7736, + "step": 960 + }, + { + "epoch": 0.7528133488552581, + "grad_norm": 0.28593236207962036, + "learning_rate": 0.0002, + "loss": 1.6979, + "step": 970 + }, + { + "epoch": 0.7605743112145906, + "grad_norm": 0.313634991645813, + "learning_rate": 0.0002, + "loss": 1.8583, + "step": 980 + }, + { + "epoch": 0.7683352735739232, + "grad_norm": 0.2949385941028595, + "learning_rate": 0.0002, + "loss": 1.7473, + "step": 990 + }, + { + "epoch": 0.7760962359332557, + "grad_norm": 0.2920108437538147, + "learning_rate": 0.0002, + "loss": 1.8689, + "step": 1000 + }, + { + "epoch": 0.7838571982925883, + "grad_norm": 0.3245100677013397, + "learning_rate": 0.0002, + "loss": 1.8401, + "step": 1010 + }, + { + "epoch": 0.7916181606519208, + "grad_norm": 0.3007619380950928, + "learning_rate": 0.0002, + "loss": 1.7109, + "step": 1020 + }, + { + "epoch": 0.7993791230112534, + "grad_norm": 0.3630852997303009, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 1030 + }, + { + "epoch": 0.8071400853705859, + "grad_norm": 0.2856379747390747, + "learning_rate": 0.0002, + "loss": 1.7655, + "step": 1040 + }, + { + "epoch": 0.8149010477299186, + "grad_norm": 0.32476478815078735, + "learning_rate": 0.0002, + "loss": 1.8371, + "step": 1050 + }, + { + "epoch": 0.8226620100892511, + "grad_norm": 0.5162565112113953, + "learning_rate": 0.0002, + "loss": 1.8039, + "step": 1060 + }, + { + "epoch": 0.8304229724485837, + "grad_norm": 0.316496342420578, + "learning_rate": 0.0002, + "loss": 1.8862, + "step": 1070 + }, + { + "epoch": 0.8381839348079162, + "grad_norm": 0.31977516412734985, + "learning_rate": 0.0002, + "loss": 1.8023, + "step": 1080 + }, + { + "epoch": 0.8459448971672487, + "grad_norm": 0.269509494304657, + "learning_rate": 0.0002, + "loss": 1.8547, + "step": 1090 + }, + { + "epoch": 0.8537058595265813, + "grad_norm": 0.31621453166007996, + "learning_rate": 0.0002, + "loss": 1.7811, + "step": 1100 + }, + { + "epoch": 0.8614668218859138, + "grad_norm": 0.2946535050868988, + "learning_rate": 0.0002, + "loss": 1.739, + "step": 1110 + }, + { + "epoch": 0.8692277842452464, + "grad_norm": 0.3088909983634949, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 1120 + }, + { + "epoch": 0.8769887466045789, + "grad_norm": 0.33033716678619385, + "learning_rate": 0.0002, + "loss": 1.8228, + "step": 1130 + }, + { + "epoch": 0.8847497089639115, + "grad_norm": 0.2954833507537842, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 1140 + }, + { + "epoch": 0.8925106713232441, + "grad_norm": 0.2950248122215271, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 1150 + }, + { + "epoch": 0.9002716336825767, + "grad_norm": 0.296661913394928, + "learning_rate": 0.0002, + "loss": 1.7068, + "step": 1160 + }, + { + "epoch": 0.9080325960419092, + "grad_norm": 0.35451310873031616, + "learning_rate": 0.0002, + "loss": 1.7967, + "step": 1170 + }, + { + "epoch": 0.9157935584012418, + "grad_norm": 0.32705947756767273, + "learning_rate": 0.0002, + "loss": 1.8202, + "step": 1180 + }, + { + "epoch": 0.9235545207605743, + "grad_norm": 0.3333960771560669, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 1190 + }, + { + "epoch": 0.9313154831199069, + "grad_norm": 0.3042232096195221, + "learning_rate": 0.0002, + "loss": 1.7801, + "step": 1200 + }, + { + "epoch": 0.9390764454792394, + "grad_norm": 0.281553715467453, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 1210 + }, + { + "epoch": 0.946837407838572, + "grad_norm": 0.3096391558647156, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 1220 + }, + { + "epoch": 0.9545983701979045, + "grad_norm": 0.2866271734237671, + "learning_rate": 0.0002, + "loss": 1.7401, + "step": 1230 + }, + { + "epoch": 0.962359332557237, + "grad_norm": 0.28394097089767456, + "learning_rate": 0.0002, + "loss": 1.7211, + "step": 1240 + }, + { + "epoch": 0.9701202949165697, + "grad_norm": 0.3249266743659973, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 1250 + }, + { + "epoch": 0.9778812572759022, + "grad_norm": 0.2896869480609894, + "learning_rate": 0.0002, + "loss": 1.7563, + "step": 1260 + }, + { + "epoch": 0.9856422196352348, + "grad_norm": 0.29224586486816406, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 1270 + }, + { + "epoch": 0.9934031819945673, + "grad_norm": 0.2820223569869995, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 1280 + }, + { + "epoch": 0.9996119518820333, + "eval_loss": 1.8081045150756836, + "eval_runtime": 102.3056, + "eval_samples_per_second": 4.956, + "eval_steps_per_second": 0.626, + "step": 1288 + } + ], + "logging_steps": 10, + "max_steps": 10304, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.618218624188416e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..8ca6e2c3ac58fa2af9f99747566f932f41a5a4d5 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f7feb06ff53d5bf79374054a25b662309e705a2ca08dfa3b0bce7b8b4632fae +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e98db163734cc03f7a8f8b3f720d3a2befdf7453 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6736da95acf74ac199158de2bcd2f45e279c6a48 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38b9d4be16ebe4fd6dec61f83710d425e98161ba60be90a89bdaeb9c11beb3ef +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..955f398d8a589f6bb4ef6b0e46582f5db08a524a --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:065e8ea19f402b886f338dbbcfa06cd1dcb161bbbae9716a0d68d69f6a74cfe2 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..210ece09eb39d86bcfd971c1e9bfbaec3f4f12d0 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:366694b55f261d0313ae3ec388be059c6630cdad34deff87df96e95222d25afb +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b73d0bd92d62d986603cc11993cc9270ad145f90 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4358b9377fa65c761c44392416e6d2355af889e9461d685fe4753ef6a2cf58e +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..a4a305d1de4d8f47c0252b4d7fe65a10dd8e2c22 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7eee611703c5ce5d1eee32d9cdcfe465647b8aff0c1dfb3bed7ad7dbb05060 +size 34362873 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..603844c8e3f1683050ba1437515d04cc5238909d --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/trainer_state.json @@ -0,0 +1,1848 @@ +{ + "best_metric": 1.8068748712539673, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 2577, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007760962359332557, + "grad_norm": 1.0751162767410278, + "learning_rate": 0.0002, + "loss": 3.0855, + "step": 10 + }, + { + "epoch": 0.015521924718665115, + "grad_norm": 0.4697345793247223, + "learning_rate": 0.0002, + "loss": 2.4744, + "step": 20 + }, + { + "epoch": 0.023282887077997673, + "grad_norm": 0.5370839238166809, + "learning_rate": 0.0002, + "loss": 2.193, + "step": 30 + }, + { + "epoch": 0.03104384943733023, + "grad_norm": 0.46794816851615906, + "learning_rate": 0.0002, + "loss": 2.0599, + "step": 40 + }, + { + "epoch": 0.038804811796662786, + "grad_norm": 0.44624820351600647, + "learning_rate": 0.0002, + "loss": 1.9354, + "step": 50 + }, + { + "epoch": 0.046565774155995346, + "grad_norm": 0.3953201472759247, + "learning_rate": 0.0002, + "loss": 1.9319, + "step": 60 + }, + { + "epoch": 0.0543267365153279, + "grad_norm": 0.3935912549495697, + "learning_rate": 0.0002, + "loss": 1.9099, + "step": 70 + }, + { + "epoch": 0.06208769887466046, + "grad_norm": 0.4520699381828308, + "learning_rate": 0.0002, + "loss": 1.8795, + "step": 80 + }, + { + "epoch": 0.06984866123399301, + "grad_norm": 0.3801847994327545, + "learning_rate": 0.0002, + "loss": 1.8354, + "step": 90 + }, + { + "epoch": 0.07760962359332557, + "grad_norm": 0.4020165205001831, + "learning_rate": 0.0002, + "loss": 1.9053, + "step": 100 + }, + { + "epoch": 0.08537058595265813, + "grad_norm": 0.3860672116279602, + "learning_rate": 0.0002, + "loss": 1.8779, + "step": 110 + }, + { + "epoch": 0.09313154831199069, + "grad_norm": 0.3681113123893738, + "learning_rate": 0.0002, + "loss": 1.8731, + "step": 120 + }, + { + "epoch": 0.10089251067132324, + "grad_norm": 0.3594866991043091, + "learning_rate": 0.0002, + "loss": 1.8157, + "step": 130 + }, + { + "epoch": 0.1086534730306558, + "grad_norm": 0.3879193663597107, + "learning_rate": 0.0002, + "loss": 1.8266, + "step": 140 + }, + { + "epoch": 0.11641443538998836, + "grad_norm": 0.3270505666732788, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 150 + }, + { + "epoch": 0.12417539774932092, + "grad_norm": 0.36824458837509155, + "learning_rate": 0.0002, + "loss": 1.87, + "step": 160 + }, + { + "epoch": 0.13193636010865348, + "grad_norm": 0.383882075548172, + "learning_rate": 0.0002, + "loss": 1.8305, + "step": 170 + }, + { + "epoch": 0.13969732246798602, + "grad_norm": 0.3368665874004364, + "learning_rate": 0.0002, + "loss": 1.8584, + "step": 180 + }, + { + "epoch": 0.1474582848273186, + "grad_norm": 0.35961097478866577, + "learning_rate": 0.0002, + "loss": 1.7882, + "step": 190 + }, + { + "epoch": 0.15521924718665114, + "grad_norm": 0.3415963351726532, + "learning_rate": 0.0002, + "loss": 1.8467, + "step": 200 + }, + { + "epoch": 0.1629802095459837, + "grad_norm": 0.4100632071495056, + "learning_rate": 0.0002, + "loss": 1.8543, + "step": 210 + }, + { + "epoch": 0.17074117190531626, + "grad_norm": 0.3516307473182678, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 220 + }, + { + "epoch": 0.1785021342646488, + "grad_norm": 0.37919050455093384, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 230 + }, + { + "epoch": 0.18626309662398138, + "grad_norm": 0.33270683884620667, + "learning_rate": 0.0002, + "loss": 1.7937, + "step": 240 + }, + { + "epoch": 0.19402405898331393, + "grad_norm": 0.3348783254623413, + "learning_rate": 0.0002, + "loss": 1.7925, + "step": 250 + }, + { + "epoch": 0.20178502134264648, + "grad_norm": 0.3888475298881531, + "learning_rate": 0.0002, + "loss": 1.7774, + "step": 260 + }, + { + "epoch": 0.20954598370197905, + "grad_norm": 0.3554602861404419, + "learning_rate": 0.0002, + "loss": 1.8381, + "step": 270 + }, + { + "epoch": 0.2173069460613116, + "grad_norm": 0.33277708292007446, + "learning_rate": 0.0002, + "loss": 1.8359, + "step": 280 + }, + { + "epoch": 0.22506790842064417, + "grad_norm": 0.3281584680080414, + "learning_rate": 0.0002, + "loss": 1.7713, + "step": 290 + }, + { + "epoch": 0.23282887077997672, + "grad_norm": 0.3185969591140747, + "learning_rate": 0.0002, + "loss": 1.8181, + "step": 300 + }, + { + "epoch": 0.24058983313930926, + "grad_norm": 0.35335442423820496, + "learning_rate": 0.0002, + "loss": 1.8595, + "step": 310 + }, + { + "epoch": 0.24835079549864184, + "grad_norm": 0.3119595944881439, + "learning_rate": 0.0002, + "loss": 1.87, + "step": 320 + }, + { + "epoch": 0.2561117578579744, + "grad_norm": 0.36424458026885986, + "learning_rate": 0.0002, + "loss": 1.8357, + "step": 330 + }, + { + "epoch": 0.26387272021730696, + "grad_norm": 0.3618951141834259, + "learning_rate": 0.0002, + "loss": 1.8003, + "step": 340 + }, + { + "epoch": 0.2716336825766395, + "grad_norm": 0.312757670879364, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 350 + }, + { + "epoch": 0.27939464493597205, + "grad_norm": 0.326016366481781, + "learning_rate": 0.0002, + "loss": 1.9031, + "step": 360 + }, + { + "epoch": 0.2871556072953046, + "grad_norm": 0.34093883633613586, + "learning_rate": 0.0002, + "loss": 1.8214, + "step": 370 + }, + { + "epoch": 0.2949165696546372, + "grad_norm": 0.32325029373168945, + "learning_rate": 0.0002, + "loss": 1.7733, + "step": 380 + }, + { + "epoch": 0.30267753201396974, + "grad_norm": 0.34105437994003296, + "learning_rate": 0.0002, + "loss": 1.842, + "step": 390 + }, + { + "epoch": 0.3104384943733023, + "grad_norm": 0.32565295696258545, + "learning_rate": 0.0002, + "loss": 1.7926, + "step": 400 + }, + { + "epoch": 0.31819945673263483, + "grad_norm": 0.32742050290107727, + "learning_rate": 0.0002, + "loss": 1.8031, + "step": 410 + }, + { + "epoch": 0.3259604190919674, + "grad_norm": 0.30233046412467957, + "learning_rate": 0.0002, + "loss": 1.907, + "step": 420 + }, + { + "epoch": 0.3337213814513, + "grad_norm": 0.32419222593307495, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 430 + }, + { + "epoch": 0.3414823438106325, + "grad_norm": 0.3653007745742798, + "learning_rate": 0.0002, + "loss": 1.865, + "step": 440 + }, + { + "epoch": 0.3492433061699651, + "grad_norm": 0.31617099046707153, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 450 + }, + { + "epoch": 0.3570042685292976, + "grad_norm": 0.3305962085723877, + "learning_rate": 0.0002, + "loss": 1.7677, + "step": 460 + }, + { + "epoch": 0.36476523088863017, + "grad_norm": 0.3178933262825012, + "learning_rate": 0.0002, + "loss": 1.8155, + "step": 470 + }, + { + "epoch": 0.37252619324796277, + "grad_norm": 0.37163782119750977, + "learning_rate": 0.0002, + "loss": 1.7485, + "step": 480 + }, + { + "epoch": 0.3802871556072953, + "grad_norm": 0.469844788312912, + "learning_rate": 0.0002, + "loss": 1.8804, + "step": 490 + }, + { + "epoch": 0.38804811796662786, + "grad_norm": 0.3409338593482971, + "learning_rate": 0.0002, + "loss": 1.8343, + "step": 500 + }, + { + "epoch": 0.3958090803259604, + "grad_norm": 0.31943467259407043, + "learning_rate": 0.0002, + "loss": 1.8433, + "step": 510 + }, + { + "epoch": 0.40357004268529295, + "grad_norm": 0.32293614745140076, + "learning_rate": 0.0002, + "loss": 1.7873, + "step": 520 + }, + { + "epoch": 0.41133100504462555, + "grad_norm": 0.2994382977485657, + "learning_rate": 0.0002, + "loss": 1.8584, + "step": 530 + }, + { + "epoch": 0.4190919674039581, + "grad_norm": 0.3273141384124756, + "learning_rate": 0.0002, + "loss": 1.8153, + "step": 540 + }, + { + "epoch": 0.42685292976329064, + "grad_norm": 0.3020550012588501, + "learning_rate": 0.0002, + "loss": 1.8097, + "step": 550 + }, + { + "epoch": 0.4346138921226232, + "grad_norm": 0.30113112926483154, + "learning_rate": 0.0002, + "loss": 1.8551, + "step": 560 + }, + { + "epoch": 0.44237485448195574, + "grad_norm": 0.30274903774261475, + "learning_rate": 0.0002, + "loss": 1.8084, + "step": 570 + }, + { + "epoch": 0.45013581684128834, + "grad_norm": 0.3231128454208374, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 580 + }, + { + "epoch": 0.4578967792006209, + "grad_norm": 0.3255121409893036, + "learning_rate": 0.0002, + "loss": 1.7848, + "step": 590 + }, + { + "epoch": 0.46565774155995343, + "grad_norm": 0.30147507786750793, + "learning_rate": 0.0002, + "loss": 1.8227, + "step": 600 + }, + { + "epoch": 0.473418703919286, + "grad_norm": 0.29781386256217957, + "learning_rate": 0.0002, + "loss": 1.7572, + "step": 610 + }, + { + "epoch": 0.4811796662786185, + "grad_norm": 0.30914685130119324, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 620 + }, + { + "epoch": 0.4889406286379511, + "grad_norm": 0.3110593855381012, + "learning_rate": 0.0002, + "loss": 1.805, + "step": 630 + }, + { + "epoch": 0.49670159099728367, + "grad_norm": 0.3298132121562958, + "learning_rate": 0.0002, + "loss": 1.8228, + "step": 640 + }, + { + "epoch": 0.5044625533566163, + "grad_norm": 0.322122186422348, + "learning_rate": 0.0002, + "loss": 1.7816, + "step": 650 + }, + { + "epoch": 0.5122235157159488, + "grad_norm": 0.3504371643066406, + "learning_rate": 0.0002, + "loss": 1.8001, + "step": 660 + }, + { + "epoch": 0.5199844780752814, + "grad_norm": 0.3102182149887085, + "learning_rate": 0.0002, + "loss": 1.8682, + "step": 670 + }, + { + "epoch": 0.5277454404346139, + "grad_norm": 0.6113658547401428, + "learning_rate": 0.0002, + "loss": 1.7494, + "step": 680 + }, + { + "epoch": 0.5355064027939465, + "grad_norm": 0.31841862201690674, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 690 + }, + { + "epoch": 0.543267365153279, + "grad_norm": 0.2830526530742645, + "learning_rate": 0.0002, + "loss": 1.7587, + "step": 700 + }, + { + "epoch": 0.5510283275126115, + "grad_norm": 0.3048769533634186, + "learning_rate": 0.0002, + "loss": 1.7887, + "step": 710 + }, + { + "epoch": 0.5587892898719441, + "grad_norm": 0.2719033658504486, + "learning_rate": 0.0002, + "loss": 1.8416, + "step": 720 + }, + { + "epoch": 0.5665502522312766, + "grad_norm": 0.3176722526550293, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 730 + }, + { + "epoch": 0.5743112145906092, + "grad_norm": 0.32491734623908997, + "learning_rate": 0.0002, + "loss": 1.7127, + "step": 740 + }, + { + "epoch": 0.5820721769499418, + "grad_norm": 0.32746851444244385, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 750 + }, + { + "epoch": 0.5898331393092744, + "grad_norm": 0.3055773973464966, + "learning_rate": 0.0002, + "loss": 1.7811, + "step": 760 + }, + { + "epoch": 0.5975941016686069, + "grad_norm": 0.30671584606170654, + "learning_rate": 0.0002, + "loss": 1.8597, + "step": 770 + }, + { + "epoch": 0.6053550640279395, + "grad_norm": 0.28770264983177185, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 780 + }, + { + "epoch": 0.613116026387272, + "grad_norm": 0.2814285457134247, + "learning_rate": 0.0002, + "loss": 1.7025, + "step": 790 + }, + { + "epoch": 0.6208769887466046, + "grad_norm": 0.31554412841796875, + "learning_rate": 0.0002, + "loss": 1.819, + "step": 800 + }, + { + "epoch": 0.6286379511059371, + "grad_norm": 0.2984226942062378, + "learning_rate": 0.0002, + "loss": 1.8335, + "step": 810 + }, + { + "epoch": 0.6363989134652697, + "grad_norm": 0.2859906554222107, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 820 + }, + { + "epoch": 0.6441598758246022, + "grad_norm": 0.2887928783893585, + "learning_rate": 0.0002, + "loss": 1.7408, + "step": 830 + }, + { + "epoch": 0.6519208381839348, + "grad_norm": 0.31287339329719543, + "learning_rate": 0.0002, + "loss": 1.8071, + "step": 840 + }, + { + "epoch": 0.6596818005432674, + "grad_norm": 0.32064181566238403, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 850 + }, + { + "epoch": 0.6674427629026, + "grad_norm": 0.290981650352478, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 860 + }, + { + "epoch": 0.6752037252619325, + "grad_norm": 0.33060121536254883, + "learning_rate": 0.0002, + "loss": 1.8089, + "step": 870 + }, + { + "epoch": 0.682964687621265, + "grad_norm": 0.27032899856567383, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 880 + }, + { + "epoch": 0.6907256499805976, + "grad_norm": 0.29031234979629517, + "learning_rate": 0.0002, + "loss": 1.8423, + "step": 890 + }, + { + "epoch": 0.6984866123399301, + "grad_norm": 0.2845142185688019, + "learning_rate": 0.0002, + "loss": 1.835, + "step": 900 + }, + { + "epoch": 0.7062475746992627, + "grad_norm": 0.8638312816619873, + "learning_rate": 0.0002, + "loss": 1.77, + "step": 910 + }, + { + "epoch": 0.7140085370585952, + "grad_norm": 0.3086668848991394, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 920 + }, + { + "epoch": 0.7217694994179278, + "grad_norm": 0.2724177837371826, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 930 + }, + { + "epoch": 0.7295304617772603, + "grad_norm": 0.289559006690979, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 940 + }, + { + "epoch": 0.737291424136593, + "grad_norm": 0.3000658452510834, + "learning_rate": 0.0002, + "loss": 1.7654, + "step": 950 + }, + { + "epoch": 0.7450523864959255, + "grad_norm": 0.33544042706489563, + "learning_rate": 0.0002, + "loss": 1.7736, + "step": 960 + }, + { + "epoch": 0.7528133488552581, + "grad_norm": 0.28593236207962036, + "learning_rate": 0.0002, + "loss": 1.6979, + "step": 970 + }, + { + "epoch": 0.7605743112145906, + "grad_norm": 0.313634991645813, + "learning_rate": 0.0002, + "loss": 1.8583, + "step": 980 + }, + { + "epoch": 0.7683352735739232, + "grad_norm": 0.2949385941028595, + "learning_rate": 0.0002, + "loss": 1.7473, + "step": 990 + }, + { + "epoch": 0.7760962359332557, + "grad_norm": 0.2920108437538147, + "learning_rate": 0.0002, + "loss": 1.8689, + "step": 1000 + }, + { + "epoch": 0.7838571982925883, + "grad_norm": 0.3245100677013397, + "learning_rate": 0.0002, + "loss": 1.8401, + "step": 1010 + }, + { + "epoch": 0.7916181606519208, + "grad_norm": 0.3007619380950928, + "learning_rate": 0.0002, + "loss": 1.7109, + "step": 1020 + }, + { + "epoch": 0.7993791230112534, + "grad_norm": 0.3630852997303009, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 1030 + }, + { + "epoch": 0.8071400853705859, + "grad_norm": 0.2856379747390747, + "learning_rate": 0.0002, + "loss": 1.7655, + "step": 1040 + }, + { + "epoch": 0.8149010477299186, + "grad_norm": 0.32476478815078735, + "learning_rate": 0.0002, + "loss": 1.8371, + "step": 1050 + }, + { + "epoch": 0.8226620100892511, + "grad_norm": 0.5162565112113953, + "learning_rate": 0.0002, + "loss": 1.8039, + "step": 1060 + }, + { + "epoch": 0.8304229724485837, + "grad_norm": 0.316496342420578, + "learning_rate": 0.0002, + "loss": 1.8862, + "step": 1070 + }, + { + "epoch": 0.8381839348079162, + "grad_norm": 0.31977516412734985, + "learning_rate": 0.0002, + "loss": 1.8023, + "step": 1080 + }, + { + "epoch": 0.8459448971672487, + "grad_norm": 0.269509494304657, + "learning_rate": 0.0002, + "loss": 1.8547, + "step": 1090 + }, + { + "epoch": 0.8537058595265813, + "grad_norm": 0.31621453166007996, + "learning_rate": 0.0002, + "loss": 1.7811, + "step": 1100 + }, + { + "epoch": 0.8614668218859138, + "grad_norm": 0.2946535050868988, + "learning_rate": 0.0002, + "loss": 1.739, + "step": 1110 + }, + { + "epoch": 0.8692277842452464, + "grad_norm": 0.3088909983634949, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 1120 + }, + { + "epoch": 0.8769887466045789, + "grad_norm": 0.33033716678619385, + "learning_rate": 0.0002, + "loss": 1.8228, + "step": 1130 + }, + { + "epoch": 0.8847497089639115, + "grad_norm": 0.2954833507537842, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 1140 + }, + { + "epoch": 0.8925106713232441, + "grad_norm": 0.2950248122215271, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 1150 + }, + { + "epoch": 0.9002716336825767, + "grad_norm": 0.296661913394928, + "learning_rate": 0.0002, + "loss": 1.7068, + "step": 1160 + }, + { + "epoch": 0.9080325960419092, + "grad_norm": 0.35451310873031616, + "learning_rate": 0.0002, + "loss": 1.7967, + "step": 1170 + }, + { + "epoch": 0.9157935584012418, + "grad_norm": 0.32705947756767273, + "learning_rate": 0.0002, + "loss": 1.8202, + "step": 1180 + }, + { + "epoch": 0.9235545207605743, + "grad_norm": 0.3333960771560669, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 1190 + }, + { + "epoch": 0.9313154831199069, + "grad_norm": 0.3042232096195221, + "learning_rate": 0.0002, + "loss": 1.7801, + "step": 1200 + }, + { + "epoch": 0.9390764454792394, + "grad_norm": 0.281553715467453, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 1210 + }, + { + "epoch": 0.946837407838572, + "grad_norm": 0.3096391558647156, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 1220 + }, + { + "epoch": 0.9545983701979045, + "grad_norm": 0.2866271734237671, + "learning_rate": 0.0002, + "loss": 1.7401, + "step": 1230 + }, + { + "epoch": 0.962359332557237, + "grad_norm": 0.28394097089767456, + "learning_rate": 0.0002, + "loss": 1.7211, + "step": 1240 + }, + { + "epoch": 0.9701202949165697, + "grad_norm": 0.3249266743659973, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 1250 + }, + { + "epoch": 0.9778812572759022, + "grad_norm": 0.2896869480609894, + "learning_rate": 0.0002, + "loss": 1.7563, + "step": 1260 + }, + { + "epoch": 0.9856422196352348, + "grad_norm": 0.29224586486816406, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 1270 + }, + { + "epoch": 0.9934031819945673, + "grad_norm": 0.2820223569869995, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 1280 + }, + { + "epoch": 0.9996119518820333, + "eval_loss": 1.8081045150756836, + "eval_runtime": 102.3056, + "eval_samples_per_second": 4.956, + "eval_steps_per_second": 0.626, + "step": 1288 + }, + { + "epoch": 1.0011641443538999, + "grad_norm": 0.3282551169395447, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 1290 + }, + { + "epoch": 1.0089251067132325, + "grad_norm": 0.30217495560646057, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 1300 + }, + { + "epoch": 1.016686069072565, + "grad_norm": 0.30801767110824585, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 1310 + }, + { + "epoch": 1.0244470314318976, + "grad_norm": 0.31816792488098145, + "learning_rate": 0.0002, + "loss": 1.7756, + "step": 1320 + }, + { + "epoch": 1.03220799379123, + "grad_norm": 0.27794334292411804, + "learning_rate": 0.0002, + "loss": 1.6986, + "step": 1330 + }, + { + "epoch": 1.0399689561505627, + "grad_norm": 0.3018926680088043, + "learning_rate": 0.0002, + "loss": 1.6931, + "step": 1340 + }, + { + "epoch": 1.0477299185098952, + "grad_norm": 0.3552975356578827, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1350 + }, + { + "epoch": 1.0554908808692278, + "grad_norm": 0.32590144872665405, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 1360 + }, + { + "epoch": 1.0632518432285603, + "grad_norm": 0.3435460925102234, + "learning_rate": 0.0002, + "loss": 1.6479, + "step": 1370 + }, + { + "epoch": 1.071012805587893, + "grad_norm": 0.35037797689437866, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 1380 + }, + { + "epoch": 1.0787737679472253, + "grad_norm": 0.31398263573646545, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 1390 + }, + { + "epoch": 1.086534730306558, + "grad_norm": 0.3134010434150696, + "learning_rate": 0.0002, + "loss": 1.6729, + "step": 1400 + }, + { + "epoch": 1.0942956926658907, + "grad_norm": 0.4599704444408417, + "learning_rate": 0.0002, + "loss": 1.751, + "step": 1410 + }, + { + "epoch": 1.102056655025223, + "grad_norm": 0.35852891206741333, + "learning_rate": 0.0002, + "loss": 1.6871, + "step": 1420 + }, + { + "epoch": 1.1098176173845558, + "grad_norm": 0.35628634691238403, + "learning_rate": 0.0002, + "loss": 1.7083, + "step": 1430 + }, + { + "epoch": 1.1175785797438882, + "grad_norm": 0.3769161105155945, + "learning_rate": 0.0002, + "loss": 1.6166, + "step": 1440 + }, + { + "epoch": 1.1253395421032208, + "grad_norm": 1.3712416887283325, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1450 + }, + { + "epoch": 1.1331005044625533, + "grad_norm": 0.38406670093536377, + "learning_rate": 0.0002, + "loss": 1.6542, + "step": 1460 + }, + { + "epoch": 1.140861466821886, + "grad_norm": 0.3402116000652313, + "learning_rate": 0.0002, + "loss": 1.7104, + "step": 1470 + }, + { + "epoch": 1.1486224291812184, + "grad_norm": 0.341189444065094, + "learning_rate": 0.0002, + "loss": 1.7074, + "step": 1480 + }, + { + "epoch": 1.156383391540551, + "grad_norm": 0.36629995703697205, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 1490 + }, + { + "epoch": 1.1641443538998835, + "grad_norm": 0.3499569296836853, + "learning_rate": 0.0002, + "loss": 1.6952, + "step": 1500 + }, + { + "epoch": 1.1719053162592161, + "grad_norm": 0.3663063943386078, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 1510 + }, + { + "epoch": 1.1796662786185488, + "grad_norm": 0.34851500391960144, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 1520 + }, + { + "epoch": 1.1874272409778812, + "grad_norm": 0.35071656107902527, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 1530 + }, + { + "epoch": 1.1951882033372139, + "grad_norm": 0.42783796787261963, + "learning_rate": 0.0002, + "loss": 1.7206, + "step": 1540 + }, + { + "epoch": 1.2029491656965463, + "grad_norm": 0.31830692291259766, + "learning_rate": 0.0002, + "loss": 1.7499, + "step": 1550 + }, + { + "epoch": 1.210710128055879, + "grad_norm": 0.3597424626350403, + "learning_rate": 0.0002, + "loss": 1.7372, + "step": 1560 + }, + { + "epoch": 1.2184710904152114, + "grad_norm": 0.35233765840530396, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 1570 + }, + { + "epoch": 1.226232052774544, + "grad_norm": 0.35942912101745605, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 1580 + }, + { + "epoch": 1.2339930151338767, + "grad_norm": 0.36159393191337585, + "learning_rate": 0.0002, + "loss": 1.6598, + "step": 1590 + }, + { + "epoch": 1.2417539774932091, + "grad_norm": 0.3328469693660736, + "learning_rate": 0.0002, + "loss": 1.6697, + "step": 1600 + }, + { + "epoch": 1.2495149398525418, + "grad_norm": 0.3089476525783539, + "learning_rate": 0.0002, + "loss": 1.7594, + "step": 1610 + }, + { + "epoch": 1.2572759022118742, + "grad_norm": 0.30947765707969666, + "learning_rate": 0.0002, + "loss": 1.6805, + "step": 1620 + }, + { + "epoch": 1.265036864571207, + "grad_norm": 0.32154011726379395, + "learning_rate": 0.0002, + "loss": 1.6899, + "step": 1630 + }, + { + "epoch": 1.2727978269305393, + "grad_norm": 0.3480297923088074, + "learning_rate": 0.0002, + "loss": 1.6621, + "step": 1640 + }, + { + "epoch": 1.280558789289872, + "grad_norm": 0.39471694827079773, + "learning_rate": 0.0002, + "loss": 1.7087, + "step": 1650 + }, + { + "epoch": 1.2883197516492044, + "grad_norm": 0.35728853940963745, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 1660 + }, + { + "epoch": 1.296080714008537, + "grad_norm": 0.35223081707954407, + "learning_rate": 0.0002, + "loss": 1.7008, + "step": 1670 + }, + { + "epoch": 1.3038416763678695, + "grad_norm": 0.3588867485523224, + "learning_rate": 0.0002, + "loss": 1.7253, + "step": 1680 + }, + { + "epoch": 1.3116026387272022, + "grad_norm": 0.3528042733669281, + "learning_rate": 0.0002, + "loss": 1.6505, + "step": 1690 + }, + { + "epoch": 1.3193636010865348, + "grad_norm": 0.35975801944732666, + "learning_rate": 0.0002, + "loss": 1.6945, + "step": 1700 + }, + { + "epoch": 1.3271245634458673, + "grad_norm": 0.36691880226135254, + "learning_rate": 0.0002, + "loss": 1.6631, + "step": 1710 + }, + { + "epoch": 1.3348855258052, + "grad_norm": 0.3787977695465088, + "learning_rate": 0.0002, + "loss": 1.7593, + "step": 1720 + }, + { + "epoch": 1.3426464881645324, + "grad_norm": 0.36614933609962463, + "learning_rate": 0.0002, + "loss": 1.7697, + "step": 1730 + }, + { + "epoch": 1.350407450523865, + "grad_norm": 0.3484745919704437, + "learning_rate": 0.0002, + "loss": 1.6487, + "step": 1740 + }, + { + "epoch": 1.3581684128831975, + "grad_norm": 0.36905673146247864, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 1750 + }, + { + "epoch": 1.36592937524253, + "grad_norm": 0.41564738750457764, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 1760 + }, + { + "epoch": 1.3736903376018628, + "grad_norm": 0.3345205783843994, + "learning_rate": 0.0002, + "loss": 1.6634, + "step": 1770 + }, + { + "epoch": 1.3814512999611952, + "grad_norm": 0.34926071763038635, + "learning_rate": 0.0002, + "loss": 1.7275, + "step": 1780 + }, + { + "epoch": 1.3892122623205276, + "grad_norm": 0.42004233598709106, + "learning_rate": 0.0002, + "loss": 1.685, + "step": 1790 + }, + { + "epoch": 1.3969732246798603, + "grad_norm": 0.3576236963272095, + "learning_rate": 0.0002, + "loss": 1.666, + "step": 1800 + }, + { + "epoch": 1.404734187039193, + "grad_norm": 0.3586704432964325, + "learning_rate": 0.0002, + "loss": 1.8516, + "step": 1810 + }, + { + "epoch": 1.4124951493985254, + "grad_norm": 0.3943439722061157, + "learning_rate": 0.0002, + "loss": 1.6171, + "step": 1820 + }, + { + "epoch": 1.420256111757858, + "grad_norm": 0.3484877049922943, + "learning_rate": 0.0002, + "loss": 1.6865, + "step": 1830 + }, + { + "epoch": 1.4280170741171905, + "grad_norm": 0.3344518840312958, + "learning_rate": 0.0002, + "loss": 1.7205, + "step": 1840 + }, + { + "epoch": 1.4357780364765231, + "grad_norm": 0.4345698356628418, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1850 + }, + { + "epoch": 1.4435389988358556, + "grad_norm": 0.5525162220001221, + "learning_rate": 0.0002, + "loss": 1.6855, + "step": 1860 + }, + { + "epoch": 1.4512999611951882, + "grad_norm": 0.37194496393203735, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1870 + }, + { + "epoch": 1.4590609235545209, + "grad_norm": 0.34570157527923584, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 1880 + }, + { + "epoch": 1.4668218859138533, + "grad_norm": 0.3512282073497772, + "learning_rate": 0.0002, + "loss": 1.7, + "step": 1890 + }, + { + "epoch": 1.4745828482731858, + "grad_norm": 0.3443922996520996, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1900 + }, + { + "epoch": 1.4823438106325184, + "grad_norm": 0.3812018036842346, + "learning_rate": 0.0002, + "loss": 1.7393, + "step": 1910 + }, + { + "epoch": 1.490104772991851, + "grad_norm": 0.39263492822647095, + "learning_rate": 0.0002, + "loss": 1.7277, + "step": 1920 + }, + { + "epoch": 1.4978657353511835, + "grad_norm": 0.3146156072616577, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 1930 + }, + { + "epoch": 1.505626697710516, + "grad_norm": 0.3653988540172577, + "learning_rate": 0.0002, + "loss": 1.6881, + "step": 1940 + }, + { + "epoch": 1.5133876600698488, + "grad_norm": 0.3966596722602844, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 1950 + }, + { + "epoch": 1.5211486224291813, + "grad_norm": 0.3441697359085083, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 1960 + }, + { + "epoch": 1.5289095847885137, + "grad_norm": 0.3328564465045929, + "learning_rate": 0.0002, + "loss": 1.7175, + "step": 1970 + }, + { + "epoch": 1.5366705471478463, + "grad_norm": 0.34068772196769714, + "learning_rate": 0.0002, + "loss": 1.7394, + "step": 1980 + }, + { + "epoch": 1.544431509507179, + "grad_norm": 0.3559795916080475, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 1990 + }, + { + "epoch": 1.5521924718665114, + "grad_norm": 0.37888768315315247, + "learning_rate": 0.0002, + "loss": 1.7102, + "step": 2000 + }, + { + "epoch": 1.5599534342258439, + "grad_norm": 0.36128363013267517, + "learning_rate": 0.0002, + "loss": 1.7094, + "step": 2010 + }, + { + "epoch": 1.5677143965851765, + "grad_norm": 0.3643714487552643, + "learning_rate": 0.0002, + "loss": 1.6407, + "step": 2020 + }, + { + "epoch": 1.5754753589445092, + "grad_norm": 0.3863612115383148, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 2030 + }, + { + "epoch": 1.5832363213038416, + "grad_norm": 0.32831457257270813, + "learning_rate": 0.0002, + "loss": 1.6575, + "step": 2040 + }, + { + "epoch": 1.5909972836631743, + "grad_norm": 0.36098113656044006, + "learning_rate": 0.0002, + "loss": 1.7404, + "step": 2050 + }, + { + "epoch": 1.598758246022507, + "grad_norm": 1.1079334020614624, + "learning_rate": 0.0002, + "loss": 1.7065, + "step": 2060 + }, + { + "epoch": 1.6065192083818394, + "grad_norm": 0.35615381598472595, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 2070 + }, + { + "epoch": 1.6142801707411718, + "grad_norm": 0.369711309671402, + "learning_rate": 0.0002, + "loss": 1.7262, + "step": 2080 + }, + { + "epoch": 1.6220411331005045, + "grad_norm": 0.390658438205719, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 2090 + }, + { + "epoch": 1.6298020954598371, + "grad_norm": 0.3422999382019043, + "learning_rate": 0.0002, + "loss": 1.6996, + "step": 2100 + }, + { + "epoch": 1.6375630578191696, + "grad_norm": 0.372475266456604, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 2110 + }, + { + "epoch": 1.645324020178502, + "grad_norm": 0.35660576820373535, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 2120 + }, + { + "epoch": 1.6530849825378346, + "grad_norm": 0.35754942893981934, + "learning_rate": 0.0002, + "loss": 1.6991, + "step": 2130 + }, + { + "epoch": 1.6608459448971673, + "grad_norm": 0.34572410583496094, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 2140 + }, + { + "epoch": 1.6686069072564997, + "grad_norm": 0.42059701681137085, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 2150 + }, + { + "epoch": 1.6763678696158324, + "grad_norm": 0.35200759768486023, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 2160 + }, + { + "epoch": 1.684128831975165, + "grad_norm": 0.3704029321670532, + "learning_rate": 0.0002, + "loss": 1.6869, + "step": 2170 + }, + { + "epoch": 1.6918897943344975, + "grad_norm": 0.40450501441955566, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 2180 + }, + { + "epoch": 1.69965075669383, + "grad_norm": 0.362966924905777, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 2190 + }, + { + "epoch": 1.7074117190531626, + "grad_norm": 0.36586204171180725, + "learning_rate": 0.0002, + "loss": 1.6935, + "step": 2200 + }, + { + "epoch": 1.7151726814124952, + "grad_norm": 0.3295372426509857, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 2210 + }, + { + "epoch": 1.7229336437718277, + "grad_norm": 0.3892575800418854, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 2220 + }, + { + "epoch": 1.73069460613116, + "grad_norm": 0.34712135791778564, + "learning_rate": 0.0002, + "loss": 1.7805, + "step": 2230 + }, + { + "epoch": 1.738455568490493, + "grad_norm": 0.34801796078681946, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 2240 + }, + { + "epoch": 1.7462165308498254, + "grad_norm": 0.3822397291660309, + "learning_rate": 0.0002, + "loss": 1.7009, + "step": 2250 + }, + { + "epoch": 1.7539774932091579, + "grad_norm": 0.38933250308036804, + "learning_rate": 0.0002, + "loss": 1.6546, + "step": 2260 + }, + { + "epoch": 1.7617384555684905, + "grad_norm": 0.3798373341560364, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 2270 + }, + { + "epoch": 1.7694994179278232, + "grad_norm": 0.35151317715644836, + "learning_rate": 0.0002, + "loss": 1.6508, + "step": 2280 + }, + { + "epoch": 1.7772603802871556, + "grad_norm": 0.44981494545936584, + "learning_rate": 0.0002, + "loss": 1.6894, + "step": 2290 + }, + { + "epoch": 1.785021342646488, + "grad_norm": 0.3992624580860138, + "learning_rate": 0.0002, + "loss": 1.7271, + "step": 2300 + }, + { + "epoch": 1.7927823050058207, + "grad_norm": 0.3772512376308441, + "learning_rate": 0.0002, + "loss": 1.7252, + "step": 2310 + }, + { + "epoch": 1.8005432673651534, + "grad_norm": 0.3511589467525482, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 2320 + }, + { + "epoch": 1.8083042297244858, + "grad_norm": 0.3805285394191742, + "learning_rate": 0.0002, + "loss": 1.764, + "step": 2330 + }, + { + "epoch": 1.8160651920838184, + "grad_norm": 0.3792071044445038, + "learning_rate": 0.0002, + "loss": 1.6986, + "step": 2340 + }, + { + "epoch": 1.823826154443151, + "grad_norm": 0.36430829763412476, + "learning_rate": 0.0002, + "loss": 1.7759, + "step": 2350 + }, + { + "epoch": 1.8315871168024835, + "grad_norm": 0.36502477526664734, + "learning_rate": 0.0002, + "loss": 1.6773, + "step": 2360 + }, + { + "epoch": 1.839348079161816, + "grad_norm": 0.35015153884887695, + "learning_rate": 0.0002, + "loss": 1.8072, + "step": 2370 + }, + { + "epoch": 1.8471090415211486, + "grad_norm": 0.3710903823375702, + "learning_rate": 0.0002, + "loss": 1.7734, + "step": 2380 + }, + { + "epoch": 1.8548700038804813, + "grad_norm": 0.3542828857898712, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 2390 + }, + { + "epoch": 1.8626309662398137, + "grad_norm": 0.35467568039894104, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 2400 + }, + { + "epoch": 1.8703919285991462, + "grad_norm": 0.3638560473918915, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 2410 + }, + { + "epoch": 1.8781528909584788, + "grad_norm": 0.3823298215866089, + "learning_rate": 0.0002, + "loss": 1.7019, + "step": 2420 + }, + { + "epoch": 1.8859138533178115, + "grad_norm": 0.3926416337490082, + "learning_rate": 0.0002, + "loss": 1.6935, + "step": 2430 + }, + { + "epoch": 1.893674815677144, + "grad_norm": 0.3608079254627228, + "learning_rate": 0.0002, + "loss": 1.71, + "step": 2440 + }, + { + "epoch": 1.9014357780364766, + "grad_norm": 0.3426613509654999, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 2450 + }, + { + "epoch": 1.9091967403958092, + "grad_norm": 0.3522338569164276, + "learning_rate": 0.0002, + "loss": 1.6892, + "step": 2460 + }, + { + "epoch": 1.9169577027551417, + "grad_norm": 0.3608049154281616, + "learning_rate": 0.0002, + "loss": 1.7307, + "step": 2470 + }, + { + "epoch": 1.924718665114474, + "grad_norm": 0.3849755525588989, + "learning_rate": 0.0002, + "loss": 1.6823, + "step": 2480 + }, + { + "epoch": 1.9324796274738067, + "grad_norm": 0.4154011011123657, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 2490 + }, + { + "epoch": 1.9402405898331394, + "grad_norm": 0.3602796792984009, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 2500 + }, + { + "epoch": 1.9480015521924718, + "grad_norm": 0.3702992796897888, + "learning_rate": 0.0002, + "loss": 1.7843, + "step": 2510 + }, + { + "epoch": 1.9557625145518043, + "grad_norm": 0.3657735288143158, + "learning_rate": 0.0002, + "loss": 1.6669, + "step": 2520 + }, + { + "epoch": 1.963523476911137, + "grad_norm": 0.41031739115715027, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 2530 + }, + { + "epoch": 1.9712844392704696, + "grad_norm": 0.34578680992126465, + "learning_rate": 0.0002, + "loss": 1.6745, + "step": 2540 + }, + { + "epoch": 1.979045401629802, + "grad_norm": 0.3361521065235138, + "learning_rate": 0.0002, + "loss": 1.723, + "step": 2550 + }, + { + "epoch": 1.9868063639891347, + "grad_norm": 0.34342363476753235, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 2560 + }, + { + "epoch": 1.9945673263484673, + "grad_norm": 0.32954007387161255, + "learning_rate": 0.0002, + "loss": 1.6577, + "step": 2570 + }, + { + "epoch": 2.0, + "eval_loss": 1.8068748712539673, + "eval_runtime": 105.5885, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 0.606, + "step": 2577 + } + ], + "logging_steps": 10, + "max_steps": 10304, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3236437248376832e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..8ca6e2c3ac58fa2af9f99747566f932f41a5a4d5 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f7feb06ff53d5bf79374054a25b662309e705a2ca08dfa3b0bce7b8b4632fae +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e98db163734cc03f7a8f8b3f720d3a2befdf7453 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2786b01ff7eed83657f3076910e481f97a6b9d97 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f51b151c750ccb60ec370ef0a0102921296969c48183896477de09e0fad7ae4 +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..148f058292122a4256bf08a42124876ee21246fa --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82e48ecfc9ee6499497242616ba9c97813a50711664bfef1f982ff4a5b773e01 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f8ca8dba68bda9a461f59de0c7c96e8f29fb78ad --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc96eed9e4fc239cc5ce62a654c9651acec2fda09e9f5ff69a92e5f6c0c9741c +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e88deae00cb5a1806f83d7e94c6d37bada812744 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b2ce4676fa126c497a7bdef3c433de52a10b5a4190bb774d68ef751fa2b4e2c +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..a4a305d1de4d8f47c0252b4d7fe65a10dd8e2c22 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7eee611703c5ce5d1eee32d9cdcfe465647b8aff0c1dfb3bed7ad7dbb05060 +size 34362873 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5baed5ecb767c4b2e4ce4448030514f3427f5919 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/trainer_state.json @@ -0,0 +1,2759 @@ +{ + "best_metric": 1.8068748712539673, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577", + "epoch": 2.9996119518820334, + "eval_steps": 10, + "global_step": 3865, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007760962359332557, + "grad_norm": 1.0751162767410278, + "learning_rate": 0.0002, + "loss": 3.0855, + "step": 10 + }, + { + "epoch": 0.015521924718665115, + "grad_norm": 0.4697345793247223, + "learning_rate": 0.0002, + "loss": 2.4744, + "step": 20 + }, + { + "epoch": 0.023282887077997673, + "grad_norm": 0.5370839238166809, + "learning_rate": 0.0002, + "loss": 2.193, + "step": 30 + }, + { + "epoch": 0.03104384943733023, + "grad_norm": 0.46794816851615906, + "learning_rate": 0.0002, + "loss": 2.0599, + "step": 40 + }, + { + "epoch": 0.038804811796662786, + "grad_norm": 0.44624820351600647, + "learning_rate": 0.0002, + "loss": 1.9354, + "step": 50 + }, + { + "epoch": 0.046565774155995346, + "grad_norm": 0.3953201472759247, + "learning_rate": 0.0002, + "loss": 1.9319, + "step": 60 + }, + { + "epoch": 0.0543267365153279, + "grad_norm": 0.3935912549495697, + "learning_rate": 0.0002, + "loss": 1.9099, + "step": 70 + }, + { + "epoch": 0.06208769887466046, + "grad_norm": 0.4520699381828308, + "learning_rate": 0.0002, + "loss": 1.8795, + "step": 80 + }, + { + "epoch": 0.06984866123399301, + "grad_norm": 0.3801847994327545, + "learning_rate": 0.0002, + "loss": 1.8354, + "step": 90 + }, + { + "epoch": 0.07760962359332557, + "grad_norm": 0.4020165205001831, + "learning_rate": 0.0002, + "loss": 1.9053, + "step": 100 + }, + { + "epoch": 0.08537058595265813, + "grad_norm": 0.3860672116279602, + "learning_rate": 0.0002, + "loss": 1.8779, + "step": 110 + }, + { + "epoch": 0.09313154831199069, + "grad_norm": 0.3681113123893738, + "learning_rate": 0.0002, + "loss": 1.8731, + "step": 120 + }, + { + "epoch": 0.10089251067132324, + "grad_norm": 0.3594866991043091, + "learning_rate": 0.0002, + "loss": 1.8157, + "step": 130 + }, + { + "epoch": 0.1086534730306558, + "grad_norm": 0.3879193663597107, + "learning_rate": 0.0002, + "loss": 1.8266, + "step": 140 + }, + { + "epoch": 0.11641443538998836, + "grad_norm": 0.3270505666732788, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 150 + }, + { + "epoch": 0.12417539774932092, + "grad_norm": 0.36824458837509155, + "learning_rate": 0.0002, + "loss": 1.87, + "step": 160 + }, + { + "epoch": 0.13193636010865348, + "grad_norm": 0.383882075548172, + "learning_rate": 0.0002, + "loss": 1.8305, + "step": 170 + }, + { + "epoch": 0.13969732246798602, + "grad_norm": 0.3368665874004364, + "learning_rate": 0.0002, + "loss": 1.8584, + "step": 180 + }, + { + "epoch": 0.1474582848273186, + "grad_norm": 0.35961097478866577, + "learning_rate": 0.0002, + "loss": 1.7882, + "step": 190 + }, + { + "epoch": 0.15521924718665114, + "grad_norm": 0.3415963351726532, + "learning_rate": 0.0002, + "loss": 1.8467, + "step": 200 + }, + { + "epoch": 0.1629802095459837, + "grad_norm": 0.4100632071495056, + "learning_rate": 0.0002, + "loss": 1.8543, + "step": 210 + }, + { + "epoch": 0.17074117190531626, + "grad_norm": 0.3516307473182678, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 220 + }, + { + "epoch": 0.1785021342646488, + "grad_norm": 0.37919050455093384, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 230 + }, + { + "epoch": 0.18626309662398138, + "grad_norm": 0.33270683884620667, + "learning_rate": 0.0002, + "loss": 1.7937, + "step": 240 + }, + { + "epoch": 0.19402405898331393, + "grad_norm": 0.3348783254623413, + "learning_rate": 0.0002, + "loss": 1.7925, + "step": 250 + }, + { + "epoch": 0.20178502134264648, + "grad_norm": 0.3888475298881531, + "learning_rate": 0.0002, + "loss": 1.7774, + "step": 260 + }, + { + "epoch": 0.20954598370197905, + "grad_norm": 0.3554602861404419, + "learning_rate": 0.0002, + "loss": 1.8381, + "step": 270 + }, + { + "epoch": 0.2173069460613116, + "grad_norm": 0.33277708292007446, + "learning_rate": 0.0002, + "loss": 1.8359, + "step": 280 + }, + { + "epoch": 0.22506790842064417, + "grad_norm": 0.3281584680080414, + "learning_rate": 0.0002, + "loss": 1.7713, + "step": 290 + }, + { + "epoch": 0.23282887077997672, + "grad_norm": 0.3185969591140747, + "learning_rate": 0.0002, + "loss": 1.8181, + "step": 300 + }, + { + "epoch": 0.24058983313930926, + "grad_norm": 0.35335442423820496, + "learning_rate": 0.0002, + "loss": 1.8595, + "step": 310 + }, + { + "epoch": 0.24835079549864184, + "grad_norm": 0.3119595944881439, + "learning_rate": 0.0002, + "loss": 1.87, + "step": 320 + }, + { + "epoch": 0.2561117578579744, + "grad_norm": 0.36424458026885986, + "learning_rate": 0.0002, + "loss": 1.8357, + "step": 330 + }, + { + "epoch": 0.26387272021730696, + "grad_norm": 0.3618951141834259, + "learning_rate": 0.0002, + "loss": 1.8003, + "step": 340 + }, + { + "epoch": 0.2716336825766395, + "grad_norm": 0.312757670879364, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 350 + }, + { + "epoch": 0.27939464493597205, + "grad_norm": 0.326016366481781, + "learning_rate": 0.0002, + "loss": 1.9031, + "step": 360 + }, + { + "epoch": 0.2871556072953046, + "grad_norm": 0.34093883633613586, + "learning_rate": 0.0002, + "loss": 1.8214, + "step": 370 + }, + { + "epoch": 0.2949165696546372, + "grad_norm": 0.32325029373168945, + "learning_rate": 0.0002, + "loss": 1.7733, + "step": 380 + }, + { + "epoch": 0.30267753201396974, + "grad_norm": 0.34105437994003296, + "learning_rate": 0.0002, + "loss": 1.842, + "step": 390 + }, + { + "epoch": 0.3104384943733023, + "grad_norm": 0.32565295696258545, + "learning_rate": 0.0002, + "loss": 1.7926, + "step": 400 + }, + { + "epoch": 0.31819945673263483, + "grad_norm": 0.32742050290107727, + "learning_rate": 0.0002, + "loss": 1.8031, + "step": 410 + }, + { + "epoch": 0.3259604190919674, + "grad_norm": 0.30233046412467957, + "learning_rate": 0.0002, + "loss": 1.907, + "step": 420 + }, + { + "epoch": 0.3337213814513, + "grad_norm": 0.32419222593307495, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 430 + }, + { + "epoch": 0.3414823438106325, + "grad_norm": 0.3653007745742798, + "learning_rate": 0.0002, + "loss": 1.865, + "step": 440 + }, + { + "epoch": 0.3492433061699651, + "grad_norm": 0.31617099046707153, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 450 + }, + { + "epoch": 0.3570042685292976, + "grad_norm": 0.3305962085723877, + "learning_rate": 0.0002, + "loss": 1.7677, + "step": 460 + }, + { + "epoch": 0.36476523088863017, + "grad_norm": 0.3178933262825012, + "learning_rate": 0.0002, + "loss": 1.8155, + "step": 470 + }, + { + "epoch": 0.37252619324796277, + "grad_norm": 0.37163782119750977, + "learning_rate": 0.0002, + "loss": 1.7485, + "step": 480 + }, + { + "epoch": 0.3802871556072953, + "grad_norm": 0.469844788312912, + "learning_rate": 0.0002, + "loss": 1.8804, + "step": 490 + }, + { + "epoch": 0.38804811796662786, + "grad_norm": 0.3409338593482971, + "learning_rate": 0.0002, + "loss": 1.8343, + "step": 500 + }, + { + "epoch": 0.3958090803259604, + "grad_norm": 0.31943467259407043, + "learning_rate": 0.0002, + "loss": 1.8433, + "step": 510 + }, + { + "epoch": 0.40357004268529295, + "grad_norm": 0.32293614745140076, + "learning_rate": 0.0002, + "loss": 1.7873, + "step": 520 + }, + { + "epoch": 0.41133100504462555, + "grad_norm": 0.2994382977485657, + "learning_rate": 0.0002, + "loss": 1.8584, + "step": 530 + }, + { + "epoch": 0.4190919674039581, + "grad_norm": 0.3273141384124756, + "learning_rate": 0.0002, + "loss": 1.8153, + "step": 540 + }, + { + "epoch": 0.42685292976329064, + "grad_norm": 0.3020550012588501, + "learning_rate": 0.0002, + "loss": 1.8097, + "step": 550 + }, + { + "epoch": 0.4346138921226232, + "grad_norm": 0.30113112926483154, + "learning_rate": 0.0002, + "loss": 1.8551, + "step": 560 + }, + { + "epoch": 0.44237485448195574, + "grad_norm": 0.30274903774261475, + "learning_rate": 0.0002, + "loss": 1.8084, + "step": 570 + }, + { + "epoch": 0.45013581684128834, + "grad_norm": 0.3231128454208374, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 580 + }, + { + "epoch": 0.4578967792006209, + "grad_norm": 0.3255121409893036, + "learning_rate": 0.0002, + "loss": 1.7848, + "step": 590 + }, + { + "epoch": 0.46565774155995343, + "grad_norm": 0.30147507786750793, + "learning_rate": 0.0002, + "loss": 1.8227, + "step": 600 + }, + { + "epoch": 0.473418703919286, + "grad_norm": 0.29781386256217957, + "learning_rate": 0.0002, + "loss": 1.7572, + "step": 610 + }, + { + "epoch": 0.4811796662786185, + "grad_norm": 0.30914685130119324, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 620 + }, + { + "epoch": 0.4889406286379511, + "grad_norm": 0.3110593855381012, + "learning_rate": 0.0002, + "loss": 1.805, + "step": 630 + }, + { + "epoch": 0.49670159099728367, + "grad_norm": 0.3298132121562958, + "learning_rate": 0.0002, + "loss": 1.8228, + "step": 640 + }, + { + "epoch": 0.5044625533566163, + "grad_norm": 0.322122186422348, + "learning_rate": 0.0002, + "loss": 1.7816, + "step": 650 + }, + { + "epoch": 0.5122235157159488, + "grad_norm": 0.3504371643066406, + "learning_rate": 0.0002, + "loss": 1.8001, + "step": 660 + }, + { + "epoch": 0.5199844780752814, + "grad_norm": 0.3102182149887085, + "learning_rate": 0.0002, + "loss": 1.8682, + "step": 670 + }, + { + "epoch": 0.5277454404346139, + "grad_norm": 0.6113658547401428, + "learning_rate": 0.0002, + "loss": 1.7494, + "step": 680 + }, + { + "epoch": 0.5355064027939465, + "grad_norm": 0.31841862201690674, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 690 + }, + { + "epoch": 0.543267365153279, + "grad_norm": 0.2830526530742645, + "learning_rate": 0.0002, + "loss": 1.7587, + "step": 700 + }, + { + "epoch": 0.5510283275126115, + "grad_norm": 0.3048769533634186, + "learning_rate": 0.0002, + "loss": 1.7887, + "step": 710 + }, + { + "epoch": 0.5587892898719441, + "grad_norm": 0.2719033658504486, + "learning_rate": 0.0002, + "loss": 1.8416, + "step": 720 + }, + { + "epoch": 0.5665502522312766, + "grad_norm": 0.3176722526550293, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 730 + }, + { + "epoch": 0.5743112145906092, + "grad_norm": 0.32491734623908997, + "learning_rate": 0.0002, + "loss": 1.7127, + "step": 740 + }, + { + "epoch": 0.5820721769499418, + "grad_norm": 0.32746851444244385, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 750 + }, + { + "epoch": 0.5898331393092744, + "grad_norm": 0.3055773973464966, + "learning_rate": 0.0002, + "loss": 1.7811, + "step": 760 + }, + { + "epoch": 0.5975941016686069, + "grad_norm": 0.30671584606170654, + "learning_rate": 0.0002, + "loss": 1.8597, + "step": 770 + }, + { + "epoch": 0.6053550640279395, + "grad_norm": 0.28770264983177185, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 780 + }, + { + "epoch": 0.613116026387272, + "grad_norm": 0.2814285457134247, + "learning_rate": 0.0002, + "loss": 1.7025, + "step": 790 + }, + { + "epoch": 0.6208769887466046, + "grad_norm": 0.31554412841796875, + "learning_rate": 0.0002, + "loss": 1.819, + "step": 800 + }, + { + "epoch": 0.6286379511059371, + "grad_norm": 0.2984226942062378, + "learning_rate": 0.0002, + "loss": 1.8335, + "step": 810 + }, + { + "epoch": 0.6363989134652697, + "grad_norm": 0.2859906554222107, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 820 + }, + { + "epoch": 0.6441598758246022, + "grad_norm": 0.2887928783893585, + "learning_rate": 0.0002, + "loss": 1.7408, + "step": 830 + }, + { + "epoch": 0.6519208381839348, + "grad_norm": 0.31287339329719543, + "learning_rate": 0.0002, + "loss": 1.8071, + "step": 840 + }, + { + "epoch": 0.6596818005432674, + "grad_norm": 0.32064181566238403, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 850 + }, + { + "epoch": 0.6674427629026, + "grad_norm": 0.290981650352478, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 860 + }, + { + "epoch": 0.6752037252619325, + "grad_norm": 0.33060121536254883, + "learning_rate": 0.0002, + "loss": 1.8089, + "step": 870 + }, + { + "epoch": 0.682964687621265, + "grad_norm": 0.27032899856567383, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 880 + }, + { + "epoch": 0.6907256499805976, + "grad_norm": 0.29031234979629517, + "learning_rate": 0.0002, + "loss": 1.8423, + "step": 890 + }, + { + "epoch": 0.6984866123399301, + "grad_norm": 0.2845142185688019, + "learning_rate": 0.0002, + "loss": 1.835, + "step": 900 + }, + { + "epoch": 0.7062475746992627, + "grad_norm": 0.8638312816619873, + "learning_rate": 0.0002, + "loss": 1.77, + "step": 910 + }, + { + "epoch": 0.7140085370585952, + "grad_norm": 0.3086668848991394, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 920 + }, + { + "epoch": 0.7217694994179278, + "grad_norm": 0.2724177837371826, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 930 + }, + { + "epoch": 0.7295304617772603, + "grad_norm": 0.289559006690979, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 940 + }, + { + "epoch": 0.737291424136593, + "grad_norm": 0.3000658452510834, + "learning_rate": 0.0002, + "loss": 1.7654, + "step": 950 + }, + { + "epoch": 0.7450523864959255, + "grad_norm": 0.33544042706489563, + "learning_rate": 0.0002, + "loss": 1.7736, + "step": 960 + }, + { + "epoch": 0.7528133488552581, + "grad_norm": 0.28593236207962036, + "learning_rate": 0.0002, + "loss": 1.6979, + "step": 970 + }, + { + "epoch": 0.7605743112145906, + "grad_norm": 0.313634991645813, + "learning_rate": 0.0002, + "loss": 1.8583, + "step": 980 + }, + { + "epoch": 0.7683352735739232, + "grad_norm": 0.2949385941028595, + "learning_rate": 0.0002, + "loss": 1.7473, + "step": 990 + }, + { + "epoch": 0.7760962359332557, + "grad_norm": 0.2920108437538147, + "learning_rate": 0.0002, + "loss": 1.8689, + "step": 1000 + }, + { + "epoch": 0.7838571982925883, + "grad_norm": 0.3245100677013397, + "learning_rate": 0.0002, + "loss": 1.8401, + "step": 1010 + }, + { + "epoch": 0.7916181606519208, + "grad_norm": 0.3007619380950928, + "learning_rate": 0.0002, + "loss": 1.7109, + "step": 1020 + }, + { + "epoch": 0.7993791230112534, + "grad_norm": 0.3630852997303009, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 1030 + }, + { + "epoch": 0.8071400853705859, + "grad_norm": 0.2856379747390747, + "learning_rate": 0.0002, + "loss": 1.7655, + "step": 1040 + }, + { + "epoch": 0.8149010477299186, + "grad_norm": 0.32476478815078735, + "learning_rate": 0.0002, + "loss": 1.8371, + "step": 1050 + }, + { + "epoch": 0.8226620100892511, + "grad_norm": 0.5162565112113953, + "learning_rate": 0.0002, + "loss": 1.8039, + "step": 1060 + }, + { + "epoch": 0.8304229724485837, + "grad_norm": 0.316496342420578, + "learning_rate": 0.0002, + "loss": 1.8862, + "step": 1070 + }, + { + "epoch": 0.8381839348079162, + "grad_norm": 0.31977516412734985, + "learning_rate": 0.0002, + "loss": 1.8023, + "step": 1080 + }, + { + "epoch": 0.8459448971672487, + "grad_norm": 0.269509494304657, + "learning_rate": 0.0002, + "loss": 1.8547, + "step": 1090 + }, + { + "epoch": 0.8537058595265813, + "grad_norm": 0.31621453166007996, + "learning_rate": 0.0002, + "loss": 1.7811, + "step": 1100 + }, + { + "epoch": 0.8614668218859138, + "grad_norm": 0.2946535050868988, + "learning_rate": 0.0002, + "loss": 1.739, + "step": 1110 + }, + { + "epoch": 0.8692277842452464, + "grad_norm": 0.3088909983634949, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 1120 + }, + { + "epoch": 0.8769887466045789, + "grad_norm": 0.33033716678619385, + "learning_rate": 0.0002, + "loss": 1.8228, + "step": 1130 + }, + { + "epoch": 0.8847497089639115, + "grad_norm": 0.2954833507537842, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 1140 + }, + { + "epoch": 0.8925106713232441, + "grad_norm": 0.2950248122215271, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 1150 + }, + { + "epoch": 0.9002716336825767, + "grad_norm": 0.296661913394928, + "learning_rate": 0.0002, + "loss": 1.7068, + "step": 1160 + }, + { + "epoch": 0.9080325960419092, + "grad_norm": 0.35451310873031616, + "learning_rate": 0.0002, + "loss": 1.7967, + "step": 1170 + }, + { + "epoch": 0.9157935584012418, + "grad_norm": 0.32705947756767273, + "learning_rate": 0.0002, + "loss": 1.8202, + "step": 1180 + }, + { + "epoch": 0.9235545207605743, + "grad_norm": 0.3333960771560669, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 1190 + }, + { + "epoch": 0.9313154831199069, + "grad_norm": 0.3042232096195221, + "learning_rate": 0.0002, + "loss": 1.7801, + "step": 1200 + }, + { + "epoch": 0.9390764454792394, + "grad_norm": 0.281553715467453, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 1210 + }, + { + "epoch": 0.946837407838572, + "grad_norm": 0.3096391558647156, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 1220 + }, + { + "epoch": 0.9545983701979045, + "grad_norm": 0.2866271734237671, + "learning_rate": 0.0002, + "loss": 1.7401, + "step": 1230 + }, + { + "epoch": 0.962359332557237, + "grad_norm": 0.28394097089767456, + "learning_rate": 0.0002, + "loss": 1.7211, + "step": 1240 + }, + { + "epoch": 0.9701202949165697, + "grad_norm": 0.3249266743659973, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 1250 + }, + { + "epoch": 0.9778812572759022, + "grad_norm": 0.2896869480609894, + "learning_rate": 0.0002, + "loss": 1.7563, + "step": 1260 + }, + { + "epoch": 0.9856422196352348, + "grad_norm": 0.29224586486816406, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 1270 + }, + { + "epoch": 0.9934031819945673, + "grad_norm": 0.2820223569869995, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 1280 + }, + { + "epoch": 0.9996119518820333, + "eval_loss": 1.8081045150756836, + "eval_runtime": 102.3056, + "eval_samples_per_second": 4.956, + "eval_steps_per_second": 0.626, + "step": 1288 + }, + { + "epoch": 1.0011641443538999, + "grad_norm": 0.3282551169395447, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 1290 + }, + { + "epoch": 1.0089251067132325, + "grad_norm": 0.30217495560646057, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 1300 + }, + { + "epoch": 1.016686069072565, + "grad_norm": 0.30801767110824585, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 1310 + }, + { + "epoch": 1.0244470314318976, + "grad_norm": 0.31816792488098145, + "learning_rate": 0.0002, + "loss": 1.7756, + "step": 1320 + }, + { + "epoch": 1.03220799379123, + "grad_norm": 0.27794334292411804, + "learning_rate": 0.0002, + "loss": 1.6986, + "step": 1330 + }, + { + "epoch": 1.0399689561505627, + "grad_norm": 0.3018926680088043, + "learning_rate": 0.0002, + "loss": 1.6931, + "step": 1340 + }, + { + "epoch": 1.0477299185098952, + "grad_norm": 0.3552975356578827, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1350 + }, + { + "epoch": 1.0554908808692278, + "grad_norm": 0.32590144872665405, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 1360 + }, + { + "epoch": 1.0632518432285603, + "grad_norm": 0.3435460925102234, + "learning_rate": 0.0002, + "loss": 1.6479, + "step": 1370 + }, + { + "epoch": 1.071012805587893, + "grad_norm": 0.35037797689437866, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 1380 + }, + { + "epoch": 1.0787737679472253, + "grad_norm": 0.31398263573646545, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 1390 + }, + { + "epoch": 1.086534730306558, + "grad_norm": 0.3134010434150696, + "learning_rate": 0.0002, + "loss": 1.6729, + "step": 1400 + }, + { + "epoch": 1.0942956926658907, + "grad_norm": 0.4599704444408417, + "learning_rate": 0.0002, + "loss": 1.751, + "step": 1410 + }, + { + "epoch": 1.102056655025223, + "grad_norm": 0.35852891206741333, + "learning_rate": 0.0002, + "loss": 1.6871, + "step": 1420 + }, + { + "epoch": 1.1098176173845558, + "grad_norm": 0.35628634691238403, + "learning_rate": 0.0002, + "loss": 1.7083, + "step": 1430 + }, + { + "epoch": 1.1175785797438882, + "grad_norm": 0.3769161105155945, + "learning_rate": 0.0002, + "loss": 1.6166, + "step": 1440 + }, + { + "epoch": 1.1253395421032208, + "grad_norm": 1.3712416887283325, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1450 + }, + { + "epoch": 1.1331005044625533, + "grad_norm": 0.38406670093536377, + "learning_rate": 0.0002, + "loss": 1.6542, + "step": 1460 + }, + { + "epoch": 1.140861466821886, + "grad_norm": 0.3402116000652313, + "learning_rate": 0.0002, + "loss": 1.7104, + "step": 1470 + }, + { + "epoch": 1.1486224291812184, + "grad_norm": 0.341189444065094, + "learning_rate": 0.0002, + "loss": 1.7074, + "step": 1480 + }, + { + "epoch": 1.156383391540551, + "grad_norm": 0.36629995703697205, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 1490 + }, + { + "epoch": 1.1641443538998835, + "grad_norm": 0.3499569296836853, + "learning_rate": 0.0002, + "loss": 1.6952, + "step": 1500 + }, + { + "epoch": 1.1719053162592161, + "grad_norm": 0.3663063943386078, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 1510 + }, + { + "epoch": 1.1796662786185488, + "grad_norm": 0.34851500391960144, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 1520 + }, + { + "epoch": 1.1874272409778812, + "grad_norm": 0.35071656107902527, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 1530 + }, + { + "epoch": 1.1951882033372139, + "grad_norm": 0.42783796787261963, + "learning_rate": 0.0002, + "loss": 1.7206, + "step": 1540 + }, + { + "epoch": 1.2029491656965463, + "grad_norm": 0.31830692291259766, + "learning_rate": 0.0002, + "loss": 1.7499, + "step": 1550 + }, + { + "epoch": 1.210710128055879, + "grad_norm": 0.3597424626350403, + "learning_rate": 0.0002, + "loss": 1.7372, + "step": 1560 + }, + { + "epoch": 1.2184710904152114, + "grad_norm": 0.35233765840530396, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 1570 + }, + { + "epoch": 1.226232052774544, + "grad_norm": 0.35942912101745605, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 1580 + }, + { + "epoch": 1.2339930151338767, + "grad_norm": 0.36159393191337585, + "learning_rate": 0.0002, + "loss": 1.6598, + "step": 1590 + }, + { + "epoch": 1.2417539774932091, + "grad_norm": 0.3328469693660736, + "learning_rate": 0.0002, + "loss": 1.6697, + "step": 1600 + }, + { + "epoch": 1.2495149398525418, + "grad_norm": 0.3089476525783539, + "learning_rate": 0.0002, + "loss": 1.7594, + "step": 1610 + }, + { + "epoch": 1.2572759022118742, + "grad_norm": 0.30947765707969666, + "learning_rate": 0.0002, + "loss": 1.6805, + "step": 1620 + }, + { + "epoch": 1.265036864571207, + "grad_norm": 0.32154011726379395, + "learning_rate": 0.0002, + "loss": 1.6899, + "step": 1630 + }, + { + "epoch": 1.2727978269305393, + "grad_norm": 0.3480297923088074, + "learning_rate": 0.0002, + "loss": 1.6621, + "step": 1640 + }, + { + "epoch": 1.280558789289872, + "grad_norm": 0.39471694827079773, + "learning_rate": 0.0002, + "loss": 1.7087, + "step": 1650 + }, + { + "epoch": 1.2883197516492044, + "grad_norm": 0.35728853940963745, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 1660 + }, + { + "epoch": 1.296080714008537, + "grad_norm": 0.35223081707954407, + "learning_rate": 0.0002, + "loss": 1.7008, + "step": 1670 + }, + { + "epoch": 1.3038416763678695, + "grad_norm": 0.3588867485523224, + "learning_rate": 0.0002, + "loss": 1.7253, + "step": 1680 + }, + { + "epoch": 1.3116026387272022, + "grad_norm": 0.3528042733669281, + "learning_rate": 0.0002, + "loss": 1.6505, + "step": 1690 + }, + { + "epoch": 1.3193636010865348, + "grad_norm": 0.35975801944732666, + "learning_rate": 0.0002, + "loss": 1.6945, + "step": 1700 + }, + { + "epoch": 1.3271245634458673, + "grad_norm": 0.36691880226135254, + "learning_rate": 0.0002, + "loss": 1.6631, + "step": 1710 + }, + { + "epoch": 1.3348855258052, + "grad_norm": 0.3787977695465088, + "learning_rate": 0.0002, + "loss": 1.7593, + "step": 1720 + }, + { + "epoch": 1.3426464881645324, + "grad_norm": 0.36614933609962463, + "learning_rate": 0.0002, + "loss": 1.7697, + "step": 1730 + }, + { + "epoch": 1.350407450523865, + "grad_norm": 0.3484745919704437, + "learning_rate": 0.0002, + "loss": 1.6487, + "step": 1740 + }, + { + "epoch": 1.3581684128831975, + "grad_norm": 0.36905673146247864, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 1750 + }, + { + "epoch": 1.36592937524253, + "grad_norm": 0.41564738750457764, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 1760 + }, + { + "epoch": 1.3736903376018628, + "grad_norm": 0.3345205783843994, + "learning_rate": 0.0002, + "loss": 1.6634, + "step": 1770 + }, + { + "epoch": 1.3814512999611952, + "grad_norm": 0.34926071763038635, + "learning_rate": 0.0002, + "loss": 1.7275, + "step": 1780 + }, + { + "epoch": 1.3892122623205276, + "grad_norm": 0.42004233598709106, + "learning_rate": 0.0002, + "loss": 1.685, + "step": 1790 + }, + { + "epoch": 1.3969732246798603, + "grad_norm": 0.3576236963272095, + "learning_rate": 0.0002, + "loss": 1.666, + "step": 1800 + }, + { + "epoch": 1.404734187039193, + "grad_norm": 0.3586704432964325, + "learning_rate": 0.0002, + "loss": 1.8516, + "step": 1810 + }, + { + "epoch": 1.4124951493985254, + "grad_norm": 0.3943439722061157, + "learning_rate": 0.0002, + "loss": 1.6171, + "step": 1820 + }, + { + "epoch": 1.420256111757858, + "grad_norm": 0.3484877049922943, + "learning_rate": 0.0002, + "loss": 1.6865, + "step": 1830 + }, + { + "epoch": 1.4280170741171905, + "grad_norm": 0.3344518840312958, + "learning_rate": 0.0002, + "loss": 1.7205, + "step": 1840 + }, + { + "epoch": 1.4357780364765231, + "grad_norm": 0.4345698356628418, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1850 + }, + { + "epoch": 1.4435389988358556, + "grad_norm": 0.5525162220001221, + "learning_rate": 0.0002, + "loss": 1.6855, + "step": 1860 + }, + { + "epoch": 1.4512999611951882, + "grad_norm": 0.37194496393203735, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1870 + }, + { + "epoch": 1.4590609235545209, + "grad_norm": 0.34570157527923584, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 1880 + }, + { + "epoch": 1.4668218859138533, + "grad_norm": 0.3512282073497772, + "learning_rate": 0.0002, + "loss": 1.7, + "step": 1890 + }, + { + "epoch": 1.4745828482731858, + "grad_norm": 0.3443922996520996, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1900 + }, + { + "epoch": 1.4823438106325184, + "grad_norm": 0.3812018036842346, + "learning_rate": 0.0002, + "loss": 1.7393, + "step": 1910 + }, + { + "epoch": 1.490104772991851, + "grad_norm": 0.39263492822647095, + "learning_rate": 0.0002, + "loss": 1.7277, + "step": 1920 + }, + { + "epoch": 1.4978657353511835, + "grad_norm": 0.3146156072616577, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 1930 + }, + { + "epoch": 1.505626697710516, + "grad_norm": 0.3653988540172577, + "learning_rate": 0.0002, + "loss": 1.6881, + "step": 1940 + }, + { + "epoch": 1.5133876600698488, + "grad_norm": 0.3966596722602844, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 1950 + }, + { + "epoch": 1.5211486224291813, + "grad_norm": 0.3441697359085083, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 1960 + }, + { + "epoch": 1.5289095847885137, + "grad_norm": 0.3328564465045929, + "learning_rate": 0.0002, + "loss": 1.7175, + "step": 1970 + }, + { + "epoch": 1.5366705471478463, + "grad_norm": 0.34068772196769714, + "learning_rate": 0.0002, + "loss": 1.7394, + "step": 1980 + }, + { + "epoch": 1.544431509507179, + "grad_norm": 0.3559795916080475, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 1990 + }, + { + "epoch": 1.5521924718665114, + "grad_norm": 0.37888768315315247, + "learning_rate": 0.0002, + "loss": 1.7102, + "step": 2000 + }, + { + "epoch": 1.5599534342258439, + "grad_norm": 0.36128363013267517, + "learning_rate": 0.0002, + "loss": 1.7094, + "step": 2010 + }, + { + "epoch": 1.5677143965851765, + "grad_norm": 0.3643714487552643, + "learning_rate": 0.0002, + "loss": 1.6407, + "step": 2020 + }, + { + "epoch": 1.5754753589445092, + "grad_norm": 0.3863612115383148, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 2030 + }, + { + "epoch": 1.5832363213038416, + "grad_norm": 0.32831457257270813, + "learning_rate": 0.0002, + "loss": 1.6575, + "step": 2040 + }, + { + "epoch": 1.5909972836631743, + "grad_norm": 0.36098113656044006, + "learning_rate": 0.0002, + "loss": 1.7404, + "step": 2050 + }, + { + "epoch": 1.598758246022507, + "grad_norm": 1.1079334020614624, + "learning_rate": 0.0002, + "loss": 1.7065, + "step": 2060 + }, + { + "epoch": 1.6065192083818394, + "grad_norm": 0.35615381598472595, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 2070 + }, + { + "epoch": 1.6142801707411718, + "grad_norm": 0.369711309671402, + "learning_rate": 0.0002, + "loss": 1.7262, + "step": 2080 + }, + { + "epoch": 1.6220411331005045, + "grad_norm": 0.390658438205719, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 2090 + }, + { + "epoch": 1.6298020954598371, + "grad_norm": 0.3422999382019043, + "learning_rate": 0.0002, + "loss": 1.6996, + "step": 2100 + }, + { + "epoch": 1.6375630578191696, + "grad_norm": 0.372475266456604, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 2110 + }, + { + "epoch": 1.645324020178502, + "grad_norm": 0.35660576820373535, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 2120 + }, + { + "epoch": 1.6530849825378346, + "grad_norm": 0.35754942893981934, + "learning_rate": 0.0002, + "loss": 1.6991, + "step": 2130 + }, + { + "epoch": 1.6608459448971673, + "grad_norm": 0.34572410583496094, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 2140 + }, + { + "epoch": 1.6686069072564997, + "grad_norm": 0.42059701681137085, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 2150 + }, + { + "epoch": 1.6763678696158324, + "grad_norm": 0.35200759768486023, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 2160 + }, + { + "epoch": 1.684128831975165, + "grad_norm": 0.3704029321670532, + "learning_rate": 0.0002, + "loss": 1.6869, + "step": 2170 + }, + { + "epoch": 1.6918897943344975, + "grad_norm": 0.40450501441955566, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 2180 + }, + { + "epoch": 1.69965075669383, + "grad_norm": 0.362966924905777, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 2190 + }, + { + "epoch": 1.7074117190531626, + "grad_norm": 0.36586204171180725, + "learning_rate": 0.0002, + "loss": 1.6935, + "step": 2200 + }, + { + "epoch": 1.7151726814124952, + "grad_norm": 0.3295372426509857, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 2210 + }, + { + "epoch": 1.7229336437718277, + "grad_norm": 0.3892575800418854, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 2220 + }, + { + "epoch": 1.73069460613116, + "grad_norm": 0.34712135791778564, + "learning_rate": 0.0002, + "loss": 1.7805, + "step": 2230 + }, + { + "epoch": 1.738455568490493, + "grad_norm": 0.34801796078681946, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 2240 + }, + { + "epoch": 1.7462165308498254, + "grad_norm": 0.3822397291660309, + "learning_rate": 0.0002, + "loss": 1.7009, + "step": 2250 + }, + { + "epoch": 1.7539774932091579, + "grad_norm": 0.38933250308036804, + "learning_rate": 0.0002, + "loss": 1.6546, + "step": 2260 + }, + { + "epoch": 1.7617384555684905, + "grad_norm": 0.3798373341560364, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 2270 + }, + { + "epoch": 1.7694994179278232, + "grad_norm": 0.35151317715644836, + "learning_rate": 0.0002, + "loss": 1.6508, + "step": 2280 + }, + { + "epoch": 1.7772603802871556, + "grad_norm": 0.44981494545936584, + "learning_rate": 0.0002, + "loss": 1.6894, + "step": 2290 + }, + { + "epoch": 1.785021342646488, + "grad_norm": 0.3992624580860138, + "learning_rate": 0.0002, + "loss": 1.7271, + "step": 2300 + }, + { + "epoch": 1.7927823050058207, + "grad_norm": 0.3772512376308441, + "learning_rate": 0.0002, + "loss": 1.7252, + "step": 2310 + }, + { + "epoch": 1.8005432673651534, + "grad_norm": 0.3511589467525482, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 2320 + }, + { + "epoch": 1.8083042297244858, + "grad_norm": 0.3805285394191742, + "learning_rate": 0.0002, + "loss": 1.764, + "step": 2330 + }, + { + "epoch": 1.8160651920838184, + "grad_norm": 0.3792071044445038, + "learning_rate": 0.0002, + "loss": 1.6986, + "step": 2340 + }, + { + "epoch": 1.823826154443151, + "grad_norm": 0.36430829763412476, + "learning_rate": 0.0002, + "loss": 1.7759, + "step": 2350 + }, + { + "epoch": 1.8315871168024835, + "grad_norm": 0.36502477526664734, + "learning_rate": 0.0002, + "loss": 1.6773, + "step": 2360 + }, + { + "epoch": 1.839348079161816, + "grad_norm": 0.35015153884887695, + "learning_rate": 0.0002, + "loss": 1.8072, + "step": 2370 + }, + { + "epoch": 1.8471090415211486, + "grad_norm": 0.3710903823375702, + "learning_rate": 0.0002, + "loss": 1.7734, + "step": 2380 + }, + { + "epoch": 1.8548700038804813, + "grad_norm": 0.3542828857898712, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 2390 + }, + { + "epoch": 1.8626309662398137, + "grad_norm": 0.35467568039894104, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 2400 + }, + { + "epoch": 1.8703919285991462, + "grad_norm": 0.3638560473918915, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 2410 + }, + { + "epoch": 1.8781528909584788, + "grad_norm": 0.3823298215866089, + "learning_rate": 0.0002, + "loss": 1.7019, + "step": 2420 + }, + { + "epoch": 1.8859138533178115, + "grad_norm": 0.3926416337490082, + "learning_rate": 0.0002, + "loss": 1.6935, + "step": 2430 + }, + { + "epoch": 1.893674815677144, + "grad_norm": 0.3608079254627228, + "learning_rate": 0.0002, + "loss": 1.71, + "step": 2440 + }, + { + "epoch": 1.9014357780364766, + "grad_norm": 0.3426613509654999, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 2450 + }, + { + "epoch": 1.9091967403958092, + "grad_norm": 0.3522338569164276, + "learning_rate": 0.0002, + "loss": 1.6892, + "step": 2460 + }, + { + "epoch": 1.9169577027551417, + "grad_norm": 0.3608049154281616, + "learning_rate": 0.0002, + "loss": 1.7307, + "step": 2470 + }, + { + "epoch": 1.924718665114474, + "grad_norm": 0.3849755525588989, + "learning_rate": 0.0002, + "loss": 1.6823, + "step": 2480 + }, + { + "epoch": 1.9324796274738067, + "grad_norm": 0.4154011011123657, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 2490 + }, + { + "epoch": 1.9402405898331394, + "grad_norm": 0.3602796792984009, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 2500 + }, + { + "epoch": 1.9480015521924718, + "grad_norm": 0.3702992796897888, + "learning_rate": 0.0002, + "loss": 1.7843, + "step": 2510 + }, + { + "epoch": 1.9557625145518043, + "grad_norm": 0.3657735288143158, + "learning_rate": 0.0002, + "loss": 1.6669, + "step": 2520 + }, + { + "epoch": 1.963523476911137, + "grad_norm": 0.41031739115715027, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 2530 + }, + { + "epoch": 1.9712844392704696, + "grad_norm": 0.34578680992126465, + "learning_rate": 0.0002, + "loss": 1.6745, + "step": 2540 + }, + { + "epoch": 1.979045401629802, + "grad_norm": 0.3361521065235138, + "learning_rate": 0.0002, + "loss": 1.723, + "step": 2550 + }, + { + "epoch": 1.9868063639891347, + "grad_norm": 0.34342363476753235, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 2560 + }, + { + "epoch": 1.9945673263484673, + "grad_norm": 0.32954007387161255, + "learning_rate": 0.0002, + "loss": 1.6577, + "step": 2570 + }, + { + "epoch": 2.0, + "eval_loss": 1.8068748712539673, + "eval_runtime": 105.5885, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 0.606, + "step": 2577 + }, + { + "epoch": 2.0023282887077998, + "grad_norm": 0.336302250623703, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 2580 + }, + { + "epoch": 2.010089251067132, + "grad_norm": 0.3627048432826996, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 2590 + }, + { + "epoch": 2.017850213426465, + "grad_norm": 0.38406702876091003, + "learning_rate": 0.0002, + "loss": 1.4908, + "step": 2600 + }, + { + "epoch": 2.0256111757857975, + "grad_norm": 0.5326781272888184, + "learning_rate": 0.0002, + "loss": 1.5368, + "step": 2610 + }, + { + "epoch": 2.03337213814513, + "grad_norm": 0.4774554967880249, + "learning_rate": 0.0002, + "loss": 1.5727, + "step": 2620 + }, + { + "epoch": 2.0411331005044624, + "grad_norm": 0.4251810312271118, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 2630 + }, + { + "epoch": 2.0488940628637953, + "grad_norm": 0.4693007171154022, + "learning_rate": 0.0002, + "loss": 1.5152, + "step": 2640 + }, + { + "epoch": 2.0566550252231277, + "grad_norm": 0.46371519565582275, + "learning_rate": 0.0002, + "loss": 1.6137, + "step": 2650 + }, + { + "epoch": 2.06441598758246, + "grad_norm": 0.46652570366859436, + "learning_rate": 0.0002, + "loss": 1.6304, + "step": 2660 + }, + { + "epoch": 2.0721769499417926, + "grad_norm": 0.45200315117836, + "learning_rate": 0.0002, + "loss": 1.6022, + "step": 2670 + }, + { + "epoch": 2.0799379123011255, + "grad_norm": 0.42905205488204956, + "learning_rate": 0.0002, + "loss": 1.5358, + "step": 2680 + }, + { + "epoch": 2.087698874660458, + "grad_norm": 0.44509148597717285, + "learning_rate": 0.0002, + "loss": 1.5401, + "step": 2690 + }, + { + "epoch": 2.0954598370197903, + "grad_norm": 0.4445319175720215, + "learning_rate": 0.0002, + "loss": 1.5303, + "step": 2700 + }, + { + "epoch": 2.103220799379123, + "grad_norm": 0.46825504302978516, + "learning_rate": 0.0002, + "loss": 1.5701, + "step": 2710 + }, + { + "epoch": 2.1109817617384556, + "grad_norm": 0.4623856842517853, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 2720 + }, + { + "epoch": 2.118742724097788, + "grad_norm": 0.4833452105522156, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 2730 + }, + { + "epoch": 2.1265036864571205, + "grad_norm": 0.4582686722278595, + "learning_rate": 0.0002, + "loss": 1.5997, + "step": 2740 + }, + { + "epoch": 2.1342646488164534, + "grad_norm": 0.47587934136390686, + "learning_rate": 0.0002, + "loss": 1.5801, + "step": 2750 + }, + { + "epoch": 2.142025611175786, + "grad_norm": 0.4602217972278595, + "learning_rate": 0.0002, + "loss": 1.594, + "step": 2760 + }, + { + "epoch": 2.1497865735351183, + "grad_norm": 0.47501352429389954, + "learning_rate": 0.0002, + "loss": 1.5271, + "step": 2770 + }, + { + "epoch": 2.1575475358944507, + "grad_norm": 0.5078499913215637, + "learning_rate": 0.0002, + "loss": 1.4862, + "step": 2780 + }, + { + "epoch": 2.1653084982537836, + "grad_norm": 0.497704416513443, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 2790 + }, + { + "epoch": 2.173069460613116, + "grad_norm": 0.5435971617698669, + "learning_rate": 0.0002, + "loss": 1.5597, + "step": 2800 + }, + { + "epoch": 2.1808304229724484, + "grad_norm": 0.5172356367111206, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 2810 + }, + { + "epoch": 2.1885913853317813, + "grad_norm": 0.44063422083854675, + "learning_rate": 0.0002, + "loss": 1.5202, + "step": 2820 + }, + { + "epoch": 2.1963523476911138, + "grad_norm": 0.5079569220542908, + "learning_rate": 0.0002, + "loss": 1.6041, + "step": 2830 + }, + { + "epoch": 2.204113310050446, + "grad_norm": 0.45658132433891296, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 2840 + }, + { + "epoch": 2.2118742724097786, + "grad_norm": 0.5103023648262024, + "learning_rate": 0.0002, + "loss": 1.5546, + "step": 2850 + }, + { + "epoch": 2.2196352347691115, + "grad_norm": 0.4882226288318634, + "learning_rate": 0.0002, + "loss": 1.6197, + "step": 2860 + }, + { + "epoch": 2.227396197128444, + "grad_norm": 0.5087296962738037, + "learning_rate": 0.0002, + "loss": 1.5996, + "step": 2870 + }, + { + "epoch": 2.2351571594877764, + "grad_norm": 0.45293712615966797, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 2880 + }, + { + "epoch": 2.242918121847109, + "grad_norm": 0.5120379328727722, + "learning_rate": 0.0002, + "loss": 1.6214, + "step": 2890 + }, + { + "epoch": 2.2506790842064417, + "grad_norm": 0.47126415371894836, + "learning_rate": 0.0002, + "loss": 1.5273, + "step": 2900 + }, + { + "epoch": 2.258440046565774, + "grad_norm": 0.44005846977233887, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 2910 + }, + { + "epoch": 2.2662010089251066, + "grad_norm": 0.46476176381111145, + "learning_rate": 0.0002, + "loss": 1.6023, + "step": 2920 + }, + { + "epoch": 2.2739619712844394, + "grad_norm": 0.48051515221595764, + "learning_rate": 0.0002, + "loss": 1.6417, + "step": 2930 + }, + { + "epoch": 2.281722933643772, + "grad_norm": 0.480069637298584, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 2940 + }, + { + "epoch": 2.2894838960031043, + "grad_norm": 0.5122102499008179, + "learning_rate": 0.0002, + "loss": 1.5747, + "step": 2950 + }, + { + "epoch": 2.2972448583624367, + "grad_norm": 0.48879891633987427, + "learning_rate": 0.0002, + "loss": 1.5183, + "step": 2960 + }, + { + "epoch": 2.3050058207217696, + "grad_norm": 0.4973136782646179, + "learning_rate": 0.0002, + "loss": 1.5483, + "step": 2970 + }, + { + "epoch": 2.312766783081102, + "grad_norm": 0.5522695183753967, + "learning_rate": 0.0002, + "loss": 1.677, + "step": 2980 + }, + { + "epoch": 2.3205277454404345, + "grad_norm": 0.5220217704772949, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 2990 + }, + { + "epoch": 2.328288707799767, + "grad_norm": 0.4978662431240082, + "learning_rate": 0.0002, + "loss": 1.6299, + "step": 3000 + }, + { + "epoch": 2.3360496701591, + "grad_norm": 0.554053544998169, + "learning_rate": 0.0002, + "loss": 1.5498, + "step": 3010 + }, + { + "epoch": 2.3438106325184322, + "grad_norm": 0.4703886806964874, + "learning_rate": 0.0002, + "loss": 1.5356, + "step": 3020 + }, + { + "epoch": 2.3515715948777647, + "grad_norm": 0.5074123740196228, + "learning_rate": 0.0002, + "loss": 1.5418, + "step": 3030 + }, + { + "epoch": 2.3593325572370976, + "grad_norm": 0.5088278651237488, + "learning_rate": 0.0002, + "loss": 1.6873, + "step": 3040 + }, + { + "epoch": 2.36709351959643, + "grad_norm": 0.4752114415168762, + "learning_rate": 0.0002, + "loss": 1.5249, + "step": 3050 + }, + { + "epoch": 2.3748544819557624, + "grad_norm": 0.5121659636497498, + "learning_rate": 0.0002, + "loss": 1.5353, + "step": 3060 + }, + { + "epoch": 2.3826154443150953, + "grad_norm": 0.48649218678474426, + "learning_rate": 0.0002, + "loss": 1.6426, + "step": 3070 + }, + { + "epoch": 2.3903764066744277, + "grad_norm": 0.5209488868713379, + "learning_rate": 0.0002, + "loss": 1.6136, + "step": 3080 + }, + { + "epoch": 2.39813736903376, + "grad_norm": 0.5110517740249634, + "learning_rate": 0.0002, + "loss": 1.597, + "step": 3090 + }, + { + "epoch": 2.4058983313930926, + "grad_norm": 0.5609337091445923, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 3100 + }, + { + "epoch": 2.4136592937524255, + "grad_norm": 0.5191826224327087, + "learning_rate": 0.0002, + "loss": 1.5438, + "step": 3110 + }, + { + "epoch": 2.421420256111758, + "grad_norm": 0.4876069724559784, + "learning_rate": 0.0002, + "loss": 1.6347, + "step": 3120 + }, + { + "epoch": 2.4291812184710904, + "grad_norm": 0.4713933765888214, + "learning_rate": 0.0002, + "loss": 1.5565, + "step": 3130 + }, + { + "epoch": 2.436942180830423, + "grad_norm": 0.5102227330207825, + "learning_rate": 0.0002, + "loss": 1.6388, + "step": 3140 + }, + { + "epoch": 2.4447031431897557, + "grad_norm": 0.44546666741371155, + "learning_rate": 0.0002, + "loss": 1.5667, + "step": 3150 + }, + { + "epoch": 2.452464105549088, + "grad_norm": 0.5167558193206787, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 3160 + }, + { + "epoch": 2.4602250679084205, + "grad_norm": 0.5226958990097046, + "learning_rate": 0.0002, + "loss": 1.5673, + "step": 3170 + }, + { + "epoch": 2.4679860302677534, + "grad_norm": 0.4751799702644348, + "learning_rate": 0.0002, + "loss": 1.5758, + "step": 3180 + }, + { + "epoch": 2.475746992627086, + "grad_norm": 0.4744729697704315, + "learning_rate": 0.0002, + "loss": 1.6234, + "step": 3190 + }, + { + "epoch": 2.4835079549864183, + "grad_norm": 0.5203230381011963, + "learning_rate": 0.0002, + "loss": 1.5661, + "step": 3200 + }, + { + "epoch": 2.4912689173457507, + "grad_norm": 0.47209781408309937, + "learning_rate": 0.0002, + "loss": 1.493, + "step": 3210 + }, + { + "epoch": 2.4990298797050836, + "grad_norm": 0.5241674780845642, + "learning_rate": 0.0002, + "loss": 1.6415, + "step": 3220 + }, + { + "epoch": 2.506790842064416, + "grad_norm": 0.5152244567871094, + "learning_rate": 0.0002, + "loss": 1.6324, + "step": 3230 + }, + { + "epoch": 2.5145518044237485, + "grad_norm": 0.5216741561889648, + "learning_rate": 0.0002, + "loss": 1.6248, + "step": 3240 + }, + { + "epoch": 2.522312766783081, + "grad_norm": 0.4953259527683258, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 3250 + }, + { + "epoch": 2.530073729142414, + "grad_norm": 0.5973829030990601, + "learning_rate": 0.0002, + "loss": 1.666, + "step": 3260 + }, + { + "epoch": 2.5378346915017462, + "grad_norm": 0.48804202675819397, + "learning_rate": 0.0002, + "loss": 1.5295, + "step": 3270 + }, + { + "epoch": 2.5455956538610787, + "grad_norm": 0.5334644317626953, + "learning_rate": 0.0002, + "loss": 1.4954, + "step": 3280 + }, + { + "epoch": 2.5533566162204115, + "grad_norm": 0.46873313188552856, + "learning_rate": 0.0002, + "loss": 1.5814, + "step": 3290 + }, + { + "epoch": 2.561117578579744, + "grad_norm": 0.4282589554786682, + "learning_rate": 0.0002, + "loss": 1.5362, + "step": 3300 + }, + { + "epoch": 2.5688785409390764, + "grad_norm": 0.4848293960094452, + "learning_rate": 0.0002, + "loss": 1.6278, + "step": 3310 + }, + { + "epoch": 2.576639503298409, + "grad_norm": 0.5093745589256287, + "learning_rate": 0.0002, + "loss": 1.6308, + "step": 3320 + }, + { + "epoch": 2.5844004656577413, + "grad_norm": 0.5084842443466187, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 3330 + }, + { + "epoch": 2.592161428017074, + "grad_norm": 0.4696281850337982, + "learning_rate": 0.0002, + "loss": 1.6168, + "step": 3340 + }, + { + "epoch": 2.5999223903764066, + "grad_norm": 0.5767765641212463, + "learning_rate": 0.0002, + "loss": 1.5359, + "step": 3350 + }, + { + "epoch": 2.607683352735739, + "grad_norm": 0.47300875186920166, + "learning_rate": 0.0002, + "loss": 1.6097, + "step": 3360 + }, + { + "epoch": 2.615444315095072, + "grad_norm": 0.4809158146381378, + "learning_rate": 0.0002, + "loss": 1.6138, + "step": 3370 + }, + { + "epoch": 2.6232052774544043, + "grad_norm": 0.5141063928604126, + "learning_rate": 0.0002, + "loss": 1.4952, + "step": 3380 + }, + { + "epoch": 2.630966239813737, + "grad_norm": 0.4832935035228729, + "learning_rate": 0.0002, + "loss": 1.5784, + "step": 3390 + }, + { + "epoch": 2.6387272021730697, + "grad_norm": 0.5044625401496887, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 3400 + }, + { + "epoch": 2.646488164532402, + "grad_norm": 0.5287680625915527, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 3410 + }, + { + "epoch": 2.6542491268917345, + "grad_norm": 0.5306379795074463, + "learning_rate": 0.0002, + "loss": 1.5423, + "step": 3420 + }, + { + "epoch": 2.662010089251067, + "grad_norm": 0.5849291682243347, + "learning_rate": 0.0002, + "loss": 1.5264, + "step": 3430 + }, + { + "epoch": 2.6697710516104, + "grad_norm": 0.7951080799102783, + "learning_rate": 0.0002, + "loss": 1.5937, + "step": 3440 + }, + { + "epoch": 2.6775320139697323, + "grad_norm": 0.48087653517723083, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 3450 + }, + { + "epoch": 2.6852929763290647, + "grad_norm": 0.5396431684494019, + "learning_rate": 0.0002, + "loss": 1.6769, + "step": 3460 + }, + { + "epoch": 2.693053938688397, + "grad_norm": 0.5481634736061096, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 3470 + }, + { + "epoch": 2.70081490104773, + "grad_norm": 0.5068731307983398, + "learning_rate": 0.0002, + "loss": 1.6436, + "step": 3480 + }, + { + "epoch": 2.7085758634070625, + "grad_norm": 0.5759826898574829, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 3490 + }, + { + "epoch": 2.716336825766395, + "grad_norm": 0.7253932952880859, + "learning_rate": 0.0002, + "loss": 1.596, + "step": 3500 + }, + { + "epoch": 2.724097788125728, + "grad_norm": 0.527745246887207, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 3510 + }, + { + "epoch": 2.73185875048506, + "grad_norm": 0.5279242396354675, + "learning_rate": 0.0002, + "loss": 1.5874, + "step": 3520 + }, + { + "epoch": 2.7396197128443927, + "grad_norm": 0.5047839283943176, + "learning_rate": 0.0002, + "loss": 1.6768, + "step": 3530 + }, + { + "epoch": 2.7473806752037255, + "grad_norm": 0.5430883169174194, + "learning_rate": 0.0002, + "loss": 1.5517, + "step": 3540 + }, + { + "epoch": 2.755141637563058, + "grad_norm": 0.4496723711490631, + "learning_rate": 0.0002, + "loss": 1.5624, + "step": 3550 + }, + { + "epoch": 2.7629025999223904, + "grad_norm": 0.5063338875770569, + "learning_rate": 0.0002, + "loss": 1.5789, + "step": 3560 + }, + { + "epoch": 2.770663562281723, + "grad_norm": 0.4619026780128479, + "learning_rate": 0.0002, + "loss": 1.52, + "step": 3570 + }, + { + "epoch": 2.7784245246410553, + "grad_norm": 0.4753304123878479, + "learning_rate": 0.0002, + "loss": 1.5793, + "step": 3580 + }, + { + "epoch": 2.786185487000388, + "grad_norm": 0.5422708988189697, + "learning_rate": 0.0002, + "loss": 1.5715, + "step": 3590 + }, + { + "epoch": 2.7939464493597206, + "grad_norm": 0.4756578803062439, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 3600 + }, + { + "epoch": 2.801707411719053, + "grad_norm": 0.5057567358016968, + "learning_rate": 0.0002, + "loss": 1.5358, + "step": 3610 + }, + { + "epoch": 2.809468374078386, + "grad_norm": 0.5410919785499573, + "learning_rate": 0.0002, + "loss": 1.6131, + "step": 3620 + }, + { + "epoch": 2.8172293364377183, + "grad_norm": 0.4958136975765228, + "learning_rate": 0.0002, + "loss": 1.5573, + "step": 3630 + }, + { + "epoch": 2.8249902987970508, + "grad_norm": 0.454527348279953, + "learning_rate": 0.0002, + "loss": 1.6324, + "step": 3640 + }, + { + "epoch": 2.8327512611563836, + "grad_norm": 0.5092706084251404, + "learning_rate": 0.0002, + "loss": 1.5582, + "step": 3650 + }, + { + "epoch": 2.840512223515716, + "grad_norm": 0.5314022302627563, + "learning_rate": 0.0002, + "loss": 1.5893, + "step": 3660 + }, + { + "epoch": 2.8482731858750485, + "grad_norm": 0.5028239488601685, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 3670 + }, + { + "epoch": 2.856034148234381, + "grad_norm": 0.5127444863319397, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 3680 + }, + { + "epoch": 2.8637951105937134, + "grad_norm": 0.5045645236968994, + "learning_rate": 0.0002, + "loss": 1.6018, + "step": 3690 + }, + { + "epoch": 2.8715560729530463, + "grad_norm": 0.5560781955718994, + "learning_rate": 0.0002, + "loss": 1.5788, + "step": 3700 + }, + { + "epoch": 2.8793170353123787, + "grad_norm": 0.5177600383758545, + "learning_rate": 0.0002, + "loss": 1.5988, + "step": 3710 + }, + { + "epoch": 2.887077997671711, + "grad_norm": 0.45830899477005005, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 3720 + }, + { + "epoch": 2.894838960031044, + "grad_norm": 0.4828629195690155, + "learning_rate": 0.0002, + "loss": 1.6344, + "step": 3730 + }, + { + "epoch": 2.9025999223903765, + "grad_norm": 0.48241183161735535, + "learning_rate": 0.0002, + "loss": 1.6758, + "step": 3740 + }, + { + "epoch": 2.910360884749709, + "grad_norm": 0.4909592568874359, + "learning_rate": 0.0002, + "loss": 1.5649, + "step": 3750 + }, + { + "epoch": 2.9181218471090418, + "grad_norm": 0.44677025079727173, + "learning_rate": 0.0002, + "loss": 1.4927, + "step": 3760 + }, + { + "epoch": 2.925882809468374, + "grad_norm": 0.4928834140300751, + "learning_rate": 0.0002, + "loss": 1.5067, + "step": 3770 + }, + { + "epoch": 2.9336437718277066, + "grad_norm": 0.5673553347587585, + "learning_rate": 0.0002, + "loss": 1.5843, + "step": 3780 + }, + { + "epoch": 2.941404734187039, + "grad_norm": 0.548190712928772, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 3790 + }, + { + "epoch": 2.9491656965463715, + "grad_norm": 0.48979803919792175, + "learning_rate": 0.0002, + "loss": 1.5892, + "step": 3800 + }, + { + "epoch": 2.9569266589057044, + "grad_norm": 0.533191978931427, + "learning_rate": 0.0002, + "loss": 1.5589, + "step": 3810 + }, + { + "epoch": 2.964687621265037, + "grad_norm": 0.5362946391105652, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 3820 + }, + { + "epoch": 2.9724485836243693, + "grad_norm": 0.4724906384944916, + "learning_rate": 0.0002, + "loss": 1.6602, + "step": 3830 + }, + { + "epoch": 2.980209545983702, + "grad_norm": 0.5468461513519287, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 3840 + }, + { + "epoch": 2.9879705083430346, + "grad_norm": 0.4697108864784241, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 3850 + }, + { + "epoch": 2.995731470702367, + "grad_norm": 0.4780906140804291, + "learning_rate": 0.0002, + "loss": 1.6312, + "step": 3860 + }, + { + "epoch": 2.9996119518820334, + "eval_loss": 1.8472607135772705, + "eval_runtime": 106.5541, + "eval_samples_per_second": 4.758, + "eval_steps_per_second": 0.601, + "step": 3865 + } + ], + "logging_steps": 10, + "max_steps": 10304, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.9854655872565248e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..8ca6e2c3ac58fa2af9f99747566f932f41a5a4d5 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-3865/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f7feb06ff53d5bf79374054a25b662309e705a2ca08dfa3b0bce7b8b4632fae +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e98db163734cc03f7a8f8b3f720d3a2befdf7453 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fcf639104461207a045d5d09886005635fa03687 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcc71afcf27414c77213ed367ce97289db2b15abf576c01b46ddfd036010587e +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ddcf3fe5f911ac00a4ba3b14a6dc5f361a40aeb9 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6bbf02e1091f7198aaf856f46cbc13189efaa221eb7f4ebd80267834cafaa91 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..6c947ffdd2513b180e20aaa51ecd959f7d3624ae --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a9146ec642d2e9f5ee7472f453c10c017aa04a0cdfeede141c07559c9843082 +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ba216a635b5168dc97e9028e9895f5527ec74cf --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56694862b165f53e8e924e92cff89c060ce3f55cea71f7162761fb9876c8b672 +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..a4a305d1de4d8f47c0252b4d7fe65a10dd8e2c22 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7eee611703c5ce5d1eee32d9cdcfe465647b8aff0c1dfb3bed7ad7dbb05060 +size 34362873 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c7c8530012effdcba26024d9583116b824efecb5 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/trainer_state.json @@ -0,0 +1,3670 @@ +{ + "best_metric": 1.8068748712539673, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 5154, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007760962359332557, + "grad_norm": 1.0751162767410278, + "learning_rate": 0.0002, + "loss": 3.0855, + "step": 10 + }, + { + "epoch": 0.015521924718665115, + "grad_norm": 0.4697345793247223, + "learning_rate": 0.0002, + "loss": 2.4744, + "step": 20 + }, + { + "epoch": 0.023282887077997673, + "grad_norm": 0.5370839238166809, + "learning_rate": 0.0002, + "loss": 2.193, + "step": 30 + }, + { + "epoch": 0.03104384943733023, + "grad_norm": 0.46794816851615906, + "learning_rate": 0.0002, + "loss": 2.0599, + "step": 40 + }, + { + "epoch": 0.038804811796662786, + "grad_norm": 0.44624820351600647, + "learning_rate": 0.0002, + "loss": 1.9354, + "step": 50 + }, + { + "epoch": 0.046565774155995346, + "grad_norm": 0.3953201472759247, + "learning_rate": 0.0002, + "loss": 1.9319, + "step": 60 + }, + { + "epoch": 0.0543267365153279, + "grad_norm": 0.3935912549495697, + "learning_rate": 0.0002, + "loss": 1.9099, + "step": 70 + }, + { + "epoch": 0.06208769887466046, + "grad_norm": 0.4520699381828308, + "learning_rate": 0.0002, + "loss": 1.8795, + "step": 80 + }, + { + "epoch": 0.06984866123399301, + "grad_norm": 0.3801847994327545, + "learning_rate": 0.0002, + "loss": 1.8354, + "step": 90 + }, + { + "epoch": 0.07760962359332557, + "grad_norm": 0.4020165205001831, + "learning_rate": 0.0002, + "loss": 1.9053, + "step": 100 + }, + { + "epoch": 0.08537058595265813, + "grad_norm": 0.3860672116279602, + "learning_rate": 0.0002, + "loss": 1.8779, + "step": 110 + }, + { + "epoch": 0.09313154831199069, + "grad_norm": 0.3681113123893738, + "learning_rate": 0.0002, + "loss": 1.8731, + "step": 120 + }, + { + "epoch": 0.10089251067132324, + "grad_norm": 0.3594866991043091, + "learning_rate": 0.0002, + "loss": 1.8157, + "step": 130 + }, + { + "epoch": 0.1086534730306558, + "grad_norm": 0.3879193663597107, + "learning_rate": 0.0002, + "loss": 1.8266, + "step": 140 + }, + { + "epoch": 0.11641443538998836, + "grad_norm": 0.3270505666732788, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 150 + }, + { + "epoch": 0.12417539774932092, + "grad_norm": 0.36824458837509155, + "learning_rate": 0.0002, + "loss": 1.87, + "step": 160 + }, + { + "epoch": 0.13193636010865348, + "grad_norm": 0.383882075548172, + "learning_rate": 0.0002, + "loss": 1.8305, + "step": 170 + }, + { + "epoch": 0.13969732246798602, + "grad_norm": 0.3368665874004364, + "learning_rate": 0.0002, + "loss": 1.8584, + "step": 180 + }, + { + "epoch": 0.1474582848273186, + "grad_norm": 0.35961097478866577, + "learning_rate": 0.0002, + "loss": 1.7882, + "step": 190 + }, + { + "epoch": 0.15521924718665114, + "grad_norm": 0.3415963351726532, + "learning_rate": 0.0002, + "loss": 1.8467, + "step": 200 + }, + { + "epoch": 0.1629802095459837, + "grad_norm": 0.4100632071495056, + "learning_rate": 0.0002, + "loss": 1.8543, + "step": 210 + }, + { + "epoch": 0.17074117190531626, + "grad_norm": 0.3516307473182678, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 220 + }, + { + "epoch": 0.1785021342646488, + "grad_norm": 0.37919050455093384, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 230 + }, + { + "epoch": 0.18626309662398138, + "grad_norm": 0.33270683884620667, + "learning_rate": 0.0002, + "loss": 1.7937, + "step": 240 + }, + { + "epoch": 0.19402405898331393, + "grad_norm": 0.3348783254623413, + "learning_rate": 0.0002, + "loss": 1.7925, + "step": 250 + }, + { + "epoch": 0.20178502134264648, + "grad_norm": 0.3888475298881531, + "learning_rate": 0.0002, + "loss": 1.7774, + "step": 260 + }, + { + "epoch": 0.20954598370197905, + "grad_norm": 0.3554602861404419, + "learning_rate": 0.0002, + "loss": 1.8381, + "step": 270 + }, + { + "epoch": 0.2173069460613116, + "grad_norm": 0.33277708292007446, + "learning_rate": 0.0002, + "loss": 1.8359, + "step": 280 + }, + { + "epoch": 0.22506790842064417, + "grad_norm": 0.3281584680080414, + "learning_rate": 0.0002, + "loss": 1.7713, + "step": 290 + }, + { + "epoch": 0.23282887077997672, + "grad_norm": 0.3185969591140747, + "learning_rate": 0.0002, + "loss": 1.8181, + "step": 300 + }, + { + "epoch": 0.24058983313930926, + "grad_norm": 0.35335442423820496, + "learning_rate": 0.0002, + "loss": 1.8595, + "step": 310 + }, + { + "epoch": 0.24835079549864184, + "grad_norm": 0.3119595944881439, + "learning_rate": 0.0002, + "loss": 1.87, + "step": 320 + }, + { + "epoch": 0.2561117578579744, + "grad_norm": 0.36424458026885986, + "learning_rate": 0.0002, + "loss": 1.8357, + "step": 330 + }, + { + "epoch": 0.26387272021730696, + "grad_norm": 0.3618951141834259, + "learning_rate": 0.0002, + "loss": 1.8003, + "step": 340 + }, + { + "epoch": 0.2716336825766395, + "grad_norm": 0.312757670879364, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 350 + }, + { + "epoch": 0.27939464493597205, + "grad_norm": 0.326016366481781, + "learning_rate": 0.0002, + "loss": 1.9031, + "step": 360 + }, + { + "epoch": 0.2871556072953046, + "grad_norm": 0.34093883633613586, + "learning_rate": 0.0002, + "loss": 1.8214, + "step": 370 + }, + { + "epoch": 0.2949165696546372, + "grad_norm": 0.32325029373168945, + "learning_rate": 0.0002, + "loss": 1.7733, + "step": 380 + }, + { + "epoch": 0.30267753201396974, + "grad_norm": 0.34105437994003296, + "learning_rate": 0.0002, + "loss": 1.842, + "step": 390 + }, + { + "epoch": 0.3104384943733023, + "grad_norm": 0.32565295696258545, + "learning_rate": 0.0002, + "loss": 1.7926, + "step": 400 + }, + { + "epoch": 0.31819945673263483, + "grad_norm": 0.32742050290107727, + "learning_rate": 0.0002, + "loss": 1.8031, + "step": 410 + }, + { + "epoch": 0.3259604190919674, + "grad_norm": 0.30233046412467957, + "learning_rate": 0.0002, + "loss": 1.907, + "step": 420 + }, + { + "epoch": 0.3337213814513, + "grad_norm": 0.32419222593307495, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 430 + }, + { + "epoch": 0.3414823438106325, + "grad_norm": 0.3653007745742798, + "learning_rate": 0.0002, + "loss": 1.865, + "step": 440 + }, + { + "epoch": 0.3492433061699651, + "grad_norm": 0.31617099046707153, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 450 + }, + { + "epoch": 0.3570042685292976, + "grad_norm": 0.3305962085723877, + "learning_rate": 0.0002, + "loss": 1.7677, + "step": 460 + }, + { + "epoch": 0.36476523088863017, + "grad_norm": 0.3178933262825012, + "learning_rate": 0.0002, + "loss": 1.8155, + "step": 470 + }, + { + "epoch": 0.37252619324796277, + "grad_norm": 0.37163782119750977, + "learning_rate": 0.0002, + "loss": 1.7485, + "step": 480 + }, + { + "epoch": 0.3802871556072953, + "grad_norm": 0.469844788312912, + "learning_rate": 0.0002, + "loss": 1.8804, + "step": 490 + }, + { + "epoch": 0.38804811796662786, + "grad_norm": 0.3409338593482971, + "learning_rate": 0.0002, + "loss": 1.8343, + "step": 500 + }, + { + "epoch": 0.3958090803259604, + "grad_norm": 0.31943467259407043, + "learning_rate": 0.0002, + "loss": 1.8433, + "step": 510 + }, + { + "epoch": 0.40357004268529295, + "grad_norm": 0.32293614745140076, + "learning_rate": 0.0002, + "loss": 1.7873, + "step": 520 + }, + { + "epoch": 0.41133100504462555, + "grad_norm": 0.2994382977485657, + "learning_rate": 0.0002, + "loss": 1.8584, + "step": 530 + }, + { + "epoch": 0.4190919674039581, + "grad_norm": 0.3273141384124756, + "learning_rate": 0.0002, + "loss": 1.8153, + "step": 540 + }, + { + "epoch": 0.42685292976329064, + "grad_norm": 0.3020550012588501, + "learning_rate": 0.0002, + "loss": 1.8097, + "step": 550 + }, + { + "epoch": 0.4346138921226232, + "grad_norm": 0.30113112926483154, + "learning_rate": 0.0002, + "loss": 1.8551, + "step": 560 + }, + { + "epoch": 0.44237485448195574, + "grad_norm": 0.30274903774261475, + "learning_rate": 0.0002, + "loss": 1.8084, + "step": 570 + }, + { + "epoch": 0.45013581684128834, + "grad_norm": 0.3231128454208374, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 580 + }, + { + "epoch": 0.4578967792006209, + "grad_norm": 0.3255121409893036, + "learning_rate": 0.0002, + "loss": 1.7848, + "step": 590 + }, + { + "epoch": 0.46565774155995343, + "grad_norm": 0.30147507786750793, + "learning_rate": 0.0002, + "loss": 1.8227, + "step": 600 + }, + { + "epoch": 0.473418703919286, + "grad_norm": 0.29781386256217957, + "learning_rate": 0.0002, + "loss": 1.7572, + "step": 610 + }, + { + "epoch": 0.4811796662786185, + "grad_norm": 0.30914685130119324, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 620 + }, + { + "epoch": 0.4889406286379511, + "grad_norm": 0.3110593855381012, + "learning_rate": 0.0002, + "loss": 1.805, + "step": 630 + }, + { + "epoch": 0.49670159099728367, + "grad_norm": 0.3298132121562958, + "learning_rate": 0.0002, + "loss": 1.8228, + "step": 640 + }, + { + "epoch": 0.5044625533566163, + "grad_norm": 0.322122186422348, + "learning_rate": 0.0002, + "loss": 1.7816, + "step": 650 + }, + { + "epoch": 0.5122235157159488, + "grad_norm": 0.3504371643066406, + "learning_rate": 0.0002, + "loss": 1.8001, + "step": 660 + }, + { + "epoch": 0.5199844780752814, + "grad_norm": 0.3102182149887085, + "learning_rate": 0.0002, + "loss": 1.8682, + "step": 670 + }, + { + "epoch": 0.5277454404346139, + "grad_norm": 0.6113658547401428, + "learning_rate": 0.0002, + "loss": 1.7494, + "step": 680 + }, + { + "epoch": 0.5355064027939465, + "grad_norm": 0.31841862201690674, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 690 + }, + { + "epoch": 0.543267365153279, + "grad_norm": 0.2830526530742645, + "learning_rate": 0.0002, + "loss": 1.7587, + "step": 700 + }, + { + "epoch": 0.5510283275126115, + "grad_norm": 0.3048769533634186, + "learning_rate": 0.0002, + "loss": 1.7887, + "step": 710 + }, + { + "epoch": 0.5587892898719441, + "grad_norm": 0.2719033658504486, + "learning_rate": 0.0002, + "loss": 1.8416, + "step": 720 + }, + { + "epoch": 0.5665502522312766, + "grad_norm": 0.3176722526550293, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 730 + }, + { + "epoch": 0.5743112145906092, + "grad_norm": 0.32491734623908997, + "learning_rate": 0.0002, + "loss": 1.7127, + "step": 740 + }, + { + "epoch": 0.5820721769499418, + "grad_norm": 0.32746851444244385, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 750 + }, + { + "epoch": 0.5898331393092744, + "grad_norm": 0.3055773973464966, + "learning_rate": 0.0002, + "loss": 1.7811, + "step": 760 + }, + { + "epoch": 0.5975941016686069, + "grad_norm": 0.30671584606170654, + "learning_rate": 0.0002, + "loss": 1.8597, + "step": 770 + }, + { + "epoch": 0.6053550640279395, + "grad_norm": 0.28770264983177185, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 780 + }, + { + "epoch": 0.613116026387272, + "grad_norm": 0.2814285457134247, + "learning_rate": 0.0002, + "loss": 1.7025, + "step": 790 + }, + { + "epoch": 0.6208769887466046, + "grad_norm": 0.31554412841796875, + "learning_rate": 0.0002, + "loss": 1.819, + "step": 800 + }, + { + "epoch": 0.6286379511059371, + "grad_norm": 0.2984226942062378, + "learning_rate": 0.0002, + "loss": 1.8335, + "step": 810 + }, + { + "epoch": 0.6363989134652697, + "grad_norm": 0.2859906554222107, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 820 + }, + { + "epoch": 0.6441598758246022, + "grad_norm": 0.2887928783893585, + "learning_rate": 0.0002, + "loss": 1.7408, + "step": 830 + }, + { + "epoch": 0.6519208381839348, + "grad_norm": 0.31287339329719543, + "learning_rate": 0.0002, + "loss": 1.8071, + "step": 840 + }, + { + "epoch": 0.6596818005432674, + "grad_norm": 0.32064181566238403, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 850 + }, + { + "epoch": 0.6674427629026, + "grad_norm": 0.290981650352478, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 860 + }, + { + "epoch": 0.6752037252619325, + "grad_norm": 0.33060121536254883, + "learning_rate": 0.0002, + "loss": 1.8089, + "step": 870 + }, + { + "epoch": 0.682964687621265, + "grad_norm": 0.27032899856567383, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 880 + }, + { + "epoch": 0.6907256499805976, + "grad_norm": 0.29031234979629517, + "learning_rate": 0.0002, + "loss": 1.8423, + "step": 890 + }, + { + "epoch": 0.6984866123399301, + "grad_norm": 0.2845142185688019, + "learning_rate": 0.0002, + "loss": 1.835, + "step": 900 + }, + { + "epoch": 0.7062475746992627, + "grad_norm": 0.8638312816619873, + "learning_rate": 0.0002, + "loss": 1.77, + "step": 910 + }, + { + "epoch": 0.7140085370585952, + "grad_norm": 0.3086668848991394, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 920 + }, + { + "epoch": 0.7217694994179278, + "grad_norm": 0.2724177837371826, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 930 + }, + { + "epoch": 0.7295304617772603, + "grad_norm": 0.289559006690979, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 940 + }, + { + "epoch": 0.737291424136593, + "grad_norm": 0.3000658452510834, + "learning_rate": 0.0002, + "loss": 1.7654, + "step": 950 + }, + { + "epoch": 0.7450523864959255, + "grad_norm": 0.33544042706489563, + "learning_rate": 0.0002, + "loss": 1.7736, + "step": 960 + }, + { + "epoch": 0.7528133488552581, + "grad_norm": 0.28593236207962036, + "learning_rate": 0.0002, + "loss": 1.6979, + "step": 970 + }, + { + "epoch": 0.7605743112145906, + "grad_norm": 0.313634991645813, + "learning_rate": 0.0002, + "loss": 1.8583, + "step": 980 + }, + { + "epoch": 0.7683352735739232, + "grad_norm": 0.2949385941028595, + "learning_rate": 0.0002, + "loss": 1.7473, + "step": 990 + }, + { + "epoch": 0.7760962359332557, + "grad_norm": 0.2920108437538147, + "learning_rate": 0.0002, + "loss": 1.8689, + "step": 1000 + }, + { + "epoch": 0.7838571982925883, + "grad_norm": 0.3245100677013397, + "learning_rate": 0.0002, + "loss": 1.8401, + "step": 1010 + }, + { + "epoch": 0.7916181606519208, + "grad_norm": 0.3007619380950928, + "learning_rate": 0.0002, + "loss": 1.7109, + "step": 1020 + }, + { + "epoch": 0.7993791230112534, + "grad_norm": 0.3630852997303009, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 1030 + }, + { + "epoch": 0.8071400853705859, + "grad_norm": 0.2856379747390747, + "learning_rate": 0.0002, + "loss": 1.7655, + "step": 1040 + }, + { + "epoch": 0.8149010477299186, + "grad_norm": 0.32476478815078735, + "learning_rate": 0.0002, + "loss": 1.8371, + "step": 1050 + }, + { + "epoch": 0.8226620100892511, + "grad_norm": 0.5162565112113953, + "learning_rate": 0.0002, + "loss": 1.8039, + "step": 1060 + }, + { + "epoch": 0.8304229724485837, + "grad_norm": 0.316496342420578, + "learning_rate": 0.0002, + "loss": 1.8862, + "step": 1070 + }, + { + "epoch": 0.8381839348079162, + "grad_norm": 0.31977516412734985, + "learning_rate": 0.0002, + "loss": 1.8023, + "step": 1080 + }, + { + "epoch": 0.8459448971672487, + "grad_norm": 0.269509494304657, + "learning_rate": 0.0002, + "loss": 1.8547, + "step": 1090 + }, + { + "epoch": 0.8537058595265813, + "grad_norm": 0.31621453166007996, + "learning_rate": 0.0002, + "loss": 1.7811, + "step": 1100 + }, + { + "epoch": 0.8614668218859138, + "grad_norm": 0.2946535050868988, + "learning_rate": 0.0002, + "loss": 1.739, + "step": 1110 + }, + { + "epoch": 0.8692277842452464, + "grad_norm": 0.3088909983634949, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 1120 + }, + { + "epoch": 0.8769887466045789, + "grad_norm": 0.33033716678619385, + "learning_rate": 0.0002, + "loss": 1.8228, + "step": 1130 + }, + { + "epoch": 0.8847497089639115, + "grad_norm": 0.2954833507537842, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 1140 + }, + { + "epoch": 0.8925106713232441, + "grad_norm": 0.2950248122215271, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 1150 + }, + { + "epoch": 0.9002716336825767, + "grad_norm": 0.296661913394928, + "learning_rate": 0.0002, + "loss": 1.7068, + "step": 1160 + }, + { + "epoch": 0.9080325960419092, + "grad_norm": 0.35451310873031616, + "learning_rate": 0.0002, + "loss": 1.7967, + "step": 1170 + }, + { + "epoch": 0.9157935584012418, + "grad_norm": 0.32705947756767273, + "learning_rate": 0.0002, + "loss": 1.8202, + "step": 1180 + }, + { + "epoch": 0.9235545207605743, + "grad_norm": 0.3333960771560669, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 1190 + }, + { + "epoch": 0.9313154831199069, + "grad_norm": 0.3042232096195221, + "learning_rate": 0.0002, + "loss": 1.7801, + "step": 1200 + }, + { + "epoch": 0.9390764454792394, + "grad_norm": 0.281553715467453, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 1210 + }, + { + "epoch": 0.946837407838572, + "grad_norm": 0.3096391558647156, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 1220 + }, + { + "epoch": 0.9545983701979045, + "grad_norm": 0.2866271734237671, + "learning_rate": 0.0002, + "loss": 1.7401, + "step": 1230 + }, + { + "epoch": 0.962359332557237, + "grad_norm": 0.28394097089767456, + "learning_rate": 0.0002, + "loss": 1.7211, + "step": 1240 + }, + { + "epoch": 0.9701202949165697, + "grad_norm": 0.3249266743659973, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 1250 + }, + { + "epoch": 0.9778812572759022, + "grad_norm": 0.2896869480609894, + "learning_rate": 0.0002, + "loss": 1.7563, + "step": 1260 + }, + { + "epoch": 0.9856422196352348, + "grad_norm": 0.29224586486816406, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 1270 + }, + { + "epoch": 0.9934031819945673, + "grad_norm": 0.2820223569869995, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 1280 + }, + { + "epoch": 0.9996119518820333, + "eval_loss": 1.8081045150756836, + "eval_runtime": 102.3056, + "eval_samples_per_second": 4.956, + "eval_steps_per_second": 0.626, + "step": 1288 + }, + { + "epoch": 1.0011641443538999, + "grad_norm": 0.3282551169395447, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 1290 + }, + { + "epoch": 1.0089251067132325, + "grad_norm": 0.30217495560646057, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 1300 + }, + { + "epoch": 1.016686069072565, + "grad_norm": 0.30801767110824585, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 1310 + }, + { + "epoch": 1.0244470314318976, + "grad_norm": 0.31816792488098145, + "learning_rate": 0.0002, + "loss": 1.7756, + "step": 1320 + }, + { + "epoch": 1.03220799379123, + "grad_norm": 0.27794334292411804, + "learning_rate": 0.0002, + "loss": 1.6986, + "step": 1330 + }, + { + "epoch": 1.0399689561505627, + "grad_norm": 0.3018926680088043, + "learning_rate": 0.0002, + "loss": 1.6931, + "step": 1340 + }, + { + "epoch": 1.0477299185098952, + "grad_norm": 0.3552975356578827, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1350 + }, + { + "epoch": 1.0554908808692278, + "grad_norm": 0.32590144872665405, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 1360 + }, + { + "epoch": 1.0632518432285603, + "grad_norm": 0.3435460925102234, + "learning_rate": 0.0002, + "loss": 1.6479, + "step": 1370 + }, + { + "epoch": 1.071012805587893, + "grad_norm": 0.35037797689437866, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 1380 + }, + { + "epoch": 1.0787737679472253, + "grad_norm": 0.31398263573646545, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 1390 + }, + { + "epoch": 1.086534730306558, + "grad_norm": 0.3134010434150696, + "learning_rate": 0.0002, + "loss": 1.6729, + "step": 1400 + }, + { + "epoch": 1.0942956926658907, + "grad_norm": 0.4599704444408417, + "learning_rate": 0.0002, + "loss": 1.751, + "step": 1410 + }, + { + "epoch": 1.102056655025223, + "grad_norm": 0.35852891206741333, + "learning_rate": 0.0002, + "loss": 1.6871, + "step": 1420 + }, + { + "epoch": 1.1098176173845558, + "grad_norm": 0.35628634691238403, + "learning_rate": 0.0002, + "loss": 1.7083, + "step": 1430 + }, + { + "epoch": 1.1175785797438882, + "grad_norm": 0.3769161105155945, + "learning_rate": 0.0002, + "loss": 1.6166, + "step": 1440 + }, + { + "epoch": 1.1253395421032208, + "grad_norm": 1.3712416887283325, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1450 + }, + { + "epoch": 1.1331005044625533, + "grad_norm": 0.38406670093536377, + "learning_rate": 0.0002, + "loss": 1.6542, + "step": 1460 + }, + { + "epoch": 1.140861466821886, + "grad_norm": 0.3402116000652313, + "learning_rate": 0.0002, + "loss": 1.7104, + "step": 1470 + }, + { + "epoch": 1.1486224291812184, + "grad_norm": 0.341189444065094, + "learning_rate": 0.0002, + "loss": 1.7074, + "step": 1480 + }, + { + "epoch": 1.156383391540551, + "grad_norm": 0.36629995703697205, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 1490 + }, + { + "epoch": 1.1641443538998835, + "grad_norm": 0.3499569296836853, + "learning_rate": 0.0002, + "loss": 1.6952, + "step": 1500 + }, + { + "epoch": 1.1719053162592161, + "grad_norm": 0.3663063943386078, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 1510 + }, + { + "epoch": 1.1796662786185488, + "grad_norm": 0.34851500391960144, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 1520 + }, + { + "epoch": 1.1874272409778812, + "grad_norm": 0.35071656107902527, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 1530 + }, + { + "epoch": 1.1951882033372139, + "grad_norm": 0.42783796787261963, + "learning_rate": 0.0002, + "loss": 1.7206, + "step": 1540 + }, + { + "epoch": 1.2029491656965463, + "grad_norm": 0.31830692291259766, + "learning_rate": 0.0002, + "loss": 1.7499, + "step": 1550 + }, + { + "epoch": 1.210710128055879, + "grad_norm": 0.3597424626350403, + "learning_rate": 0.0002, + "loss": 1.7372, + "step": 1560 + }, + { + "epoch": 1.2184710904152114, + "grad_norm": 0.35233765840530396, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 1570 + }, + { + "epoch": 1.226232052774544, + "grad_norm": 0.35942912101745605, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 1580 + }, + { + "epoch": 1.2339930151338767, + "grad_norm": 0.36159393191337585, + "learning_rate": 0.0002, + "loss": 1.6598, + "step": 1590 + }, + { + "epoch": 1.2417539774932091, + "grad_norm": 0.3328469693660736, + "learning_rate": 0.0002, + "loss": 1.6697, + "step": 1600 + }, + { + "epoch": 1.2495149398525418, + "grad_norm": 0.3089476525783539, + "learning_rate": 0.0002, + "loss": 1.7594, + "step": 1610 + }, + { + "epoch": 1.2572759022118742, + "grad_norm": 0.30947765707969666, + "learning_rate": 0.0002, + "loss": 1.6805, + "step": 1620 + }, + { + "epoch": 1.265036864571207, + "grad_norm": 0.32154011726379395, + "learning_rate": 0.0002, + "loss": 1.6899, + "step": 1630 + }, + { + "epoch": 1.2727978269305393, + "grad_norm": 0.3480297923088074, + "learning_rate": 0.0002, + "loss": 1.6621, + "step": 1640 + }, + { + "epoch": 1.280558789289872, + "grad_norm": 0.39471694827079773, + "learning_rate": 0.0002, + "loss": 1.7087, + "step": 1650 + }, + { + "epoch": 1.2883197516492044, + "grad_norm": 0.35728853940963745, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 1660 + }, + { + "epoch": 1.296080714008537, + "grad_norm": 0.35223081707954407, + "learning_rate": 0.0002, + "loss": 1.7008, + "step": 1670 + }, + { + "epoch": 1.3038416763678695, + "grad_norm": 0.3588867485523224, + "learning_rate": 0.0002, + "loss": 1.7253, + "step": 1680 + }, + { + "epoch": 1.3116026387272022, + "grad_norm": 0.3528042733669281, + "learning_rate": 0.0002, + "loss": 1.6505, + "step": 1690 + }, + { + "epoch": 1.3193636010865348, + "grad_norm": 0.35975801944732666, + "learning_rate": 0.0002, + "loss": 1.6945, + "step": 1700 + }, + { + "epoch": 1.3271245634458673, + "grad_norm": 0.36691880226135254, + "learning_rate": 0.0002, + "loss": 1.6631, + "step": 1710 + }, + { + "epoch": 1.3348855258052, + "grad_norm": 0.3787977695465088, + "learning_rate": 0.0002, + "loss": 1.7593, + "step": 1720 + }, + { + "epoch": 1.3426464881645324, + "grad_norm": 0.36614933609962463, + "learning_rate": 0.0002, + "loss": 1.7697, + "step": 1730 + }, + { + "epoch": 1.350407450523865, + "grad_norm": 0.3484745919704437, + "learning_rate": 0.0002, + "loss": 1.6487, + "step": 1740 + }, + { + "epoch": 1.3581684128831975, + "grad_norm": 0.36905673146247864, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 1750 + }, + { + "epoch": 1.36592937524253, + "grad_norm": 0.41564738750457764, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 1760 + }, + { + "epoch": 1.3736903376018628, + "grad_norm": 0.3345205783843994, + "learning_rate": 0.0002, + "loss": 1.6634, + "step": 1770 + }, + { + "epoch": 1.3814512999611952, + "grad_norm": 0.34926071763038635, + "learning_rate": 0.0002, + "loss": 1.7275, + "step": 1780 + }, + { + "epoch": 1.3892122623205276, + "grad_norm": 0.42004233598709106, + "learning_rate": 0.0002, + "loss": 1.685, + "step": 1790 + }, + { + "epoch": 1.3969732246798603, + "grad_norm": 0.3576236963272095, + "learning_rate": 0.0002, + "loss": 1.666, + "step": 1800 + }, + { + "epoch": 1.404734187039193, + "grad_norm": 0.3586704432964325, + "learning_rate": 0.0002, + "loss": 1.8516, + "step": 1810 + }, + { + "epoch": 1.4124951493985254, + "grad_norm": 0.3943439722061157, + "learning_rate": 0.0002, + "loss": 1.6171, + "step": 1820 + }, + { + "epoch": 1.420256111757858, + "grad_norm": 0.3484877049922943, + "learning_rate": 0.0002, + "loss": 1.6865, + "step": 1830 + }, + { + "epoch": 1.4280170741171905, + "grad_norm": 0.3344518840312958, + "learning_rate": 0.0002, + "loss": 1.7205, + "step": 1840 + }, + { + "epoch": 1.4357780364765231, + "grad_norm": 0.4345698356628418, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1850 + }, + { + "epoch": 1.4435389988358556, + "grad_norm": 0.5525162220001221, + "learning_rate": 0.0002, + "loss": 1.6855, + "step": 1860 + }, + { + "epoch": 1.4512999611951882, + "grad_norm": 0.37194496393203735, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1870 + }, + { + "epoch": 1.4590609235545209, + "grad_norm": 0.34570157527923584, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 1880 + }, + { + "epoch": 1.4668218859138533, + "grad_norm": 0.3512282073497772, + "learning_rate": 0.0002, + "loss": 1.7, + "step": 1890 + }, + { + "epoch": 1.4745828482731858, + "grad_norm": 0.3443922996520996, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1900 + }, + { + "epoch": 1.4823438106325184, + "grad_norm": 0.3812018036842346, + "learning_rate": 0.0002, + "loss": 1.7393, + "step": 1910 + }, + { + "epoch": 1.490104772991851, + "grad_norm": 0.39263492822647095, + "learning_rate": 0.0002, + "loss": 1.7277, + "step": 1920 + }, + { + "epoch": 1.4978657353511835, + "grad_norm": 0.3146156072616577, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 1930 + }, + { + "epoch": 1.505626697710516, + "grad_norm": 0.3653988540172577, + "learning_rate": 0.0002, + "loss": 1.6881, + "step": 1940 + }, + { + "epoch": 1.5133876600698488, + "grad_norm": 0.3966596722602844, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 1950 + }, + { + "epoch": 1.5211486224291813, + "grad_norm": 0.3441697359085083, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 1960 + }, + { + "epoch": 1.5289095847885137, + "grad_norm": 0.3328564465045929, + "learning_rate": 0.0002, + "loss": 1.7175, + "step": 1970 + }, + { + "epoch": 1.5366705471478463, + "grad_norm": 0.34068772196769714, + "learning_rate": 0.0002, + "loss": 1.7394, + "step": 1980 + }, + { + "epoch": 1.544431509507179, + "grad_norm": 0.3559795916080475, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 1990 + }, + { + "epoch": 1.5521924718665114, + "grad_norm": 0.37888768315315247, + "learning_rate": 0.0002, + "loss": 1.7102, + "step": 2000 + }, + { + "epoch": 1.5599534342258439, + "grad_norm": 0.36128363013267517, + "learning_rate": 0.0002, + "loss": 1.7094, + "step": 2010 + }, + { + "epoch": 1.5677143965851765, + "grad_norm": 0.3643714487552643, + "learning_rate": 0.0002, + "loss": 1.6407, + "step": 2020 + }, + { + "epoch": 1.5754753589445092, + "grad_norm": 0.3863612115383148, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 2030 + }, + { + "epoch": 1.5832363213038416, + "grad_norm": 0.32831457257270813, + "learning_rate": 0.0002, + "loss": 1.6575, + "step": 2040 + }, + { + "epoch": 1.5909972836631743, + "grad_norm": 0.36098113656044006, + "learning_rate": 0.0002, + "loss": 1.7404, + "step": 2050 + }, + { + "epoch": 1.598758246022507, + "grad_norm": 1.1079334020614624, + "learning_rate": 0.0002, + "loss": 1.7065, + "step": 2060 + }, + { + "epoch": 1.6065192083818394, + "grad_norm": 0.35615381598472595, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 2070 + }, + { + "epoch": 1.6142801707411718, + "grad_norm": 0.369711309671402, + "learning_rate": 0.0002, + "loss": 1.7262, + "step": 2080 + }, + { + "epoch": 1.6220411331005045, + "grad_norm": 0.390658438205719, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 2090 + }, + { + "epoch": 1.6298020954598371, + "grad_norm": 0.3422999382019043, + "learning_rate": 0.0002, + "loss": 1.6996, + "step": 2100 + }, + { + "epoch": 1.6375630578191696, + "grad_norm": 0.372475266456604, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 2110 + }, + { + "epoch": 1.645324020178502, + "grad_norm": 0.35660576820373535, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 2120 + }, + { + "epoch": 1.6530849825378346, + "grad_norm": 0.35754942893981934, + "learning_rate": 0.0002, + "loss": 1.6991, + "step": 2130 + }, + { + "epoch": 1.6608459448971673, + "grad_norm": 0.34572410583496094, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 2140 + }, + { + "epoch": 1.6686069072564997, + "grad_norm": 0.42059701681137085, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 2150 + }, + { + "epoch": 1.6763678696158324, + "grad_norm": 0.35200759768486023, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 2160 + }, + { + "epoch": 1.684128831975165, + "grad_norm": 0.3704029321670532, + "learning_rate": 0.0002, + "loss": 1.6869, + "step": 2170 + }, + { + "epoch": 1.6918897943344975, + "grad_norm": 0.40450501441955566, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 2180 + }, + { + "epoch": 1.69965075669383, + "grad_norm": 0.362966924905777, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 2190 + }, + { + "epoch": 1.7074117190531626, + "grad_norm": 0.36586204171180725, + "learning_rate": 0.0002, + "loss": 1.6935, + "step": 2200 + }, + { + "epoch": 1.7151726814124952, + "grad_norm": 0.3295372426509857, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 2210 + }, + { + "epoch": 1.7229336437718277, + "grad_norm": 0.3892575800418854, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 2220 + }, + { + "epoch": 1.73069460613116, + "grad_norm": 0.34712135791778564, + "learning_rate": 0.0002, + "loss": 1.7805, + "step": 2230 + }, + { + "epoch": 1.738455568490493, + "grad_norm": 0.34801796078681946, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 2240 + }, + { + "epoch": 1.7462165308498254, + "grad_norm": 0.3822397291660309, + "learning_rate": 0.0002, + "loss": 1.7009, + "step": 2250 + }, + { + "epoch": 1.7539774932091579, + "grad_norm": 0.38933250308036804, + "learning_rate": 0.0002, + "loss": 1.6546, + "step": 2260 + }, + { + "epoch": 1.7617384555684905, + "grad_norm": 0.3798373341560364, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 2270 + }, + { + "epoch": 1.7694994179278232, + "grad_norm": 0.35151317715644836, + "learning_rate": 0.0002, + "loss": 1.6508, + "step": 2280 + }, + { + "epoch": 1.7772603802871556, + "grad_norm": 0.44981494545936584, + "learning_rate": 0.0002, + "loss": 1.6894, + "step": 2290 + }, + { + "epoch": 1.785021342646488, + "grad_norm": 0.3992624580860138, + "learning_rate": 0.0002, + "loss": 1.7271, + "step": 2300 + }, + { + "epoch": 1.7927823050058207, + "grad_norm": 0.3772512376308441, + "learning_rate": 0.0002, + "loss": 1.7252, + "step": 2310 + }, + { + "epoch": 1.8005432673651534, + "grad_norm": 0.3511589467525482, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 2320 + }, + { + "epoch": 1.8083042297244858, + "grad_norm": 0.3805285394191742, + "learning_rate": 0.0002, + "loss": 1.764, + "step": 2330 + }, + { + "epoch": 1.8160651920838184, + "grad_norm": 0.3792071044445038, + "learning_rate": 0.0002, + "loss": 1.6986, + "step": 2340 + }, + { + "epoch": 1.823826154443151, + "grad_norm": 0.36430829763412476, + "learning_rate": 0.0002, + "loss": 1.7759, + "step": 2350 + }, + { + "epoch": 1.8315871168024835, + "grad_norm": 0.36502477526664734, + "learning_rate": 0.0002, + "loss": 1.6773, + "step": 2360 + }, + { + "epoch": 1.839348079161816, + "grad_norm": 0.35015153884887695, + "learning_rate": 0.0002, + "loss": 1.8072, + "step": 2370 + }, + { + "epoch": 1.8471090415211486, + "grad_norm": 0.3710903823375702, + "learning_rate": 0.0002, + "loss": 1.7734, + "step": 2380 + }, + { + "epoch": 1.8548700038804813, + "grad_norm": 0.3542828857898712, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 2390 + }, + { + "epoch": 1.8626309662398137, + "grad_norm": 0.35467568039894104, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 2400 + }, + { + "epoch": 1.8703919285991462, + "grad_norm": 0.3638560473918915, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 2410 + }, + { + "epoch": 1.8781528909584788, + "grad_norm": 0.3823298215866089, + "learning_rate": 0.0002, + "loss": 1.7019, + "step": 2420 + }, + { + "epoch": 1.8859138533178115, + "grad_norm": 0.3926416337490082, + "learning_rate": 0.0002, + "loss": 1.6935, + "step": 2430 + }, + { + "epoch": 1.893674815677144, + "grad_norm": 0.3608079254627228, + "learning_rate": 0.0002, + "loss": 1.71, + "step": 2440 + }, + { + "epoch": 1.9014357780364766, + "grad_norm": 0.3426613509654999, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 2450 + }, + { + "epoch": 1.9091967403958092, + "grad_norm": 0.3522338569164276, + "learning_rate": 0.0002, + "loss": 1.6892, + "step": 2460 + }, + { + "epoch": 1.9169577027551417, + "grad_norm": 0.3608049154281616, + "learning_rate": 0.0002, + "loss": 1.7307, + "step": 2470 + }, + { + "epoch": 1.924718665114474, + "grad_norm": 0.3849755525588989, + "learning_rate": 0.0002, + "loss": 1.6823, + "step": 2480 + }, + { + "epoch": 1.9324796274738067, + "grad_norm": 0.4154011011123657, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 2490 + }, + { + "epoch": 1.9402405898331394, + "grad_norm": 0.3602796792984009, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 2500 + }, + { + "epoch": 1.9480015521924718, + "grad_norm": 0.3702992796897888, + "learning_rate": 0.0002, + "loss": 1.7843, + "step": 2510 + }, + { + "epoch": 1.9557625145518043, + "grad_norm": 0.3657735288143158, + "learning_rate": 0.0002, + "loss": 1.6669, + "step": 2520 + }, + { + "epoch": 1.963523476911137, + "grad_norm": 0.41031739115715027, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 2530 + }, + { + "epoch": 1.9712844392704696, + "grad_norm": 0.34578680992126465, + "learning_rate": 0.0002, + "loss": 1.6745, + "step": 2540 + }, + { + "epoch": 1.979045401629802, + "grad_norm": 0.3361521065235138, + "learning_rate": 0.0002, + "loss": 1.723, + "step": 2550 + }, + { + "epoch": 1.9868063639891347, + "grad_norm": 0.34342363476753235, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 2560 + }, + { + "epoch": 1.9945673263484673, + "grad_norm": 0.32954007387161255, + "learning_rate": 0.0002, + "loss": 1.6577, + "step": 2570 + }, + { + "epoch": 2.0, + "eval_loss": 1.8068748712539673, + "eval_runtime": 105.5885, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 0.606, + "step": 2577 + }, + { + "epoch": 2.0023282887077998, + "grad_norm": 0.336302250623703, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 2580 + }, + { + "epoch": 2.010089251067132, + "grad_norm": 0.3627048432826996, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 2590 + }, + { + "epoch": 2.017850213426465, + "grad_norm": 0.38406702876091003, + "learning_rate": 0.0002, + "loss": 1.4908, + "step": 2600 + }, + { + "epoch": 2.0256111757857975, + "grad_norm": 0.5326781272888184, + "learning_rate": 0.0002, + "loss": 1.5368, + "step": 2610 + }, + { + "epoch": 2.03337213814513, + "grad_norm": 0.4774554967880249, + "learning_rate": 0.0002, + "loss": 1.5727, + "step": 2620 + }, + { + "epoch": 2.0411331005044624, + "grad_norm": 0.4251810312271118, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 2630 + }, + { + "epoch": 2.0488940628637953, + "grad_norm": 0.4693007171154022, + "learning_rate": 0.0002, + "loss": 1.5152, + "step": 2640 + }, + { + "epoch": 2.0566550252231277, + "grad_norm": 0.46371519565582275, + "learning_rate": 0.0002, + "loss": 1.6137, + "step": 2650 + }, + { + "epoch": 2.06441598758246, + "grad_norm": 0.46652570366859436, + "learning_rate": 0.0002, + "loss": 1.6304, + "step": 2660 + }, + { + "epoch": 2.0721769499417926, + "grad_norm": 0.45200315117836, + "learning_rate": 0.0002, + "loss": 1.6022, + "step": 2670 + }, + { + "epoch": 2.0799379123011255, + "grad_norm": 0.42905205488204956, + "learning_rate": 0.0002, + "loss": 1.5358, + "step": 2680 + }, + { + "epoch": 2.087698874660458, + "grad_norm": 0.44509148597717285, + "learning_rate": 0.0002, + "loss": 1.5401, + "step": 2690 + }, + { + "epoch": 2.0954598370197903, + "grad_norm": 0.4445319175720215, + "learning_rate": 0.0002, + "loss": 1.5303, + "step": 2700 + }, + { + "epoch": 2.103220799379123, + "grad_norm": 0.46825504302978516, + "learning_rate": 0.0002, + "loss": 1.5701, + "step": 2710 + }, + { + "epoch": 2.1109817617384556, + "grad_norm": 0.4623856842517853, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 2720 + }, + { + "epoch": 2.118742724097788, + "grad_norm": 0.4833452105522156, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 2730 + }, + { + "epoch": 2.1265036864571205, + "grad_norm": 0.4582686722278595, + "learning_rate": 0.0002, + "loss": 1.5997, + "step": 2740 + }, + { + "epoch": 2.1342646488164534, + "grad_norm": 0.47587934136390686, + "learning_rate": 0.0002, + "loss": 1.5801, + "step": 2750 + }, + { + "epoch": 2.142025611175786, + "grad_norm": 0.4602217972278595, + "learning_rate": 0.0002, + "loss": 1.594, + "step": 2760 + }, + { + "epoch": 2.1497865735351183, + "grad_norm": 0.47501352429389954, + "learning_rate": 0.0002, + "loss": 1.5271, + "step": 2770 + }, + { + "epoch": 2.1575475358944507, + "grad_norm": 0.5078499913215637, + "learning_rate": 0.0002, + "loss": 1.4862, + "step": 2780 + }, + { + "epoch": 2.1653084982537836, + "grad_norm": 0.497704416513443, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 2790 + }, + { + "epoch": 2.173069460613116, + "grad_norm": 0.5435971617698669, + "learning_rate": 0.0002, + "loss": 1.5597, + "step": 2800 + }, + { + "epoch": 2.1808304229724484, + "grad_norm": 0.5172356367111206, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 2810 + }, + { + "epoch": 2.1885913853317813, + "grad_norm": 0.44063422083854675, + "learning_rate": 0.0002, + "loss": 1.5202, + "step": 2820 + }, + { + "epoch": 2.1963523476911138, + "grad_norm": 0.5079569220542908, + "learning_rate": 0.0002, + "loss": 1.6041, + "step": 2830 + }, + { + "epoch": 2.204113310050446, + "grad_norm": 0.45658132433891296, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 2840 + }, + { + "epoch": 2.2118742724097786, + "grad_norm": 0.5103023648262024, + "learning_rate": 0.0002, + "loss": 1.5546, + "step": 2850 + }, + { + "epoch": 2.2196352347691115, + "grad_norm": 0.4882226288318634, + "learning_rate": 0.0002, + "loss": 1.6197, + "step": 2860 + }, + { + "epoch": 2.227396197128444, + "grad_norm": 0.5087296962738037, + "learning_rate": 0.0002, + "loss": 1.5996, + "step": 2870 + }, + { + "epoch": 2.2351571594877764, + "grad_norm": 0.45293712615966797, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 2880 + }, + { + "epoch": 2.242918121847109, + "grad_norm": 0.5120379328727722, + "learning_rate": 0.0002, + "loss": 1.6214, + "step": 2890 + }, + { + "epoch": 2.2506790842064417, + "grad_norm": 0.47126415371894836, + "learning_rate": 0.0002, + "loss": 1.5273, + "step": 2900 + }, + { + "epoch": 2.258440046565774, + "grad_norm": 0.44005846977233887, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 2910 + }, + { + "epoch": 2.2662010089251066, + "grad_norm": 0.46476176381111145, + "learning_rate": 0.0002, + "loss": 1.6023, + "step": 2920 + }, + { + "epoch": 2.2739619712844394, + "grad_norm": 0.48051515221595764, + "learning_rate": 0.0002, + "loss": 1.6417, + "step": 2930 + }, + { + "epoch": 2.281722933643772, + "grad_norm": 0.480069637298584, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 2940 + }, + { + "epoch": 2.2894838960031043, + "grad_norm": 0.5122102499008179, + "learning_rate": 0.0002, + "loss": 1.5747, + "step": 2950 + }, + { + "epoch": 2.2972448583624367, + "grad_norm": 0.48879891633987427, + "learning_rate": 0.0002, + "loss": 1.5183, + "step": 2960 + }, + { + "epoch": 2.3050058207217696, + "grad_norm": 0.4973136782646179, + "learning_rate": 0.0002, + "loss": 1.5483, + "step": 2970 + }, + { + "epoch": 2.312766783081102, + "grad_norm": 0.5522695183753967, + "learning_rate": 0.0002, + "loss": 1.677, + "step": 2980 + }, + { + "epoch": 2.3205277454404345, + "grad_norm": 0.5220217704772949, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 2990 + }, + { + "epoch": 2.328288707799767, + "grad_norm": 0.4978662431240082, + "learning_rate": 0.0002, + "loss": 1.6299, + "step": 3000 + }, + { + "epoch": 2.3360496701591, + "grad_norm": 0.554053544998169, + "learning_rate": 0.0002, + "loss": 1.5498, + "step": 3010 + }, + { + "epoch": 2.3438106325184322, + "grad_norm": 0.4703886806964874, + "learning_rate": 0.0002, + "loss": 1.5356, + "step": 3020 + }, + { + "epoch": 2.3515715948777647, + "grad_norm": 0.5074123740196228, + "learning_rate": 0.0002, + "loss": 1.5418, + "step": 3030 + }, + { + "epoch": 2.3593325572370976, + "grad_norm": 0.5088278651237488, + "learning_rate": 0.0002, + "loss": 1.6873, + "step": 3040 + }, + { + "epoch": 2.36709351959643, + "grad_norm": 0.4752114415168762, + "learning_rate": 0.0002, + "loss": 1.5249, + "step": 3050 + }, + { + "epoch": 2.3748544819557624, + "grad_norm": 0.5121659636497498, + "learning_rate": 0.0002, + "loss": 1.5353, + "step": 3060 + }, + { + "epoch": 2.3826154443150953, + "grad_norm": 0.48649218678474426, + "learning_rate": 0.0002, + "loss": 1.6426, + "step": 3070 + }, + { + "epoch": 2.3903764066744277, + "grad_norm": 0.5209488868713379, + "learning_rate": 0.0002, + "loss": 1.6136, + "step": 3080 + }, + { + "epoch": 2.39813736903376, + "grad_norm": 0.5110517740249634, + "learning_rate": 0.0002, + "loss": 1.597, + "step": 3090 + }, + { + "epoch": 2.4058983313930926, + "grad_norm": 0.5609337091445923, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 3100 + }, + { + "epoch": 2.4136592937524255, + "grad_norm": 0.5191826224327087, + "learning_rate": 0.0002, + "loss": 1.5438, + "step": 3110 + }, + { + "epoch": 2.421420256111758, + "grad_norm": 0.4876069724559784, + "learning_rate": 0.0002, + "loss": 1.6347, + "step": 3120 + }, + { + "epoch": 2.4291812184710904, + "grad_norm": 0.4713933765888214, + "learning_rate": 0.0002, + "loss": 1.5565, + "step": 3130 + }, + { + "epoch": 2.436942180830423, + "grad_norm": 0.5102227330207825, + "learning_rate": 0.0002, + "loss": 1.6388, + "step": 3140 + }, + { + "epoch": 2.4447031431897557, + "grad_norm": 0.44546666741371155, + "learning_rate": 0.0002, + "loss": 1.5667, + "step": 3150 + }, + { + "epoch": 2.452464105549088, + "grad_norm": 0.5167558193206787, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 3160 + }, + { + "epoch": 2.4602250679084205, + "grad_norm": 0.5226958990097046, + "learning_rate": 0.0002, + "loss": 1.5673, + "step": 3170 + }, + { + "epoch": 2.4679860302677534, + "grad_norm": 0.4751799702644348, + "learning_rate": 0.0002, + "loss": 1.5758, + "step": 3180 + }, + { + "epoch": 2.475746992627086, + "grad_norm": 0.4744729697704315, + "learning_rate": 0.0002, + "loss": 1.6234, + "step": 3190 + }, + { + "epoch": 2.4835079549864183, + "grad_norm": 0.5203230381011963, + "learning_rate": 0.0002, + "loss": 1.5661, + "step": 3200 + }, + { + "epoch": 2.4912689173457507, + "grad_norm": 0.47209781408309937, + "learning_rate": 0.0002, + "loss": 1.493, + "step": 3210 + }, + { + "epoch": 2.4990298797050836, + "grad_norm": 0.5241674780845642, + "learning_rate": 0.0002, + "loss": 1.6415, + "step": 3220 + }, + { + "epoch": 2.506790842064416, + "grad_norm": 0.5152244567871094, + "learning_rate": 0.0002, + "loss": 1.6324, + "step": 3230 + }, + { + "epoch": 2.5145518044237485, + "grad_norm": 0.5216741561889648, + "learning_rate": 0.0002, + "loss": 1.6248, + "step": 3240 + }, + { + "epoch": 2.522312766783081, + "grad_norm": 0.4953259527683258, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 3250 + }, + { + "epoch": 2.530073729142414, + "grad_norm": 0.5973829030990601, + "learning_rate": 0.0002, + "loss": 1.666, + "step": 3260 + }, + { + "epoch": 2.5378346915017462, + "grad_norm": 0.48804202675819397, + "learning_rate": 0.0002, + "loss": 1.5295, + "step": 3270 + }, + { + "epoch": 2.5455956538610787, + "grad_norm": 0.5334644317626953, + "learning_rate": 0.0002, + "loss": 1.4954, + "step": 3280 + }, + { + "epoch": 2.5533566162204115, + "grad_norm": 0.46873313188552856, + "learning_rate": 0.0002, + "loss": 1.5814, + "step": 3290 + }, + { + "epoch": 2.561117578579744, + "grad_norm": 0.4282589554786682, + "learning_rate": 0.0002, + "loss": 1.5362, + "step": 3300 + }, + { + "epoch": 2.5688785409390764, + "grad_norm": 0.4848293960094452, + "learning_rate": 0.0002, + "loss": 1.6278, + "step": 3310 + }, + { + "epoch": 2.576639503298409, + "grad_norm": 0.5093745589256287, + "learning_rate": 0.0002, + "loss": 1.6308, + "step": 3320 + }, + { + "epoch": 2.5844004656577413, + "grad_norm": 0.5084842443466187, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 3330 + }, + { + "epoch": 2.592161428017074, + "grad_norm": 0.4696281850337982, + "learning_rate": 0.0002, + "loss": 1.6168, + "step": 3340 + }, + { + "epoch": 2.5999223903764066, + "grad_norm": 0.5767765641212463, + "learning_rate": 0.0002, + "loss": 1.5359, + "step": 3350 + }, + { + "epoch": 2.607683352735739, + "grad_norm": 0.47300875186920166, + "learning_rate": 0.0002, + "loss": 1.6097, + "step": 3360 + }, + { + "epoch": 2.615444315095072, + "grad_norm": 0.4809158146381378, + "learning_rate": 0.0002, + "loss": 1.6138, + "step": 3370 + }, + { + "epoch": 2.6232052774544043, + "grad_norm": 0.5141063928604126, + "learning_rate": 0.0002, + "loss": 1.4952, + "step": 3380 + }, + { + "epoch": 2.630966239813737, + "grad_norm": 0.4832935035228729, + "learning_rate": 0.0002, + "loss": 1.5784, + "step": 3390 + }, + { + "epoch": 2.6387272021730697, + "grad_norm": 0.5044625401496887, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 3400 + }, + { + "epoch": 2.646488164532402, + "grad_norm": 0.5287680625915527, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 3410 + }, + { + "epoch": 2.6542491268917345, + "grad_norm": 0.5306379795074463, + "learning_rate": 0.0002, + "loss": 1.5423, + "step": 3420 + }, + { + "epoch": 2.662010089251067, + "grad_norm": 0.5849291682243347, + "learning_rate": 0.0002, + "loss": 1.5264, + "step": 3430 + }, + { + "epoch": 2.6697710516104, + "grad_norm": 0.7951080799102783, + "learning_rate": 0.0002, + "loss": 1.5937, + "step": 3440 + }, + { + "epoch": 2.6775320139697323, + "grad_norm": 0.48087653517723083, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 3450 + }, + { + "epoch": 2.6852929763290647, + "grad_norm": 0.5396431684494019, + "learning_rate": 0.0002, + "loss": 1.6769, + "step": 3460 + }, + { + "epoch": 2.693053938688397, + "grad_norm": 0.5481634736061096, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 3470 + }, + { + "epoch": 2.70081490104773, + "grad_norm": 0.5068731307983398, + "learning_rate": 0.0002, + "loss": 1.6436, + "step": 3480 + }, + { + "epoch": 2.7085758634070625, + "grad_norm": 0.5759826898574829, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 3490 + }, + { + "epoch": 2.716336825766395, + "grad_norm": 0.7253932952880859, + "learning_rate": 0.0002, + "loss": 1.596, + "step": 3500 + }, + { + "epoch": 2.724097788125728, + "grad_norm": 0.527745246887207, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 3510 + }, + { + "epoch": 2.73185875048506, + "grad_norm": 0.5279242396354675, + "learning_rate": 0.0002, + "loss": 1.5874, + "step": 3520 + }, + { + "epoch": 2.7396197128443927, + "grad_norm": 0.5047839283943176, + "learning_rate": 0.0002, + "loss": 1.6768, + "step": 3530 + }, + { + "epoch": 2.7473806752037255, + "grad_norm": 0.5430883169174194, + "learning_rate": 0.0002, + "loss": 1.5517, + "step": 3540 + }, + { + "epoch": 2.755141637563058, + "grad_norm": 0.4496723711490631, + "learning_rate": 0.0002, + "loss": 1.5624, + "step": 3550 + }, + { + "epoch": 2.7629025999223904, + "grad_norm": 0.5063338875770569, + "learning_rate": 0.0002, + "loss": 1.5789, + "step": 3560 + }, + { + "epoch": 2.770663562281723, + "grad_norm": 0.4619026780128479, + "learning_rate": 0.0002, + "loss": 1.52, + "step": 3570 + }, + { + "epoch": 2.7784245246410553, + "grad_norm": 0.4753304123878479, + "learning_rate": 0.0002, + "loss": 1.5793, + "step": 3580 + }, + { + "epoch": 2.786185487000388, + "grad_norm": 0.5422708988189697, + "learning_rate": 0.0002, + "loss": 1.5715, + "step": 3590 + }, + { + "epoch": 2.7939464493597206, + "grad_norm": 0.4756578803062439, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 3600 + }, + { + "epoch": 2.801707411719053, + "grad_norm": 0.5057567358016968, + "learning_rate": 0.0002, + "loss": 1.5358, + "step": 3610 + }, + { + "epoch": 2.809468374078386, + "grad_norm": 0.5410919785499573, + "learning_rate": 0.0002, + "loss": 1.6131, + "step": 3620 + }, + { + "epoch": 2.8172293364377183, + "grad_norm": 0.4958136975765228, + "learning_rate": 0.0002, + "loss": 1.5573, + "step": 3630 + }, + { + "epoch": 2.8249902987970508, + "grad_norm": 0.454527348279953, + "learning_rate": 0.0002, + "loss": 1.6324, + "step": 3640 + }, + { + "epoch": 2.8327512611563836, + "grad_norm": 0.5092706084251404, + "learning_rate": 0.0002, + "loss": 1.5582, + "step": 3650 + }, + { + "epoch": 2.840512223515716, + "grad_norm": 0.5314022302627563, + "learning_rate": 0.0002, + "loss": 1.5893, + "step": 3660 + }, + { + "epoch": 2.8482731858750485, + "grad_norm": 0.5028239488601685, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 3670 + }, + { + "epoch": 2.856034148234381, + "grad_norm": 0.5127444863319397, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 3680 + }, + { + "epoch": 2.8637951105937134, + "grad_norm": 0.5045645236968994, + "learning_rate": 0.0002, + "loss": 1.6018, + "step": 3690 + }, + { + "epoch": 2.8715560729530463, + "grad_norm": 0.5560781955718994, + "learning_rate": 0.0002, + "loss": 1.5788, + "step": 3700 + }, + { + "epoch": 2.8793170353123787, + "grad_norm": 0.5177600383758545, + "learning_rate": 0.0002, + "loss": 1.5988, + "step": 3710 + }, + { + "epoch": 2.887077997671711, + "grad_norm": 0.45830899477005005, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 3720 + }, + { + "epoch": 2.894838960031044, + "grad_norm": 0.4828629195690155, + "learning_rate": 0.0002, + "loss": 1.6344, + "step": 3730 + }, + { + "epoch": 2.9025999223903765, + "grad_norm": 0.48241183161735535, + "learning_rate": 0.0002, + "loss": 1.6758, + "step": 3740 + }, + { + "epoch": 2.910360884749709, + "grad_norm": 0.4909592568874359, + "learning_rate": 0.0002, + "loss": 1.5649, + "step": 3750 + }, + { + "epoch": 2.9181218471090418, + "grad_norm": 0.44677025079727173, + "learning_rate": 0.0002, + "loss": 1.4927, + "step": 3760 + }, + { + "epoch": 2.925882809468374, + "grad_norm": 0.4928834140300751, + "learning_rate": 0.0002, + "loss": 1.5067, + "step": 3770 + }, + { + "epoch": 2.9336437718277066, + "grad_norm": 0.5673553347587585, + "learning_rate": 0.0002, + "loss": 1.5843, + "step": 3780 + }, + { + "epoch": 2.941404734187039, + "grad_norm": 0.548190712928772, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 3790 + }, + { + "epoch": 2.9491656965463715, + "grad_norm": 0.48979803919792175, + "learning_rate": 0.0002, + "loss": 1.5892, + "step": 3800 + }, + { + "epoch": 2.9569266589057044, + "grad_norm": 0.533191978931427, + "learning_rate": 0.0002, + "loss": 1.5589, + "step": 3810 + }, + { + "epoch": 2.964687621265037, + "grad_norm": 0.5362946391105652, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 3820 + }, + { + "epoch": 2.9724485836243693, + "grad_norm": 0.4724906384944916, + "learning_rate": 0.0002, + "loss": 1.6602, + "step": 3830 + }, + { + "epoch": 2.980209545983702, + "grad_norm": 0.5468461513519287, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 3840 + }, + { + "epoch": 2.9879705083430346, + "grad_norm": 0.4697108864784241, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 3850 + }, + { + "epoch": 2.995731470702367, + "grad_norm": 0.4780906140804291, + "learning_rate": 0.0002, + "loss": 1.6312, + "step": 3860 + }, + { + "epoch": 2.9996119518820334, + "eval_loss": 1.8472607135772705, + "eval_runtime": 106.5541, + "eval_samples_per_second": 4.758, + "eval_steps_per_second": 0.601, + "step": 3865 + }, + { + "epoch": 3.0034924330616994, + "grad_norm": 0.5645653605461121, + "learning_rate": 0.0002, + "loss": 1.4983, + "step": 3870 + }, + { + "epoch": 3.0112533954210323, + "grad_norm": 0.6457151174545288, + "learning_rate": 0.0002, + "loss": 1.4334, + "step": 3880 + }, + { + "epoch": 3.0190143577803648, + "grad_norm": 0.583838164806366, + "learning_rate": 0.0002, + "loss": 1.3899, + "step": 3890 + }, + { + "epoch": 3.026775320139697, + "grad_norm": 0.6819260120391846, + "learning_rate": 0.0002, + "loss": 1.3258, + "step": 3900 + }, + { + "epoch": 3.03453628249903, + "grad_norm": 0.6692903637886047, + "learning_rate": 0.0002, + "loss": 1.3458, + "step": 3910 + }, + { + "epoch": 3.0422972448583625, + "grad_norm": 0.6101024746894836, + "learning_rate": 0.0002, + "loss": 1.4356, + "step": 3920 + }, + { + "epoch": 3.050058207217695, + "grad_norm": 0.7014093399047852, + "learning_rate": 0.0002, + "loss": 1.394, + "step": 3930 + }, + { + "epoch": 3.0578191695770274, + "grad_norm": 0.7380381226539612, + "learning_rate": 0.0002, + "loss": 1.3885, + "step": 3940 + }, + { + "epoch": 3.0655801319363603, + "grad_norm": 0.6607900857925415, + "learning_rate": 0.0002, + "loss": 1.4206, + "step": 3950 + }, + { + "epoch": 3.0733410942956927, + "grad_norm": 0.735263466835022, + "learning_rate": 0.0002, + "loss": 1.4293, + "step": 3960 + }, + { + "epoch": 3.081102056655025, + "grad_norm": 0.6788513660430908, + "learning_rate": 0.0002, + "loss": 1.3966, + "step": 3970 + }, + { + "epoch": 3.088863019014358, + "grad_norm": 0.6347652673721313, + "learning_rate": 0.0002, + "loss": 1.3435, + "step": 3980 + }, + { + "epoch": 3.0966239813736904, + "grad_norm": 0.7056642770767212, + "learning_rate": 0.0002, + "loss": 1.4518, + "step": 3990 + }, + { + "epoch": 3.104384943733023, + "grad_norm": 0.6387075185775757, + "learning_rate": 0.0002, + "loss": 1.4474, + "step": 4000 + }, + { + "epoch": 3.1121459060923553, + "grad_norm": 0.6701116561889648, + "learning_rate": 0.0002, + "loss": 1.3833, + "step": 4010 + }, + { + "epoch": 3.119906868451688, + "grad_norm": 0.7558449506759644, + "learning_rate": 0.0002, + "loss": 1.404, + "step": 4020 + }, + { + "epoch": 3.1276678308110206, + "grad_norm": 0.6612881422042847, + "learning_rate": 0.0002, + "loss": 1.3294, + "step": 4030 + }, + { + "epoch": 3.135428793170353, + "grad_norm": 0.7474587559700012, + "learning_rate": 0.0002, + "loss": 1.439, + "step": 4040 + }, + { + "epoch": 3.1431897555296855, + "grad_norm": 0.7292373776435852, + "learning_rate": 0.0002, + "loss": 1.4616, + "step": 4050 + }, + { + "epoch": 3.1509507178890184, + "grad_norm": 0.7432886958122253, + "learning_rate": 0.0002, + "loss": 1.3908, + "step": 4060 + }, + { + "epoch": 3.158711680248351, + "grad_norm": 0.6366098523139954, + "learning_rate": 0.0002, + "loss": 1.4214, + "step": 4070 + }, + { + "epoch": 3.1664726426076832, + "grad_norm": 0.6837611794471741, + "learning_rate": 0.0002, + "loss": 1.5044, + "step": 4080 + }, + { + "epoch": 3.174233604967016, + "grad_norm": 0.7194393277168274, + "learning_rate": 0.0002, + "loss": 1.4332, + "step": 4090 + }, + { + "epoch": 3.1819945673263486, + "grad_norm": 0.6963607668876648, + "learning_rate": 0.0002, + "loss": 1.3628, + "step": 4100 + }, + { + "epoch": 3.189755529685681, + "grad_norm": 0.6404902935028076, + "learning_rate": 0.0002, + "loss": 1.4127, + "step": 4110 + }, + { + "epoch": 3.1975164920450134, + "grad_norm": 0.7172070741653442, + "learning_rate": 0.0002, + "loss": 1.4394, + "step": 4120 + }, + { + "epoch": 3.2052774544043463, + "grad_norm": 0.6577759385108948, + "learning_rate": 0.0002, + "loss": 1.4658, + "step": 4130 + }, + { + "epoch": 3.2130384167636787, + "grad_norm": 0.6658480167388916, + "learning_rate": 0.0002, + "loss": 1.4019, + "step": 4140 + }, + { + "epoch": 3.220799379123011, + "grad_norm": 0.6771699786186218, + "learning_rate": 0.0002, + "loss": 1.4348, + "step": 4150 + }, + { + "epoch": 3.2285603414823436, + "grad_norm": 0.699035108089447, + "learning_rate": 0.0002, + "loss": 1.4736, + "step": 4160 + }, + { + "epoch": 3.2363213038416765, + "grad_norm": 0.7218514680862427, + "learning_rate": 0.0002, + "loss": 1.4096, + "step": 4170 + }, + { + "epoch": 3.244082266201009, + "grad_norm": 0.6270631551742554, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 4180 + }, + { + "epoch": 3.2518432285603414, + "grad_norm": 0.6828921437263489, + "learning_rate": 0.0002, + "loss": 1.4076, + "step": 4190 + }, + { + "epoch": 3.2596041909196742, + "grad_norm": 0.6005498170852661, + "learning_rate": 0.0002, + "loss": 1.4663, + "step": 4200 + }, + { + "epoch": 3.2673651532790067, + "grad_norm": 0.6974790692329407, + "learning_rate": 0.0002, + "loss": 1.4798, + "step": 4210 + }, + { + "epoch": 3.275126115638339, + "grad_norm": 0.7269543409347534, + "learning_rate": 0.0002, + "loss": 1.5012, + "step": 4220 + }, + { + "epoch": 3.2828870779976715, + "grad_norm": 0.6728787422180176, + "learning_rate": 0.0002, + "loss": 1.3848, + "step": 4230 + }, + { + "epoch": 3.2906480403570044, + "grad_norm": 0.676972508430481, + "learning_rate": 0.0002, + "loss": 1.4112, + "step": 4240 + }, + { + "epoch": 3.298409002716337, + "grad_norm": 0.748309314250946, + "learning_rate": 0.0002, + "loss": 1.4206, + "step": 4250 + }, + { + "epoch": 3.3061699650756693, + "grad_norm": 0.6976589560508728, + "learning_rate": 0.0002, + "loss": 1.4973, + "step": 4260 + }, + { + "epoch": 3.3139309274350017, + "grad_norm": 0.649780809879303, + "learning_rate": 0.0002, + "loss": 1.3967, + "step": 4270 + }, + { + "epoch": 3.3216918897943346, + "grad_norm": 0.6529902815818787, + "learning_rate": 0.0002, + "loss": 1.327, + "step": 4280 + }, + { + "epoch": 3.329452852153667, + "grad_norm": 0.9273163676261902, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 4290 + }, + { + "epoch": 3.3372138145129995, + "grad_norm": 0.717024028301239, + "learning_rate": 0.0002, + "loss": 1.4859, + "step": 4300 + }, + { + "epoch": 3.3449747768723324, + "grad_norm": 0.7914950251579285, + "learning_rate": 0.0002, + "loss": 1.4441, + "step": 4310 + }, + { + "epoch": 3.352735739231665, + "grad_norm": 0.7133203148841858, + "learning_rate": 0.0002, + "loss": 1.432, + "step": 4320 + }, + { + "epoch": 3.3604967015909972, + "grad_norm": 0.7409568428993225, + "learning_rate": 0.0002, + "loss": 1.4662, + "step": 4330 + }, + { + "epoch": 3.3682576639503297, + "grad_norm": 0.6993981003761292, + "learning_rate": 0.0002, + "loss": 1.3992, + "step": 4340 + }, + { + "epoch": 3.3760186263096625, + "grad_norm": 0.7114535570144653, + "learning_rate": 0.0002, + "loss": 1.4261, + "step": 4350 + }, + { + "epoch": 3.383779588668995, + "grad_norm": 0.6790860295295715, + "learning_rate": 0.0002, + "loss": 1.4227, + "step": 4360 + }, + { + "epoch": 3.3915405510283274, + "grad_norm": 0.6507849097251892, + "learning_rate": 0.0002, + "loss": 1.4128, + "step": 4370 + }, + { + "epoch": 3.39930151338766, + "grad_norm": 0.5967804193496704, + "learning_rate": 0.0002, + "loss": 1.4559, + "step": 4380 + }, + { + "epoch": 3.4070624757469927, + "grad_norm": 0.6625847816467285, + "learning_rate": 0.0002, + "loss": 1.3687, + "step": 4390 + }, + { + "epoch": 3.414823438106325, + "grad_norm": 0.6736508011817932, + "learning_rate": 0.0002, + "loss": 1.4193, + "step": 4400 + }, + { + "epoch": 3.4225844004656576, + "grad_norm": 0.7870860695838928, + "learning_rate": 0.0002, + "loss": 1.4363, + "step": 4410 + }, + { + "epoch": 3.4303453628249905, + "grad_norm": 0.7205295562744141, + "learning_rate": 0.0002, + "loss": 1.4114, + "step": 4420 + }, + { + "epoch": 3.438106325184323, + "grad_norm": 0.6634634137153625, + "learning_rate": 0.0002, + "loss": 1.4131, + "step": 4430 + }, + { + "epoch": 3.4458672875436553, + "grad_norm": 0.7562733292579651, + "learning_rate": 0.0002, + "loss": 1.4683, + "step": 4440 + }, + { + "epoch": 3.453628249902988, + "grad_norm": 0.6585879921913147, + "learning_rate": 0.0002, + "loss": 1.3486, + "step": 4450 + }, + { + "epoch": 3.4613892122623207, + "grad_norm": 0.6896792054176331, + "learning_rate": 0.0002, + "loss": 1.4283, + "step": 4460 + }, + { + "epoch": 3.469150174621653, + "grad_norm": 0.6520342230796814, + "learning_rate": 0.0002, + "loss": 1.4208, + "step": 4470 + }, + { + "epoch": 3.4769111369809855, + "grad_norm": 0.6760806441307068, + "learning_rate": 0.0002, + "loss": 1.3423, + "step": 4480 + }, + { + "epoch": 3.484672099340318, + "grad_norm": 0.7539774179458618, + "learning_rate": 0.0002, + "loss": 1.4398, + "step": 4490 + }, + { + "epoch": 3.492433061699651, + "grad_norm": 0.7409411668777466, + "learning_rate": 0.0002, + "loss": 1.4534, + "step": 4500 + }, + { + "epoch": 3.5001940240589833, + "grad_norm": 0.6876253485679626, + "learning_rate": 0.0002, + "loss": 1.4069, + "step": 4510 + }, + { + "epoch": 3.5079549864183157, + "grad_norm": 0.7028461694717407, + "learning_rate": 0.0002, + "loss": 1.4228, + "step": 4520 + }, + { + "epoch": 3.5157159487776486, + "grad_norm": 0.8056529760360718, + "learning_rate": 0.0002, + "loss": 1.4723, + "step": 4530 + }, + { + "epoch": 3.523476911136981, + "grad_norm": 0.711338996887207, + "learning_rate": 0.0002, + "loss": 1.4148, + "step": 4540 + }, + { + "epoch": 3.5312378734963135, + "grad_norm": 0.7343552708625793, + "learning_rate": 0.0002, + "loss": 1.5247, + "step": 4550 + }, + { + "epoch": 3.5389988358556463, + "grad_norm": 0.745479941368103, + "learning_rate": 0.0002, + "loss": 1.4308, + "step": 4560 + }, + { + "epoch": 3.5467597982149788, + "grad_norm": 0.7582294940948486, + "learning_rate": 0.0002, + "loss": 1.4229, + "step": 4570 + }, + { + "epoch": 3.554520760574311, + "grad_norm": 0.6717444658279419, + "learning_rate": 0.0002, + "loss": 1.4127, + "step": 4580 + }, + { + "epoch": 3.5622817229336436, + "grad_norm": 0.7417883276939392, + "learning_rate": 0.0002, + "loss": 1.4368, + "step": 4590 + }, + { + "epoch": 3.570042685292976, + "grad_norm": 0.6385737061500549, + "learning_rate": 0.0002, + "loss": 1.4176, + "step": 4600 + }, + { + "epoch": 3.577803647652309, + "grad_norm": 0.716704249382019, + "learning_rate": 0.0002, + "loss": 1.3981, + "step": 4610 + }, + { + "epoch": 3.5855646100116414, + "grad_norm": 0.6948980093002319, + "learning_rate": 0.0002, + "loss": 1.3889, + "step": 4620 + }, + { + "epoch": 3.593325572370974, + "grad_norm": 0.6961140036582947, + "learning_rate": 0.0002, + "loss": 1.5177, + "step": 4630 + }, + { + "epoch": 3.6010865347303067, + "grad_norm": 0.7493122220039368, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 4640 + }, + { + "epoch": 3.608847497089639, + "grad_norm": 0.7431658506393433, + "learning_rate": 0.0002, + "loss": 1.3987, + "step": 4650 + }, + { + "epoch": 3.6166084594489716, + "grad_norm": 0.8353387713432312, + "learning_rate": 0.0002, + "loss": 1.4551, + "step": 4660 + }, + { + "epoch": 3.6243694218083045, + "grad_norm": 0.7095612287521362, + "learning_rate": 0.0002, + "loss": 1.4533, + "step": 4670 + }, + { + "epoch": 3.632130384167637, + "grad_norm": 0.776620090007782, + "learning_rate": 0.0002, + "loss": 1.4003, + "step": 4680 + }, + { + "epoch": 3.6398913465269693, + "grad_norm": 0.7198925018310547, + "learning_rate": 0.0002, + "loss": 1.4361, + "step": 4690 + }, + { + "epoch": 3.6476523088863018, + "grad_norm": 0.8238834738731384, + "learning_rate": 0.0002, + "loss": 1.4543, + "step": 4700 + }, + { + "epoch": 3.655413271245634, + "grad_norm": 0.6804245710372925, + "learning_rate": 0.0002, + "loss": 1.3958, + "step": 4710 + }, + { + "epoch": 3.663174233604967, + "grad_norm": 0.8444845676422119, + "learning_rate": 0.0002, + "loss": 1.4158, + "step": 4720 + }, + { + "epoch": 3.6709351959642995, + "grad_norm": 0.743797779083252, + "learning_rate": 0.0002, + "loss": 1.3825, + "step": 4730 + }, + { + "epoch": 3.678696158323632, + "grad_norm": 0.8994188904762268, + "learning_rate": 0.0002, + "loss": 1.4213, + "step": 4740 + }, + { + "epoch": 3.686457120682965, + "grad_norm": 0.75416100025177, + "learning_rate": 0.0002, + "loss": 1.4281, + "step": 4750 + }, + { + "epoch": 3.6942180830422973, + "grad_norm": 0.6499266028404236, + "learning_rate": 0.0002, + "loss": 1.4154, + "step": 4760 + }, + { + "epoch": 3.7019790454016297, + "grad_norm": 0.7246791124343872, + "learning_rate": 0.0002, + "loss": 1.4005, + "step": 4770 + }, + { + "epoch": 3.7097400077609626, + "grad_norm": 0.7831124067306519, + "learning_rate": 0.0002, + "loss": 1.426, + "step": 4780 + }, + { + "epoch": 3.717500970120295, + "grad_norm": 0.7130028009414673, + "learning_rate": 0.0002, + "loss": 1.3933, + "step": 4790 + }, + { + "epoch": 3.7252619324796274, + "grad_norm": 0.7501602172851562, + "learning_rate": 0.0002, + "loss": 1.4632, + "step": 4800 + }, + { + "epoch": 3.73302289483896, + "grad_norm": 0.6980932950973511, + "learning_rate": 0.0002, + "loss": 1.4985, + "step": 4810 + }, + { + "epoch": 3.7407838571982923, + "grad_norm": 0.8050530552864075, + "learning_rate": 0.0002, + "loss": 1.4517, + "step": 4820 + }, + { + "epoch": 3.748544819557625, + "grad_norm": 0.6385579705238342, + "learning_rate": 0.0002, + "loss": 1.4703, + "step": 4830 + }, + { + "epoch": 3.7563057819169576, + "grad_norm": 0.6664714813232422, + "learning_rate": 0.0002, + "loss": 1.5281, + "step": 4840 + }, + { + "epoch": 3.76406674427629, + "grad_norm": 0.7125676274299622, + "learning_rate": 0.0002, + "loss": 1.4443, + "step": 4850 + }, + { + "epoch": 3.771827706635623, + "grad_norm": 0.7231866717338562, + "learning_rate": 0.0002, + "loss": 1.3958, + "step": 4860 + }, + { + "epoch": 3.7795886689949554, + "grad_norm": 0.6917183995246887, + "learning_rate": 0.0002, + "loss": 1.4446, + "step": 4870 + }, + { + "epoch": 3.787349631354288, + "grad_norm": 0.665037989616394, + "learning_rate": 0.0002, + "loss": 1.4369, + "step": 4880 + }, + { + "epoch": 3.7951105937136207, + "grad_norm": 0.5837726593017578, + "learning_rate": 0.0002, + "loss": 1.4193, + "step": 4890 + }, + { + "epoch": 3.802871556072953, + "grad_norm": 0.6366701722145081, + "learning_rate": 0.0002, + "loss": 1.4176, + "step": 4900 + }, + { + "epoch": 3.8106325184322856, + "grad_norm": 0.7082223892211914, + "learning_rate": 0.0002, + "loss": 1.46, + "step": 4910 + }, + { + "epoch": 3.818393480791618, + "grad_norm": 0.8101672530174255, + "learning_rate": 0.0002, + "loss": 1.5139, + "step": 4920 + }, + { + "epoch": 3.826154443150951, + "grad_norm": 0.7516148090362549, + "learning_rate": 0.0002, + "loss": 1.3659, + "step": 4930 + }, + { + "epoch": 3.8339154055102833, + "grad_norm": 0.7928489446640015, + "learning_rate": 0.0002, + "loss": 1.3909, + "step": 4940 + }, + { + "epoch": 3.8416763678696157, + "grad_norm": 0.6892234683036804, + "learning_rate": 0.0002, + "loss": 1.4255, + "step": 4950 + }, + { + "epoch": 3.849437330228948, + "grad_norm": 0.6381304264068604, + "learning_rate": 0.0002, + "loss": 1.5024, + "step": 4960 + }, + { + "epoch": 3.857198292588281, + "grad_norm": 0.8068831562995911, + "learning_rate": 0.0002, + "loss": 1.4873, + "step": 4970 + }, + { + "epoch": 3.8649592549476135, + "grad_norm": 0.7289869785308838, + "learning_rate": 0.0002, + "loss": 1.45, + "step": 4980 + }, + { + "epoch": 3.872720217306946, + "grad_norm": 0.7278549075126648, + "learning_rate": 0.0002, + "loss": 1.398, + "step": 4990 + }, + { + "epoch": 3.880481179666279, + "grad_norm": 0.7324236631393433, + "learning_rate": 0.0002, + "loss": 1.4442, + "step": 5000 + }, + { + "epoch": 3.8882421420256112, + "grad_norm": 0.6759871244430542, + "learning_rate": 0.0002, + "loss": 1.4511, + "step": 5010 + }, + { + "epoch": 3.8960031043849437, + "grad_norm": 0.8159207701683044, + "learning_rate": 0.0002, + "loss": 1.4705, + "step": 5020 + }, + { + "epoch": 3.9037640667442766, + "grad_norm": 0.6536211967468262, + "learning_rate": 0.0002, + "loss": 1.4685, + "step": 5030 + }, + { + "epoch": 3.911525029103609, + "grad_norm": 0.6827932000160217, + "learning_rate": 0.0002, + "loss": 1.4335, + "step": 5040 + }, + { + "epoch": 3.9192859914629414, + "grad_norm": 0.6688340306282043, + "learning_rate": 0.0002, + "loss": 1.433, + "step": 5050 + }, + { + "epoch": 3.927046953822274, + "grad_norm": 0.6385695934295654, + "learning_rate": 0.0002, + "loss": 1.4099, + "step": 5060 + }, + { + "epoch": 3.9348079161816063, + "grad_norm": 0.6975107192993164, + "learning_rate": 0.0002, + "loss": 1.4767, + "step": 5070 + }, + { + "epoch": 3.942568878540939, + "grad_norm": 0.6684112548828125, + "learning_rate": 0.0002, + "loss": 1.4893, + "step": 5080 + }, + { + "epoch": 3.9503298409002716, + "grad_norm": 0.8349628448486328, + "learning_rate": 0.0002, + "loss": 1.4732, + "step": 5090 + }, + { + "epoch": 3.958090803259604, + "grad_norm": 0.7146425843238831, + "learning_rate": 0.0002, + "loss": 1.5131, + "step": 5100 + }, + { + "epoch": 3.965851765618937, + "grad_norm": 0.6555036902427673, + "learning_rate": 0.0002, + "loss": 1.4149, + "step": 5110 + }, + { + "epoch": 3.9736127279782694, + "grad_norm": 0.7037415504455566, + "learning_rate": 0.0002, + "loss": 1.4274, + "step": 5120 + }, + { + "epoch": 3.981373690337602, + "grad_norm": 0.7235575914382935, + "learning_rate": 0.0002, + "loss": 1.4292, + "step": 5130 + }, + { + "epoch": 3.9891346526969347, + "grad_norm": 0.7092325687408447, + "learning_rate": 0.0002, + "loss": 1.4455, + "step": 5140 + }, + { + "epoch": 3.996895615056267, + "grad_norm": 0.7490319609642029, + "learning_rate": 0.0002, + "loss": 1.4512, + "step": 5150 + }, + { + "epoch": 4.0, + "eval_loss": 1.9131355285644531, + "eval_runtime": 105.5778, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 0.606, + "step": 5154 + } + ], + "logging_steps": 10, + "max_steps": 10304, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.6472874496753664e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..8ca6e2c3ac58fa2af9f99747566f932f41a5a4d5 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-5154/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f7feb06ff53d5bf79374054a25b662309e705a2ca08dfa3b0bce7b8b4632fae +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e98db163734cc03f7a8f8b3f720d3a2befdf7453 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..abe4ebc4beb969d5267a45992ec656a4a68b1fd9 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e402faa5f84824ca939ad7b1591b51dd6860ad6548e9197d4604e851281827f5 +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..626a7ec7a3016789a3fc36fdf676584a6548c853 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16e2d815ba587c0cc13e35c2f1ab31b4cf7397e0816df4838bbadfb4acc88060 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..af8ad5825e9f12e91d91c25a57b7f1946b85b72e --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a1f14aee1403446037fc0dbcc8de020d13071880a931dead057e5c41bed610f +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e4b23fd9d976993d82932d2cd457286e4531bcc --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ff08e7f23e0e79786a6547122bbe0f72d7de03e3cf2d4a4088c28046431d6b1 +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..a4a305d1de4d8f47c0252b4d7fe65a10dd8e2c22 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7eee611703c5ce5d1eee32d9cdcfe465647b8aff0c1dfb3bed7ad7dbb05060 +size 34362873 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ef5191874f9128d31f7e901f5f6178901b2d1c15 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/trainer_state.json @@ -0,0 +1,4581 @@ +{ + "best_metric": 1.8068748712539673, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577", + "epoch": 4.9996119518820334, + "eval_steps": 10, + "global_step": 6442, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007760962359332557, + "grad_norm": 1.0751162767410278, + "learning_rate": 0.0002, + "loss": 3.0855, + "step": 10 + }, + { + "epoch": 0.015521924718665115, + "grad_norm": 0.4697345793247223, + "learning_rate": 0.0002, + "loss": 2.4744, + "step": 20 + }, + { + "epoch": 0.023282887077997673, + "grad_norm": 0.5370839238166809, + "learning_rate": 0.0002, + "loss": 2.193, + "step": 30 + }, + { + "epoch": 0.03104384943733023, + "grad_norm": 0.46794816851615906, + "learning_rate": 0.0002, + "loss": 2.0599, + "step": 40 + }, + { + "epoch": 0.038804811796662786, + "grad_norm": 0.44624820351600647, + "learning_rate": 0.0002, + "loss": 1.9354, + "step": 50 + }, + { + "epoch": 0.046565774155995346, + "grad_norm": 0.3953201472759247, + "learning_rate": 0.0002, + "loss": 1.9319, + "step": 60 + }, + { + "epoch": 0.0543267365153279, + "grad_norm": 0.3935912549495697, + "learning_rate": 0.0002, + "loss": 1.9099, + "step": 70 + }, + { + "epoch": 0.06208769887466046, + "grad_norm": 0.4520699381828308, + "learning_rate": 0.0002, + "loss": 1.8795, + "step": 80 + }, + { + "epoch": 0.06984866123399301, + "grad_norm": 0.3801847994327545, + "learning_rate": 0.0002, + "loss": 1.8354, + "step": 90 + }, + { + "epoch": 0.07760962359332557, + "grad_norm": 0.4020165205001831, + "learning_rate": 0.0002, + "loss": 1.9053, + "step": 100 + }, + { + "epoch": 0.08537058595265813, + "grad_norm": 0.3860672116279602, + "learning_rate": 0.0002, + "loss": 1.8779, + "step": 110 + }, + { + "epoch": 0.09313154831199069, + "grad_norm": 0.3681113123893738, + "learning_rate": 0.0002, + "loss": 1.8731, + "step": 120 + }, + { + "epoch": 0.10089251067132324, + "grad_norm": 0.3594866991043091, + "learning_rate": 0.0002, + "loss": 1.8157, + "step": 130 + }, + { + "epoch": 0.1086534730306558, + "grad_norm": 0.3879193663597107, + "learning_rate": 0.0002, + "loss": 1.8266, + "step": 140 + }, + { + "epoch": 0.11641443538998836, + "grad_norm": 0.3270505666732788, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 150 + }, + { + "epoch": 0.12417539774932092, + "grad_norm": 0.36824458837509155, + "learning_rate": 0.0002, + "loss": 1.87, + "step": 160 + }, + { + "epoch": 0.13193636010865348, + "grad_norm": 0.383882075548172, + "learning_rate": 0.0002, + "loss": 1.8305, + "step": 170 + }, + { + "epoch": 0.13969732246798602, + "grad_norm": 0.3368665874004364, + "learning_rate": 0.0002, + "loss": 1.8584, + "step": 180 + }, + { + "epoch": 0.1474582848273186, + "grad_norm": 0.35961097478866577, + "learning_rate": 0.0002, + "loss": 1.7882, + "step": 190 + }, + { + "epoch": 0.15521924718665114, + "grad_norm": 0.3415963351726532, + "learning_rate": 0.0002, + "loss": 1.8467, + "step": 200 + }, + { + "epoch": 0.1629802095459837, + "grad_norm": 0.4100632071495056, + "learning_rate": 0.0002, + "loss": 1.8543, + "step": 210 + }, + { + "epoch": 0.17074117190531626, + "grad_norm": 0.3516307473182678, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 220 + }, + { + "epoch": 0.1785021342646488, + "grad_norm": 0.37919050455093384, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 230 + }, + { + "epoch": 0.18626309662398138, + "grad_norm": 0.33270683884620667, + "learning_rate": 0.0002, + "loss": 1.7937, + "step": 240 + }, + { + "epoch": 0.19402405898331393, + "grad_norm": 0.3348783254623413, + "learning_rate": 0.0002, + "loss": 1.7925, + "step": 250 + }, + { + "epoch": 0.20178502134264648, + "grad_norm": 0.3888475298881531, + "learning_rate": 0.0002, + "loss": 1.7774, + "step": 260 + }, + { + "epoch": 0.20954598370197905, + "grad_norm": 0.3554602861404419, + "learning_rate": 0.0002, + "loss": 1.8381, + "step": 270 + }, + { + "epoch": 0.2173069460613116, + "grad_norm": 0.33277708292007446, + "learning_rate": 0.0002, + "loss": 1.8359, + "step": 280 + }, + { + "epoch": 0.22506790842064417, + "grad_norm": 0.3281584680080414, + "learning_rate": 0.0002, + "loss": 1.7713, + "step": 290 + }, + { + "epoch": 0.23282887077997672, + "grad_norm": 0.3185969591140747, + "learning_rate": 0.0002, + "loss": 1.8181, + "step": 300 + }, + { + "epoch": 0.24058983313930926, + "grad_norm": 0.35335442423820496, + "learning_rate": 0.0002, + "loss": 1.8595, + "step": 310 + }, + { + "epoch": 0.24835079549864184, + "grad_norm": 0.3119595944881439, + "learning_rate": 0.0002, + "loss": 1.87, + "step": 320 + }, + { + "epoch": 0.2561117578579744, + "grad_norm": 0.36424458026885986, + "learning_rate": 0.0002, + "loss": 1.8357, + "step": 330 + }, + { + "epoch": 0.26387272021730696, + "grad_norm": 0.3618951141834259, + "learning_rate": 0.0002, + "loss": 1.8003, + "step": 340 + }, + { + "epoch": 0.2716336825766395, + "grad_norm": 0.312757670879364, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 350 + }, + { + "epoch": 0.27939464493597205, + "grad_norm": 0.326016366481781, + "learning_rate": 0.0002, + "loss": 1.9031, + "step": 360 + }, + { + "epoch": 0.2871556072953046, + "grad_norm": 0.34093883633613586, + "learning_rate": 0.0002, + "loss": 1.8214, + "step": 370 + }, + { + "epoch": 0.2949165696546372, + "grad_norm": 0.32325029373168945, + "learning_rate": 0.0002, + "loss": 1.7733, + "step": 380 + }, + { + "epoch": 0.30267753201396974, + "grad_norm": 0.34105437994003296, + "learning_rate": 0.0002, + "loss": 1.842, + "step": 390 + }, + { + "epoch": 0.3104384943733023, + "grad_norm": 0.32565295696258545, + "learning_rate": 0.0002, + "loss": 1.7926, + "step": 400 + }, + { + "epoch": 0.31819945673263483, + "grad_norm": 0.32742050290107727, + "learning_rate": 0.0002, + "loss": 1.8031, + "step": 410 + }, + { + "epoch": 0.3259604190919674, + "grad_norm": 0.30233046412467957, + "learning_rate": 0.0002, + "loss": 1.907, + "step": 420 + }, + { + "epoch": 0.3337213814513, + "grad_norm": 0.32419222593307495, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 430 + }, + { + "epoch": 0.3414823438106325, + "grad_norm": 0.3653007745742798, + "learning_rate": 0.0002, + "loss": 1.865, + "step": 440 + }, + { + "epoch": 0.3492433061699651, + "grad_norm": 0.31617099046707153, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 450 + }, + { + "epoch": 0.3570042685292976, + "grad_norm": 0.3305962085723877, + "learning_rate": 0.0002, + "loss": 1.7677, + "step": 460 + }, + { + "epoch": 0.36476523088863017, + "grad_norm": 0.3178933262825012, + "learning_rate": 0.0002, + "loss": 1.8155, + "step": 470 + }, + { + "epoch": 0.37252619324796277, + "grad_norm": 0.37163782119750977, + "learning_rate": 0.0002, + "loss": 1.7485, + "step": 480 + }, + { + "epoch": 0.3802871556072953, + "grad_norm": 0.469844788312912, + "learning_rate": 0.0002, + "loss": 1.8804, + "step": 490 + }, + { + "epoch": 0.38804811796662786, + "grad_norm": 0.3409338593482971, + "learning_rate": 0.0002, + "loss": 1.8343, + "step": 500 + }, + { + "epoch": 0.3958090803259604, + "grad_norm": 0.31943467259407043, + "learning_rate": 0.0002, + "loss": 1.8433, + "step": 510 + }, + { + "epoch": 0.40357004268529295, + "grad_norm": 0.32293614745140076, + "learning_rate": 0.0002, + "loss": 1.7873, + "step": 520 + }, + { + "epoch": 0.41133100504462555, + "grad_norm": 0.2994382977485657, + "learning_rate": 0.0002, + "loss": 1.8584, + "step": 530 + }, + { + "epoch": 0.4190919674039581, + "grad_norm": 0.3273141384124756, + "learning_rate": 0.0002, + "loss": 1.8153, + "step": 540 + }, + { + "epoch": 0.42685292976329064, + "grad_norm": 0.3020550012588501, + "learning_rate": 0.0002, + "loss": 1.8097, + "step": 550 + }, + { + "epoch": 0.4346138921226232, + "grad_norm": 0.30113112926483154, + "learning_rate": 0.0002, + "loss": 1.8551, + "step": 560 + }, + { + "epoch": 0.44237485448195574, + "grad_norm": 0.30274903774261475, + "learning_rate": 0.0002, + "loss": 1.8084, + "step": 570 + }, + { + "epoch": 0.45013581684128834, + "grad_norm": 0.3231128454208374, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 580 + }, + { + "epoch": 0.4578967792006209, + "grad_norm": 0.3255121409893036, + "learning_rate": 0.0002, + "loss": 1.7848, + "step": 590 + }, + { + "epoch": 0.46565774155995343, + "grad_norm": 0.30147507786750793, + "learning_rate": 0.0002, + "loss": 1.8227, + "step": 600 + }, + { + "epoch": 0.473418703919286, + "grad_norm": 0.29781386256217957, + "learning_rate": 0.0002, + "loss": 1.7572, + "step": 610 + }, + { + "epoch": 0.4811796662786185, + "grad_norm": 0.30914685130119324, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 620 + }, + { + "epoch": 0.4889406286379511, + "grad_norm": 0.3110593855381012, + "learning_rate": 0.0002, + "loss": 1.805, + "step": 630 + }, + { + "epoch": 0.49670159099728367, + "grad_norm": 0.3298132121562958, + "learning_rate": 0.0002, + "loss": 1.8228, + "step": 640 + }, + { + "epoch": 0.5044625533566163, + "grad_norm": 0.322122186422348, + "learning_rate": 0.0002, + "loss": 1.7816, + "step": 650 + }, + { + "epoch": 0.5122235157159488, + "grad_norm": 0.3504371643066406, + "learning_rate": 0.0002, + "loss": 1.8001, + "step": 660 + }, + { + "epoch": 0.5199844780752814, + "grad_norm": 0.3102182149887085, + "learning_rate": 0.0002, + "loss": 1.8682, + "step": 670 + }, + { + "epoch": 0.5277454404346139, + "grad_norm": 0.6113658547401428, + "learning_rate": 0.0002, + "loss": 1.7494, + "step": 680 + }, + { + "epoch": 0.5355064027939465, + "grad_norm": 0.31841862201690674, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 690 + }, + { + "epoch": 0.543267365153279, + "grad_norm": 0.2830526530742645, + "learning_rate": 0.0002, + "loss": 1.7587, + "step": 700 + }, + { + "epoch": 0.5510283275126115, + "grad_norm": 0.3048769533634186, + "learning_rate": 0.0002, + "loss": 1.7887, + "step": 710 + }, + { + "epoch": 0.5587892898719441, + "grad_norm": 0.2719033658504486, + "learning_rate": 0.0002, + "loss": 1.8416, + "step": 720 + }, + { + "epoch": 0.5665502522312766, + "grad_norm": 0.3176722526550293, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 730 + }, + { + "epoch": 0.5743112145906092, + "grad_norm": 0.32491734623908997, + "learning_rate": 0.0002, + "loss": 1.7127, + "step": 740 + }, + { + "epoch": 0.5820721769499418, + "grad_norm": 0.32746851444244385, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 750 + }, + { + "epoch": 0.5898331393092744, + "grad_norm": 0.3055773973464966, + "learning_rate": 0.0002, + "loss": 1.7811, + "step": 760 + }, + { + "epoch": 0.5975941016686069, + "grad_norm": 0.30671584606170654, + "learning_rate": 0.0002, + "loss": 1.8597, + "step": 770 + }, + { + "epoch": 0.6053550640279395, + "grad_norm": 0.28770264983177185, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 780 + }, + { + "epoch": 0.613116026387272, + "grad_norm": 0.2814285457134247, + "learning_rate": 0.0002, + "loss": 1.7025, + "step": 790 + }, + { + "epoch": 0.6208769887466046, + "grad_norm": 0.31554412841796875, + "learning_rate": 0.0002, + "loss": 1.819, + "step": 800 + }, + { + "epoch": 0.6286379511059371, + "grad_norm": 0.2984226942062378, + "learning_rate": 0.0002, + "loss": 1.8335, + "step": 810 + }, + { + "epoch": 0.6363989134652697, + "grad_norm": 0.2859906554222107, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 820 + }, + { + "epoch": 0.6441598758246022, + "grad_norm": 0.2887928783893585, + "learning_rate": 0.0002, + "loss": 1.7408, + "step": 830 + }, + { + "epoch": 0.6519208381839348, + "grad_norm": 0.31287339329719543, + "learning_rate": 0.0002, + "loss": 1.8071, + "step": 840 + }, + { + "epoch": 0.6596818005432674, + "grad_norm": 0.32064181566238403, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 850 + }, + { + "epoch": 0.6674427629026, + "grad_norm": 0.290981650352478, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 860 + }, + { + "epoch": 0.6752037252619325, + "grad_norm": 0.33060121536254883, + "learning_rate": 0.0002, + "loss": 1.8089, + "step": 870 + }, + { + "epoch": 0.682964687621265, + "grad_norm": 0.27032899856567383, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 880 + }, + { + "epoch": 0.6907256499805976, + "grad_norm": 0.29031234979629517, + "learning_rate": 0.0002, + "loss": 1.8423, + "step": 890 + }, + { + "epoch": 0.6984866123399301, + "grad_norm": 0.2845142185688019, + "learning_rate": 0.0002, + "loss": 1.835, + "step": 900 + }, + { + "epoch": 0.7062475746992627, + "grad_norm": 0.8638312816619873, + "learning_rate": 0.0002, + "loss": 1.77, + "step": 910 + }, + { + "epoch": 0.7140085370585952, + "grad_norm": 0.3086668848991394, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 920 + }, + { + "epoch": 0.7217694994179278, + "grad_norm": 0.2724177837371826, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 930 + }, + { + "epoch": 0.7295304617772603, + "grad_norm": 0.289559006690979, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 940 + }, + { + "epoch": 0.737291424136593, + "grad_norm": 0.3000658452510834, + "learning_rate": 0.0002, + "loss": 1.7654, + "step": 950 + }, + { + "epoch": 0.7450523864959255, + "grad_norm": 0.33544042706489563, + "learning_rate": 0.0002, + "loss": 1.7736, + "step": 960 + }, + { + "epoch": 0.7528133488552581, + "grad_norm": 0.28593236207962036, + "learning_rate": 0.0002, + "loss": 1.6979, + "step": 970 + }, + { + "epoch": 0.7605743112145906, + "grad_norm": 0.313634991645813, + "learning_rate": 0.0002, + "loss": 1.8583, + "step": 980 + }, + { + "epoch": 0.7683352735739232, + "grad_norm": 0.2949385941028595, + "learning_rate": 0.0002, + "loss": 1.7473, + "step": 990 + }, + { + "epoch": 0.7760962359332557, + "grad_norm": 0.2920108437538147, + "learning_rate": 0.0002, + "loss": 1.8689, + "step": 1000 + }, + { + "epoch": 0.7838571982925883, + "grad_norm": 0.3245100677013397, + "learning_rate": 0.0002, + "loss": 1.8401, + "step": 1010 + }, + { + "epoch": 0.7916181606519208, + "grad_norm": 0.3007619380950928, + "learning_rate": 0.0002, + "loss": 1.7109, + "step": 1020 + }, + { + "epoch": 0.7993791230112534, + "grad_norm": 0.3630852997303009, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 1030 + }, + { + "epoch": 0.8071400853705859, + "grad_norm": 0.2856379747390747, + "learning_rate": 0.0002, + "loss": 1.7655, + "step": 1040 + }, + { + "epoch": 0.8149010477299186, + "grad_norm": 0.32476478815078735, + "learning_rate": 0.0002, + "loss": 1.8371, + "step": 1050 + }, + { + "epoch": 0.8226620100892511, + "grad_norm": 0.5162565112113953, + "learning_rate": 0.0002, + "loss": 1.8039, + "step": 1060 + }, + { + "epoch": 0.8304229724485837, + "grad_norm": 0.316496342420578, + "learning_rate": 0.0002, + "loss": 1.8862, + "step": 1070 + }, + { + "epoch": 0.8381839348079162, + "grad_norm": 0.31977516412734985, + "learning_rate": 0.0002, + "loss": 1.8023, + "step": 1080 + }, + { + "epoch": 0.8459448971672487, + "grad_norm": 0.269509494304657, + "learning_rate": 0.0002, + "loss": 1.8547, + "step": 1090 + }, + { + "epoch": 0.8537058595265813, + "grad_norm": 0.31621453166007996, + "learning_rate": 0.0002, + "loss": 1.7811, + "step": 1100 + }, + { + "epoch": 0.8614668218859138, + "grad_norm": 0.2946535050868988, + "learning_rate": 0.0002, + "loss": 1.739, + "step": 1110 + }, + { + "epoch": 0.8692277842452464, + "grad_norm": 0.3088909983634949, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 1120 + }, + { + "epoch": 0.8769887466045789, + "grad_norm": 0.33033716678619385, + "learning_rate": 0.0002, + "loss": 1.8228, + "step": 1130 + }, + { + "epoch": 0.8847497089639115, + "grad_norm": 0.2954833507537842, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 1140 + }, + { + "epoch": 0.8925106713232441, + "grad_norm": 0.2950248122215271, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 1150 + }, + { + "epoch": 0.9002716336825767, + "grad_norm": 0.296661913394928, + "learning_rate": 0.0002, + "loss": 1.7068, + "step": 1160 + }, + { + "epoch": 0.9080325960419092, + "grad_norm": 0.35451310873031616, + "learning_rate": 0.0002, + "loss": 1.7967, + "step": 1170 + }, + { + "epoch": 0.9157935584012418, + "grad_norm": 0.32705947756767273, + "learning_rate": 0.0002, + "loss": 1.8202, + "step": 1180 + }, + { + "epoch": 0.9235545207605743, + "grad_norm": 0.3333960771560669, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 1190 + }, + { + "epoch": 0.9313154831199069, + "grad_norm": 0.3042232096195221, + "learning_rate": 0.0002, + "loss": 1.7801, + "step": 1200 + }, + { + "epoch": 0.9390764454792394, + "grad_norm": 0.281553715467453, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 1210 + }, + { + "epoch": 0.946837407838572, + "grad_norm": 0.3096391558647156, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 1220 + }, + { + "epoch": 0.9545983701979045, + "grad_norm": 0.2866271734237671, + "learning_rate": 0.0002, + "loss": 1.7401, + "step": 1230 + }, + { + "epoch": 0.962359332557237, + "grad_norm": 0.28394097089767456, + "learning_rate": 0.0002, + "loss": 1.7211, + "step": 1240 + }, + { + "epoch": 0.9701202949165697, + "grad_norm": 0.3249266743659973, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 1250 + }, + { + "epoch": 0.9778812572759022, + "grad_norm": 0.2896869480609894, + "learning_rate": 0.0002, + "loss": 1.7563, + "step": 1260 + }, + { + "epoch": 0.9856422196352348, + "grad_norm": 0.29224586486816406, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 1270 + }, + { + "epoch": 0.9934031819945673, + "grad_norm": 0.2820223569869995, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 1280 + }, + { + "epoch": 0.9996119518820333, + "eval_loss": 1.8081045150756836, + "eval_runtime": 102.3056, + "eval_samples_per_second": 4.956, + "eval_steps_per_second": 0.626, + "step": 1288 + }, + { + "epoch": 1.0011641443538999, + "grad_norm": 0.3282551169395447, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 1290 + }, + { + "epoch": 1.0089251067132325, + "grad_norm": 0.30217495560646057, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 1300 + }, + { + "epoch": 1.016686069072565, + "grad_norm": 0.30801767110824585, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 1310 + }, + { + "epoch": 1.0244470314318976, + "grad_norm": 0.31816792488098145, + "learning_rate": 0.0002, + "loss": 1.7756, + "step": 1320 + }, + { + "epoch": 1.03220799379123, + "grad_norm": 0.27794334292411804, + "learning_rate": 0.0002, + "loss": 1.6986, + "step": 1330 + }, + { + "epoch": 1.0399689561505627, + "grad_norm": 0.3018926680088043, + "learning_rate": 0.0002, + "loss": 1.6931, + "step": 1340 + }, + { + "epoch": 1.0477299185098952, + "grad_norm": 0.3552975356578827, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1350 + }, + { + "epoch": 1.0554908808692278, + "grad_norm": 0.32590144872665405, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 1360 + }, + { + "epoch": 1.0632518432285603, + "grad_norm": 0.3435460925102234, + "learning_rate": 0.0002, + "loss": 1.6479, + "step": 1370 + }, + { + "epoch": 1.071012805587893, + "grad_norm": 0.35037797689437866, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 1380 + }, + { + "epoch": 1.0787737679472253, + "grad_norm": 0.31398263573646545, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 1390 + }, + { + "epoch": 1.086534730306558, + "grad_norm": 0.3134010434150696, + "learning_rate": 0.0002, + "loss": 1.6729, + "step": 1400 + }, + { + "epoch": 1.0942956926658907, + "grad_norm": 0.4599704444408417, + "learning_rate": 0.0002, + "loss": 1.751, + "step": 1410 + }, + { + "epoch": 1.102056655025223, + "grad_norm": 0.35852891206741333, + "learning_rate": 0.0002, + "loss": 1.6871, + "step": 1420 + }, + { + "epoch": 1.1098176173845558, + "grad_norm": 0.35628634691238403, + "learning_rate": 0.0002, + "loss": 1.7083, + "step": 1430 + }, + { + "epoch": 1.1175785797438882, + "grad_norm": 0.3769161105155945, + "learning_rate": 0.0002, + "loss": 1.6166, + "step": 1440 + }, + { + "epoch": 1.1253395421032208, + "grad_norm": 1.3712416887283325, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1450 + }, + { + "epoch": 1.1331005044625533, + "grad_norm": 0.38406670093536377, + "learning_rate": 0.0002, + "loss": 1.6542, + "step": 1460 + }, + { + "epoch": 1.140861466821886, + "grad_norm": 0.3402116000652313, + "learning_rate": 0.0002, + "loss": 1.7104, + "step": 1470 + }, + { + "epoch": 1.1486224291812184, + "grad_norm": 0.341189444065094, + "learning_rate": 0.0002, + "loss": 1.7074, + "step": 1480 + }, + { + "epoch": 1.156383391540551, + "grad_norm": 0.36629995703697205, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 1490 + }, + { + "epoch": 1.1641443538998835, + "grad_norm": 0.3499569296836853, + "learning_rate": 0.0002, + "loss": 1.6952, + "step": 1500 + }, + { + "epoch": 1.1719053162592161, + "grad_norm": 0.3663063943386078, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 1510 + }, + { + "epoch": 1.1796662786185488, + "grad_norm": 0.34851500391960144, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 1520 + }, + { + "epoch": 1.1874272409778812, + "grad_norm": 0.35071656107902527, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 1530 + }, + { + "epoch": 1.1951882033372139, + "grad_norm": 0.42783796787261963, + "learning_rate": 0.0002, + "loss": 1.7206, + "step": 1540 + }, + { + "epoch": 1.2029491656965463, + "grad_norm": 0.31830692291259766, + "learning_rate": 0.0002, + "loss": 1.7499, + "step": 1550 + }, + { + "epoch": 1.210710128055879, + "grad_norm": 0.3597424626350403, + "learning_rate": 0.0002, + "loss": 1.7372, + "step": 1560 + }, + { + "epoch": 1.2184710904152114, + "grad_norm": 0.35233765840530396, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 1570 + }, + { + "epoch": 1.226232052774544, + "grad_norm": 0.35942912101745605, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 1580 + }, + { + "epoch": 1.2339930151338767, + "grad_norm": 0.36159393191337585, + "learning_rate": 0.0002, + "loss": 1.6598, + "step": 1590 + }, + { + "epoch": 1.2417539774932091, + "grad_norm": 0.3328469693660736, + "learning_rate": 0.0002, + "loss": 1.6697, + "step": 1600 + }, + { + "epoch": 1.2495149398525418, + "grad_norm": 0.3089476525783539, + "learning_rate": 0.0002, + "loss": 1.7594, + "step": 1610 + }, + { + "epoch": 1.2572759022118742, + "grad_norm": 0.30947765707969666, + "learning_rate": 0.0002, + "loss": 1.6805, + "step": 1620 + }, + { + "epoch": 1.265036864571207, + "grad_norm": 0.32154011726379395, + "learning_rate": 0.0002, + "loss": 1.6899, + "step": 1630 + }, + { + "epoch": 1.2727978269305393, + "grad_norm": 0.3480297923088074, + "learning_rate": 0.0002, + "loss": 1.6621, + "step": 1640 + }, + { + "epoch": 1.280558789289872, + "grad_norm": 0.39471694827079773, + "learning_rate": 0.0002, + "loss": 1.7087, + "step": 1650 + }, + { + "epoch": 1.2883197516492044, + "grad_norm": 0.35728853940963745, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 1660 + }, + { + "epoch": 1.296080714008537, + "grad_norm": 0.35223081707954407, + "learning_rate": 0.0002, + "loss": 1.7008, + "step": 1670 + }, + { + "epoch": 1.3038416763678695, + "grad_norm": 0.3588867485523224, + "learning_rate": 0.0002, + "loss": 1.7253, + "step": 1680 + }, + { + "epoch": 1.3116026387272022, + "grad_norm": 0.3528042733669281, + "learning_rate": 0.0002, + "loss": 1.6505, + "step": 1690 + }, + { + "epoch": 1.3193636010865348, + "grad_norm": 0.35975801944732666, + "learning_rate": 0.0002, + "loss": 1.6945, + "step": 1700 + }, + { + "epoch": 1.3271245634458673, + "grad_norm": 0.36691880226135254, + "learning_rate": 0.0002, + "loss": 1.6631, + "step": 1710 + }, + { + "epoch": 1.3348855258052, + "grad_norm": 0.3787977695465088, + "learning_rate": 0.0002, + "loss": 1.7593, + "step": 1720 + }, + { + "epoch": 1.3426464881645324, + "grad_norm": 0.36614933609962463, + "learning_rate": 0.0002, + "loss": 1.7697, + "step": 1730 + }, + { + "epoch": 1.350407450523865, + "grad_norm": 0.3484745919704437, + "learning_rate": 0.0002, + "loss": 1.6487, + "step": 1740 + }, + { + "epoch": 1.3581684128831975, + "grad_norm": 0.36905673146247864, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 1750 + }, + { + "epoch": 1.36592937524253, + "grad_norm": 0.41564738750457764, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 1760 + }, + { + "epoch": 1.3736903376018628, + "grad_norm": 0.3345205783843994, + "learning_rate": 0.0002, + "loss": 1.6634, + "step": 1770 + }, + { + "epoch": 1.3814512999611952, + "grad_norm": 0.34926071763038635, + "learning_rate": 0.0002, + "loss": 1.7275, + "step": 1780 + }, + { + "epoch": 1.3892122623205276, + "grad_norm": 0.42004233598709106, + "learning_rate": 0.0002, + "loss": 1.685, + "step": 1790 + }, + { + "epoch": 1.3969732246798603, + "grad_norm": 0.3576236963272095, + "learning_rate": 0.0002, + "loss": 1.666, + "step": 1800 + }, + { + "epoch": 1.404734187039193, + "grad_norm": 0.3586704432964325, + "learning_rate": 0.0002, + "loss": 1.8516, + "step": 1810 + }, + { + "epoch": 1.4124951493985254, + "grad_norm": 0.3943439722061157, + "learning_rate": 0.0002, + "loss": 1.6171, + "step": 1820 + }, + { + "epoch": 1.420256111757858, + "grad_norm": 0.3484877049922943, + "learning_rate": 0.0002, + "loss": 1.6865, + "step": 1830 + }, + { + "epoch": 1.4280170741171905, + "grad_norm": 0.3344518840312958, + "learning_rate": 0.0002, + "loss": 1.7205, + "step": 1840 + }, + { + "epoch": 1.4357780364765231, + "grad_norm": 0.4345698356628418, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1850 + }, + { + "epoch": 1.4435389988358556, + "grad_norm": 0.5525162220001221, + "learning_rate": 0.0002, + "loss": 1.6855, + "step": 1860 + }, + { + "epoch": 1.4512999611951882, + "grad_norm": 0.37194496393203735, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1870 + }, + { + "epoch": 1.4590609235545209, + "grad_norm": 0.34570157527923584, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 1880 + }, + { + "epoch": 1.4668218859138533, + "grad_norm": 0.3512282073497772, + "learning_rate": 0.0002, + "loss": 1.7, + "step": 1890 + }, + { + "epoch": 1.4745828482731858, + "grad_norm": 0.3443922996520996, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1900 + }, + { + "epoch": 1.4823438106325184, + "grad_norm": 0.3812018036842346, + "learning_rate": 0.0002, + "loss": 1.7393, + "step": 1910 + }, + { + "epoch": 1.490104772991851, + "grad_norm": 0.39263492822647095, + "learning_rate": 0.0002, + "loss": 1.7277, + "step": 1920 + }, + { + "epoch": 1.4978657353511835, + "grad_norm": 0.3146156072616577, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 1930 + }, + { + "epoch": 1.505626697710516, + "grad_norm": 0.3653988540172577, + "learning_rate": 0.0002, + "loss": 1.6881, + "step": 1940 + }, + { + "epoch": 1.5133876600698488, + "grad_norm": 0.3966596722602844, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 1950 + }, + { + "epoch": 1.5211486224291813, + "grad_norm": 0.3441697359085083, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 1960 + }, + { + "epoch": 1.5289095847885137, + "grad_norm": 0.3328564465045929, + "learning_rate": 0.0002, + "loss": 1.7175, + "step": 1970 + }, + { + "epoch": 1.5366705471478463, + "grad_norm": 0.34068772196769714, + "learning_rate": 0.0002, + "loss": 1.7394, + "step": 1980 + }, + { + "epoch": 1.544431509507179, + "grad_norm": 0.3559795916080475, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 1990 + }, + { + "epoch": 1.5521924718665114, + "grad_norm": 0.37888768315315247, + "learning_rate": 0.0002, + "loss": 1.7102, + "step": 2000 + }, + { + "epoch": 1.5599534342258439, + "grad_norm": 0.36128363013267517, + "learning_rate": 0.0002, + "loss": 1.7094, + "step": 2010 + }, + { + "epoch": 1.5677143965851765, + "grad_norm": 0.3643714487552643, + "learning_rate": 0.0002, + "loss": 1.6407, + "step": 2020 + }, + { + "epoch": 1.5754753589445092, + "grad_norm": 0.3863612115383148, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 2030 + }, + { + "epoch": 1.5832363213038416, + "grad_norm": 0.32831457257270813, + "learning_rate": 0.0002, + "loss": 1.6575, + "step": 2040 + }, + { + "epoch": 1.5909972836631743, + "grad_norm": 0.36098113656044006, + "learning_rate": 0.0002, + "loss": 1.7404, + "step": 2050 + }, + { + "epoch": 1.598758246022507, + "grad_norm": 1.1079334020614624, + "learning_rate": 0.0002, + "loss": 1.7065, + "step": 2060 + }, + { + "epoch": 1.6065192083818394, + "grad_norm": 0.35615381598472595, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 2070 + }, + { + "epoch": 1.6142801707411718, + "grad_norm": 0.369711309671402, + "learning_rate": 0.0002, + "loss": 1.7262, + "step": 2080 + }, + { + "epoch": 1.6220411331005045, + "grad_norm": 0.390658438205719, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 2090 + }, + { + "epoch": 1.6298020954598371, + "grad_norm": 0.3422999382019043, + "learning_rate": 0.0002, + "loss": 1.6996, + "step": 2100 + }, + { + "epoch": 1.6375630578191696, + "grad_norm": 0.372475266456604, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 2110 + }, + { + "epoch": 1.645324020178502, + "grad_norm": 0.35660576820373535, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 2120 + }, + { + "epoch": 1.6530849825378346, + "grad_norm": 0.35754942893981934, + "learning_rate": 0.0002, + "loss": 1.6991, + "step": 2130 + }, + { + "epoch": 1.6608459448971673, + "grad_norm": 0.34572410583496094, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 2140 + }, + { + "epoch": 1.6686069072564997, + "grad_norm": 0.42059701681137085, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 2150 + }, + { + "epoch": 1.6763678696158324, + "grad_norm": 0.35200759768486023, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 2160 + }, + { + "epoch": 1.684128831975165, + "grad_norm": 0.3704029321670532, + "learning_rate": 0.0002, + "loss": 1.6869, + "step": 2170 + }, + { + "epoch": 1.6918897943344975, + "grad_norm": 0.40450501441955566, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 2180 + }, + { + "epoch": 1.69965075669383, + "grad_norm": 0.362966924905777, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 2190 + }, + { + "epoch": 1.7074117190531626, + "grad_norm": 0.36586204171180725, + "learning_rate": 0.0002, + "loss": 1.6935, + "step": 2200 + }, + { + "epoch": 1.7151726814124952, + "grad_norm": 0.3295372426509857, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 2210 + }, + { + "epoch": 1.7229336437718277, + "grad_norm": 0.3892575800418854, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 2220 + }, + { + "epoch": 1.73069460613116, + "grad_norm": 0.34712135791778564, + "learning_rate": 0.0002, + "loss": 1.7805, + "step": 2230 + }, + { + "epoch": 1.738455568490493, + "grad_norm": 0.34801796078681946, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 2240 + }, + { + "epoch": 1.7462165308498254, + "grad_norm": 0.3822397291660309, + "learning_rate": 0.0002, + "loss": 1.7009, + "step": 2250 + }, + { + "epoch": 1.7539774932091579, + "grad_norm": 0.38933250308036804, + "learning_rate": 0.0002, + "loss": 1.6546, + "step": 2260 + }, + { + "epoch": 1.7617384555684905, + "grad_norm": 0.3798373341560364, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 2270 + }, + { + "epoch": 1.7694994179278232, + "grad_norm": 0.35151317715644836, + "learning_rate": 0.0002, + "loss": 1.6508, + "step": 2280 + }, + { + "epoch": 1.7772603802871556, + "grad_norm": 0.44981494545936584, + "learning_rate": 0.0002, + "loss": 1.6894, + "step": 2290 + }, + { + "epoch": 1.785021342646488, + "grad_norm": 0.3992624580860138, + "learning_rate": 0.0002, + "loss": 1.7271, + "step": 2300 + }, + { + "epoch": 1.7927823050058207, + "grad_norm": 0.3772512376308441, + "learning_rate": 0.0002, + "loss": 1.7252, + "step": 2310 + }, + { + "epoch": 1.8005432673651534, + "grad_norm": 0.3511589467525482, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 2320 + }, + { + "epoch": 1.8083042297244858, + "grad_norm": 0.3805285394191742, + "learning_rate": 0.0002, + "loss": 1.764, + "step": 2330 + }, + { + "epoch": 1.8160651920838184, + "grad_norm": 0.3792071044445038, + "learning_rate": 0.0002, + "loss": 1.6986, + "step": 2340 + }, + { + "epoch": 1.823826154443151, + "grad_norm": 0.36430829763412476, + "learning_rate": 0.0002, + "loss": 1.7759, + "step": 2350 + }, + { + "epoch": 1.8315871168024835, + "grad_norm": 0.36502477526664734, + "learning_rate": 0.0002, + "loss": 1.6773, + "step": 2360 + }, + { + "epoch": 1.839348079161816, + "grad_norm": 0.35015153884887695, + "learning_rate": 0.0002, + "loss": 1.8072, + "step": 2370 + }, + { + "epoch": 1.8471090415211486, + "grad_norm": 0.3710903823375702, + "learning_rate": 0.0002, + "loss": 1.7734, + "step": 2380 + }, + { + "epoch": 1.8548700038804813, + "grad_norm": 0.3542828857898712, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 2390 + }, + { + "epoch": 1.8626309662398137, + "grad_norm": 0.35467568039894104, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 2400 + }, + { + "epoch": 1.8703919285991462, + "grad_norm": 0.3638560473918915, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 2410 + }, + { + "epoch": 1.8781528909584788, + "grad_norm": 0.3823298215866089, + "learning_rate": 0.0002, + "loss": 1.7019, + "step": 2420 + }, + { + "epoch": 1.8859138533178115, + "grad_norm": 0.3926416337490082, + "learning_rate": 0.0002, + "loss": 1.6935, + "step": 2430 + }, + { + "epoch": 1.893674815677144, + "grad_norm": 0.3608079254627228, + "learning_rate": 0.0002, + "loss": 1.71, + "step": 2440 + }, + { + "epoch": 1.9014357780364766, + "grad_norm": 0.3426613509654999, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 2450 + }, + { + "epoch": 1.9091967403958092, + "grad_norm": 0.3522338569164276, + "learning_rate": 0.0002, + "loss": 1.6892, + "step": 2460 + }, + { + "epoch": 1.9169577027551417, + "grad_norm": 0.3608049154281616, + "learning_rate": 0.0002, + "loss": 1.7307, + "step": 2470 + }, + { + "epoch": 1.924718665114474, + "grad_norm": 0.3849755525588989, + "learning_rate": 0.0002, + "loss": 1.6823, + "step": 2480 + }, + { + "epoch": 1.9324796274738067, + "grad_norm": 0.4154011011123657, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 2490 + }, + { + "epoch": 1.9402405898331394, + "grad_norm": 0.3602796792984009, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 2500 + }, + { + "epoch": 1.9480015521924718, + "grad_norm": 0.3702992796897888, + "learning_rate": 0.0002, + "loss": 1.7843, + "step": 2510 + }, + { + "epoch": 1.9557625145518043, + "grad_norm": 0.3657735288143158, + "learning_rate": 0.0002, + "loss": 1.6669, + "step": 2520 + }, + { + "epoch": 1.963523476911137, + "grad_norm": 0.41031739115715027, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 2530 + }, + { + "epoch": 1.9712844392704696, + "grad_norm": 0.34578680992126465, + "learning_rate": 0.0002, + "loss": 1.6745, + "step": 2540 + }, + { + "epoch": 1.979045401629802, + "grad_norm": 0.3361521065235138, + "learning_rate": 0.0002, + "loss": 1.723, + "step": 2550 + }, + { + "epoch": 1.9868063639891347, + "grad_norm": 0.34342363476753235, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 2560 + }, + { + "epoch": 1.9945673263484673, + "grad_norm": 0.32954007387161255, + "learning_rate": 0.0002, + "loss": 1.6577, + "step": 2570 + }, + { + "epoch": 2.0, + "eval_loss": 1.8068748712539673, + "eval_runtime": 105.5885, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 0.606, + "step": 2577 + }, + { + "epoch": 2.0023282887077998, + "grad_norm": 0.336302250623703, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 2580 + }, + { + "epoch": 2.010089251067132, + "grad_norm": 0.3627048432826996, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 2590 + }, + { + "epoch": 2.017850213426465, + "grad_norm": 0.38406702876091003, + "learning_rate": 0.0002, + "loss": 1.4908, + "step": 2600 + }, + { + "epoch": 2.0256111757857975, + "grad_norm": 0.5326781272888184, + "learning_rate": 0.0002, + "loss": 1.5368, + "step": 2610 + }, + { + "epoch": 2.03337213814513, + "grad_norm": 0.4774554967880249, + "learning_rate": 0.0002, + "loss": 1.5727, + "step": 2620 + }, + { + "epoch": 2.0411331005044624, + "grad_norm": 0.4251810312271118, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 2630 + }, + { + "epoch": 2.0488940628637953, + "grad_norm": 0.4693007171154022, + "learning_rate": 0.0002, + "loss": 1.5152, + "step": 2640 + }, + { + "epoch": 2.0566550252231277, + "grad_norm": 0.46371519565582275, + "learning_rate": 0.0002, + "loss": 1.6137, + "step": 2650 + }, + { + "epoch": 2.06441598758246, + "grad_norm": 0.46652570366859436, + "learning_rate": 0.0002, + "loss": 1.6304, + "step": 2660 + }, + { + "epoch": 2.0721769499417926, + "grad_norm": 0.45200315117836, + "learning_rate": 0.0002, + "loss": 1.6022, + "step": 2670 + }, + { + "epoch": 2.0799379123011255, + "grad_norm": 0.42905205488204956, + "learning_rate": 0.0002, + "loss": 1.5358, + "step": 2680 + }, + { + "epoch": 2.087698874660458, + "grad_norm": 0.44509148597717285, + "learning_rate": 0.0002, + "loss": 1.5401, + "step": 2690 + }, + { + "epoch": 2.0954598370197903, + "grad_norm": 0.4445319175720215, + "learning_rate": 0.0002, + "loss": 1.5303, + "step": 2700 + }, + { + "epoch": 2.103220799379123, + "grad_norm": 0.46825504302978516, + "learning_rate": 0.0002, + "loss": 1.5701, + "step": 2710 + }, + { + "epoch": 2.1109817617384556, + "grad_norm": 0.4623856842517853, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 2720 + }, + { + "epoch": 2.118742724097788, + "grad_norm": 0.4833452105522156, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 2730 + }, + { + "epoch": 2.1265036864571205, + "grad_norm": 0.4582686722278595, + "learning_rate": 0.0002, + "loss": 1.5997, + "step": 2740 + }, + { + "epoch": 2.1342646488164534, + "grad_norm": 0.47587934136390686, + "learning_rate": 0.0002, + "loss": 1.5801, + "step": 2750 + }, + { + "epoch": 2.142025611175786, + "grad_norm": 0.4602217972278595, + "learning_rate": 0.0002, + "loss": 1.594, + "step": 2760 + }, + { + "epoch": 2.1497865735351183, + "grad_norm": 0.47501352429389954, + "learning_rate": 0.0002, + "loss": 1.5271, + "step": 2770 + }, + { + "epoch": 2.1575475358944507, + "grad_norm": 0.5078499913215637, + "learning_rate": 0.0002, + "loss": 1.4862, + "step": 2780 + }, + { + "epoch": 2.1653084982537836, + "grad_norm": 0.497704416513443, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 2790 + }, + { + "epoch": 2.173069460613116, + "grad_norm": 0.5435971617698669, + "learning_rate": 0.0002, + "loss": 1.5597, + "step": 2800 + }, + { + "epoch": 2.1808304229724484, + "grad_norm": 0.5172356367111206, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 2810 + }, + { + "epoch": 2.1885913853317813, + "grad_norm": 0.44063422083854675, + "learning_rate": 0.0002, + "loss": 1.5202, + "step": 2820 + }, + { + "epoch": 2.1963523476911138, + "grad_norm": 0.5079569220542908, + "learning_rate": 0.0002, + "loss": 1.6041, + "step": 2830 + }, + { + "epoch": 2.204113310050446, + "grad_norm": 0.45658132433891296, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 2840 + }, + { + "epoch": 2.2118742724097786, + "grad_norm": 0.5103023648262024, + "learning_rate": 0.0002, + "loss": 1.5546, + "step": 2850 + }, + { + "epoch": 2.2196352347691115, + "grad_norm": 0.4882226288318634, + "learning_rate": 0.0002, + "loss": 1.6197, + "step": 2860 + }, + { + "epoch": 2.227396197128444, + "grad_norm": 0.5087296962738037, + "learning_rate": 0.0002, + "loss": 1.5996, + "step": 2870 + }, + { + "epoch": 2.2351571594877764, + "grad_norm": 0.45293712615966797, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 2880 + }, + { + "epoch": 2.242918121847109, + "grad_norm": 0.5120379328727722, + "learning_rate": 0.0002, + "loss": 1.6214, + "step": 2890 + }, + { + "epoch": 2.2506790842064417, + "grad_norm": 0.47126415371894836, + "learning_rate": 0.0002, + "loss": 1.5273, + "step": 2900 + }, + { + "epoch": 2.258440046565774, + "grad_norm": 0.44005846977233887, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 2910 + }, + { + "epoch": 2.2662010089251066, + "grad_norm": 0.46476176381111145, + "learning_rate": 0.0002, + "loss": 1.6023, + "step": 2920 + }, + { + "epoch": 2.2739619712844394, + "grad_norm": 0.48051515221595764, + "learning_rate": 0.0002, + "loss": 1.6417, + "step": 2930 + }, + { + "epoch": 2.281722933643772, + "grad_norm": 0.480069637298584, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 2940 + }, + { + "epoch": 2.2894838960031043, + "grad_norm": 0.5122102499008179, + "learning_rate": 0.0002, + "loss": 1.5747, + "step": 2950 + }, + { + "epoch": 2.2972448583624367, + "grad_norm": 0.48879891633987427, + "learning_rate": 0.0002, + "loss": 1.5183, + "step": 2960 + }, + { + "epoch": 2.3050058207217696, + "grad_norm": 0.4973136782646179, + "learning_rate": 0.0002, + "loss": 1.5483, + "step": 2970 + }, + { + "epoch": 2.312766783081102, + "grad_norm": 0.5522695183753967, + "learning_rate": 0.0002, + "loss": 1.677, + "step": 2980 + }, + { + "epoch": 2.3205277454404345, + "grad_norm": 0.5220217704772949, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 2990 + }, + { + "epoch": 2.328288707799767, + "grad_norm": 0.4978662431240082, + "learning_rate": 0.0002, + "loss": 1.6299, + "step": 3000 + }, + { + "epoch": 2.3360496701591, + "grad_norm": 0.554053544998169, + "learning_rate": 0.0002, + "loss": 1.5498, + "step": 3010 + }, + { + "epoch": 2.3438106325184322, + "grad_norm": 0.4703886806964874, + "learning_rate": 0.0002, + "loss": 1.5356, + "step": 3020 + }, + { + "epoch": 2.3515715948777647, + "grad_norm": 0.5074123740196228, + "learning_rate": 0.0002, + "loss": 1.5418, + "step": 3030 + }, + { + "epoch": 2.3593325572370976, + "grad_norm": 0.5088278651237488, + "learning_rate": 0.0002, + "loss": 1.6873, + "step": 3040 + }, + { + "epoch": 2.36709351959643, + "grad_norm": 0.4752114415168762, + "learning_rate": 0.0002, + "loss": 1.5249, + "step": 3050 + }, + { + "epoch": 2.3748544819557624, + "grad_norm": 0.5121659636497498, + "learning_rate": 0.0002, + "loss": 1.5353, + "step": 3060 + }, + { + "epoch": 2.3826154443150953, + "grad_norm": 0.48649218678474426, + "learning_rate": 0.0002, + "loss": 1.6426, + "step": 3070 + }, + { + "epoch": 2.3903764066744277, + "grad_norm": 0.5209488868713379, + "learning_rate": 0.0002, + "loss": 1.6136, + "step": 3080 + }, + { + "epoch": 2.39813736903376, + "grad_norm": 0.5110517740249634, + "learning_rate": 0.0002, + "loss": 1.597, + "step": 3090 + }, + { + "epoch": 2.4058983313930926, + "grad_norm": 0.5609337091445923, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 3100 + }, + { + "epoch": 2.4136592937524255, + "grad_norm": 0.5191826224327087, + "learning_rate": 0.0002, + "loss": 1.5438, + "step": 3110 + }, + { + "epoch": 2.421420256111758, + "grad_norm": 0.4876069724559784, + "learning_rate": 0.0002, + "loss": 1.6347, + "step": 3120 + }, + { + "epoch": 2.4291812184710904, + "grad_norm": 0.4713933765888214, + "learning_rate": 0.0002, + "loss": 1.5565, + "step": 3130 + }, + { + "epoch": 2.436942180830423, + "grad_norm": 0.5102227330207825, + "learning_rate": 0.0002, + "loss": 1.6388, + "step": 3140 + }, + { + "epoch": 2.4447031431897557, + "grad_norm": 0.44546666741371155, + "learning_rate": 0.0002, + "loss": 1.5667, + "step": 3150 + }, + { + "epoch": 2.452464105549088, + "grad_norm": 0.5167558193206787, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 3160 + }, + { + "epoch": 2.4602250679084205, + "grad_norm": 0.5226958990097046, + "learning_rate": 0.0002, + "loss": 1.5673, + "step": 3170 + }, + { + "epoch": 2.4679860302677534, + "grad_norm": 0.4751799702644348, + "learning_rate": 0.0002, + "loss": 1.5758, + "step": 3180 + }, + { + "epoch": 2.475746992627086, + "grad_norm": 0.4744729697704315, + "learning_rate": 0.0002, + "loss": 1.6234, + "step": 3190 + }, + { + "epoch": 2.4835079549864183, + "grad_norm": 0.5203230381011963, + "learning_rate": 0.0002, + "loss": 1.5661, + "step": 3200 + }, + { + "epoch": 2.4912689173457507, + "grad_norm": 0.47209781408309937, + "learning_rate": 0.0002, + "loss": 1.493, + "step": 3210 + }, + { + "epoch": 2.4990298797050836, + "grad_norm": 0.5241674780845642, + "learning_rate": 0.0002, + "loss": 1.6415, + "step": 3220 + }, + { + "epoch": 2.506790842064416, + "grad_norm": 0.5152244567871094, + "learning_rate": 0.0002, + "loss": 1.6324, + "step": 3230 + }, + { + "epoch": 2.5145518044237485, + "grad_norm": 0.5216741561889648, + "learning_rate": 0.0002, + "loss": 1.6248, + "step": 3240 + }, + { + "epoch": 2.522312766783081, + "grad_norm": 0.4953259527683258, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 3250 + }, + { + "epoch": 2.530073729142414, + "grad_norm": 0.5973829030990601, + "learning_rate": 0.0002, + "loss": 1.666, + "step": 3260 + }, + { + "epoch": 2.5378346915017462, + "grad_norm": 0.48804202675819397, + "learning_rate": 0.0002, + "loss": 1.5295, + "step": 3270 + }, + { + "epoch": 2.5455956538610787, + "grad_norm": 0.5334644317626953, + "learning_rate": 0.0002, + "loss": 1.4954, + "step": 3280 + }, + { + "epoch": 2.5533566162204115, + "grad_norm": 0.46873313188552856, + "learning_rate": 0.0002, + "loss": 1.5814, + "step": 3290 + }, + { + "epoch": 2.561117578579744, + "grad_norm": 0.4282589554786682, + "learning_rate": 0.0002, + "loss": 1.5362, + "step": 3300 + }, + { + "epoch": 2.5688785409390764, + "grad_norm": 0.4848293960094452, + "learning_rate": 0.0002, + "loss": 1.6278, + "step": 3310 + }, + { + "epoch": 2.576639503298409, + "grad_norm": 0.5093745589256287, + "learning_rate": 0.0002, + "loss": 1.6308, + "step": 3320 + }, + { + "epoch": 2.5844004656577413, + "grad_norm": 0.5084842443466187, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 3330 + }, + { + "epoch": 2.592161428017074, + "grad_norm": 0.4696281850337982, + "learning_rate": 0.0002, + "loss": 1.6168, + "step": 3340 + }, + { + "epoch": 2.5999223903764066, + "grad_norm": 0.5767765641212463, + "learning_rate": 0.0002, + "loss": 1.5359, + "step": 3350 + }, + { + "epoch": 2.607683352735739, + "grad_norm": 0.47300875186920166, + "learning_rate": 0.0002, + "loss": 1.6097, + "step": 3360 + }, + { + "epoch": 2.615444315095072, + "grad_norm": 0.4809158146381378, + "learning_rate": 0.0002, + "loss": 1.6138, + "step": 3370 + }, + { + "epoch": 2.6232052774544043, + "grad_norm": 0.5141063928604126, + "learning_rate": 0.0002, + "loss": 1.4952, + "step": 3380 + }, + { + "epoch": 2.630966239813737, + "grad_norm": 0.4832935035228729, + "learning_rate": 0.0002, + "loss": 1.5784, + "step": 3390 + }, + { + "epoch": 2.6387272021730697, + "grad_norm": 0.5044625401496887, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 3400 + }, + { + "epoch": 2.646488164532402, + "grad_norm": 0.5287680625915527, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 3410 + }, + { + "epoch": 2.6542491268917345, + "grad_norm": 0.5306379795074463, + "learning_rate": 0.0002, + "loss": 1.5423, + "step": 3420 + }, + { + "epoch": 2.662010089251067, + "grad_norm": 0.5849291682243347, + "learning_rate": 0.0002, + "loss": 1.5264, + "step": 3430 + }, + { + "epoch": 2.6697710516104, + "grad_norm": 0.7951080799102783, + "learning_rate": 0.0002, + "loss": 1.5937, + "step": 3440 + }, + { + "epoch": 2.6775320139697323, + "grad_norm": 0.48087653517723083, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 3450 + }, + { + "epoch": 2.6852929763290647, + "grad_norm": 0.5396431684494019, + "learning_rate": 0.0002, + "loss": 1.6769, + "step": 3460 + }, + { + "epoch": 2.693053938688397, + "grad_norm": 0.5481634736061096, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 3470 + }, + { + "epoch": 2.70081490104773, + "grad_norm": 0.5068731307983398, + "learning_rate": 0.0002, + "loss": 1.6436, + "step": 3480 + }, + { + "epoch": 2.7085758634070625, + "grad_norm": 0.5759826898574829, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 3490 + }, + { + "epoch": 2.716336825766395, + "grad_norm": 0.7253932952880859, + "learning_rate": 0.0002, + "loss": 1.596, + "step": 3500 + }, + { + "epoch": 2.724097788125728, + "grad_norm": 0.527745246887207, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 3510 + }, + { + "epoch": 2.73185875048506, + "grad_norm": 0.5279242396354675, + "learning_rate": 0.0002, + "loss": 1.5874, + "step": 3520 + }, + { + "epoch": 2.7396197128443927, + "grad_norm": 0.5047839283943176, + "learning_rate": 0.0002, + "loss": 1.6768, + "step": 3530 + }, + { + "epoch": 2.7473806752037255, + "grad_norm": 0.5430883169174194, + "learning_rate": 0.0002, + "loss": 1.5517, + "step": 3540 + }, + { + "epoch": 2.755141637563058, + "grad_norm": 0.4496723711490631, + "learning_rate": 0.0002, + "loss": 1.5624, + "step": 3550 + }, + { + "epoch": 2.7629025999223904, + "grad_norm": 0.5063338875770569, + "learning_rate": 0.0002, + "loss": 1.5789, + "step": 3560 + }, + { + "epoch": 2.770663562281723, + "grad_norm": 0.4619026780128479, + "learning_rate": 0.0002, + "loss": 1.52, + "step": 3570 + }, + { + "epoch": 2.7784245246410553, + "grad_norm": 0.4753304123878479, + "learning_rate": 0.0002, + "loss": 1.5793, + "step": 3580 + }, + { + "epoch": 2.786185487000388, + "grad_norm": 0.5422708988189697, + "learning_rate": 0.0002, + "loss": 1.5715, + "step": 3590 + }, + { + "epoch": 2.7939464493597206, + "grad_norm": 0.4756578803062439, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 3600 + }, + { + "epoch": 2.801707411719053, + "grad_norm": 0.5057567358016968, + "learning_rate": 0.0002, + "loss": 1.5358, + "step": 3610 + }, + { + "epoch": 2.809468374078386, + "grad_norm": 0.5410919785499573, + "learning_rate": 0.0002, + "loss": 1.6131, + "step": 3620 + }, + { + "epoch": 2.8172293364377183, + "grad_norm": 0.4958136975765228, + "learning_rate": 0.0002, + "loss": 1.5573, + "step": 3630 + }, + { + "epoch": 2.8249902987970508, + "grad_norm": 0.454527348279953, + "learning_rate": 0.0002, + "loss": 1.6324, + "step": 3640 + }, + { + "epoch": 2.8327512611563836, + "grad_norm": 0.5092706084251404, + "learning_rate": 0.0002, + "loss": 1.5582, + "step": 3650 + }, + { + "epoch": 2.840512223515716, + "grad_norm": 0.5314022302627563, + "learning_rate": 0.0002, + "loss": 1.5893, + "step": 3660 + }, + { + "epoch": 2.8482731858750485, + "grad_norm": 0.5028239488601685, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 3670 + }, + { + "epoch": 2.856034148234381, + "grad_norm": 0.5127444863319397, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 3680 + }, + { + "epoch": 2.8637951105937134, + "grad_norm": 0.5045645236968994, + "learning_rate": 0.0002, + "loss": 1.6018, + "step": 3690 + }, + { + "epoch": 2.8715560729530463, + "grad_norm": 0.5560781955718994, + "learning_rate": 0.0002, + "loss": 1.5788, + "step": 3700 + }, + { + "epoch": 2.8793170353123787, + "grad_norm": 0.5177600383758545, + "learning_rate": 0.0002, + "loss": 1.5988, + "step": 3710 + }, + { + "epoch": 2.887077997671711, + "grad_norm": 0.45830899477005005, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 3720 + }, + { + "epoch": 2.894838960031044, + "grad_norm": 0.4828629195690155, + "learning_rate": 0.0002, + "loss": 1.6344, + "step": 3730 + }, + { + "epoch": 2.9025999223903765, + "grad_norm": 0.48241183161735535, + "learning_rate": 0.0002, + "loss": 1.6758, + "step": 3740 + }, + { + "epoch": 2.910360884749709, + "grad_norm": 0.4909592568874359, + "learning_rate": 0.0002, + "loss": 1.5649, + "step": 3750 + }, + { + "epoch": 2.9181218471090418, + "grad_norm": 0.44677025079727173, + "learning_rate": 0.0002, + "loss": 1.4927, + "step": 3760 + }, + { + "epoch": 2.925882809468374, + "grad_norm": 0.4928834140300751, + "learning_rate": 0.0002, + "loss": 1.5067, + "step": 3770 + }, + { + "epoch": 2.9336437718277066, + "grad_norm": 0.5673553347587585, + "learning_rate": 0.0002, + "loss": 1.5843, + "step": 3780 + }, + { + "epoch": 2.941404734187039, + "grad_norm": 0.548190712928772, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 3790 + }, + { + "epoch": 2.9491656965463715, + "grad_norm": 0.48979803919792175, + "learning_rate": 0.0002, + "loss": 1.5892, + "step": 3800 + }, + { + "epoch": 2.9569266589057044, + "grad_norm": 0.533191978931427, + "learning_rate": 0.0002, + "loss": 1.5589, + "step": 3810 + }, + { + "epoch": 2.964687621265037, + "grad_norm": 0.5362946391105652, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 3820 + }, + { + "epoch": 2.9724485836243693, + "grad_norm": 0.4724906384944916, + "learning_rate": 0.0002, + "loss": 1.6602, + "step": 3830 + }, + { + "epoch": 2.980209545983702, + "grad_norm": 0.5468461513519287, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 3840 + }, + { + "epoch": 2.9879705083430346, + "grad_norm": 0.4697108864784241, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 3850 + }, + { + "epoch": 2.995731470702367, + "grad_norm": 0.4780906140804291, + "learning_rate": 0.0002, + "loss": 1.6312, + "step": 3860 + }, + { + "epoch": 2.9996119518820334, + "eval_loss": 1.8472607135772705, + "eval_runtime": 106.5541, + "eval_samples_per_second": 4.758, + "eval_steps_per_second": 0.601, + "step": 3865 + }, + { + "epoch": 3.0034924330616994, + "grad_norm": 0.5645653605461121, + "learning_rate": 0.0002, + "loss": 1.4983, + "step": 3870 + }, + { + "epoch": 3.0112533954210323, + "grad_norm": 0.6457151174545288, + "learning_rate": 0.0002, + "loss": 1.4334, + "step": 3880 + }, + { + "epoch": 3.0190143577803648, + "grad_norm": 0.583838164806366, + "learning_rate": 0.0002, + "loss": 1.3899, + "step": 3890 + }, + { + "epoch": 3.026775320139697, + "grad_norm": 0.6819260120391846, + "learning_rate": 0.0002, + "loss": 1.3258, + "step": 3900 + }, + { + "epoch": 3.03453628249903, + "grad_norm": 0.6692903637886047, + "learning_rate": 0.0002, + "loss": 1.3458, + "step": 3910 + }, + { + "epoch": 3.0422972448583625, + "grad_norm": 0.6101024746894836, + "learning_rate": 0.0002, + "loss": 1.4356, + "step": 3920 + }, + { + "epoch": 3.050058207217695, + "grad_norm": 0.7014093399047852, + "learning_rate": 0.0002, + "loss": 1.394, + "step": 3930 + }, + { + "epoch": 3.0578191695770274, + "grad_norm": 0.7380381226539612, + "learning_rate": 0.0002, + "loss": 1.3885, + "step": 3940 + }, + { + "epoch": 3.0655801319363603, + "grad_norm": 0.6607900857925415, + "learning_rate": 0.0002, + "loss": 1.4206, + "step": 3950 + }, + { + "epoch": 3.0733410942956927, + "grad_norm": 0.735263466835022, + "learning_rate": 0.0002, + "loss": 1.4293, + "step": 3960 + }, + { + "epoch": 3.081102056655025, + "grad_norm": 0.6788513660430908, + "learning_rate": 0.0002, + "loss": 1.3966, + "step": 3970 + }, + { + "epoch": 3.088863019014358, + "grad_norm": 0.6347652673721313, + "learning_rate": 0.0002, + "loss": 1.3435, + "step": 3980 + }, + { + "epoch": 3.0966239813736904, + "grad_norm": 0.7056642770767212, + "learning_rate": 0.0002, + "loss": 1.4518, + "step": 3990 + }, + { + "epoch": 3.104384943733023, + "grad_norm": 0.6387075185775757, + "learning_rate": 0.0002, + "loss": 1.4474, + "step": 4000 + }, + { + "epoch": 3.1121459060923553, + "grad_norm": 0.6701116561889648, + "learning_rate": 0.0002, + "loss": 1.3833, + "step": 4010 + }, + { + "epoch": 3.119906868451688, + "grad_norm": 0.7558449506759644, + "learning_rate": 0.0002, + "loss": 1.404, + "step": 4020 + }, + { + "epoch": 3.1276678308110206, + "grad_norm": 0.6612881422042847, + "learning_rate": 0.0002, + "loss": 1.3294, + "step": 4030 + }, + { + "epoch": 3.135428793170353, + "grad_norm": 0.7474587559700012, + "learning_rate": 0.0002, + "loss": 1.439, + "step": 4040 + }, + { + "epoch": 3.1431897555296855, + "grad_norm": 0.7292373776435852, + "learning_rate": 0.0002, + "loss": 1.4616, + "step": 4050 + }, + { + "epoch": 3.1509507178890184, + "grad_norm": 0.7432886958122253, + "learning_rate": 0.0002, + "loss": 1.3908, + "step": 4060 + }, + { + "epoch": 3.158711680248351, + "grad_norm": 0.6366098523139954, + "learning_rate": 0.0002, + "loss": 1.4214, + "step": 4070 + }, + { + "epoch": 3.1664726426076832, + "grad_norm": 0.6837611794471741, + "learning_rate": 0.0002, + "loss": 1.5044, + "step": 4080 + }, + { + "epoch": 3.174233604967016, + "grad_norm": 0.7194393277168274, + "learning_rate": 0.0002, + "loss": 1.4332, + "step": 4090 + }, + { + "epoch": 3.1819945673263486, + "grad_norm": 0.6963607668876648, + "learning_rate": 0.0002, + "loss": 1.3628, + "step": 4100 + }, + { + "epoch": 3.189755529685681, + "grad_norm": 0.6404902935028076, + "learning_rate": 0.0002, + "loss": 1.4127, + "step": 4110 + }, + { + "epoch": 3.1975164920450134, + "grad_norm": 0.7172070741653442, + "learning_rate": 0.0002, + "loss": 1.4394, + "step": 4120 + }, + { + "epoch": 3.2052774544043463, + "grad_norm": 0.6577759385108948, + "learning_rate": 0.0002, + "loss": 1.4658, + "step": 4130 + }, + { + "epoch": 3.2130384167636787, + "grad_norm": 0.6658480167388916, + "learning_rate": 0.0002, + "loss": 1.4019, + "step": 4140 + }, + { + "epoch": 3.220799379123011, + "grad_norm": 0.6771699786186218, + "learning_rate": 0.0002, + "loss": 1.4348, + "step": 4150 + }, + { + "epoch": 3.2285603414823436, + "grad_norm": 0.699035108089447, + "learning_rate": 0.0002, + "loss": 1.4736, + "step": 4160 + }, + { + "epoch": 3.2363213038416765, + "grad_norm": 0.7218514680862427, + "learning_rate": 0.0002, + "loss": 1.4096, + "step": 4170 + }, + { + "epoch": 3.244082266201009, + "grad_norm": 0.6270631551742554, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 4180 + }, + { + "epoch": 3.2518432285603414, + "grad_norm": 0.6828921437263489, + "learning_rate": 0.0002, + "loss": 1.4076, + "step": 4190 + }, + { + "epoch": 3.2596041909196742, + "grad_norm": 0.6005498170852661, + "learning_rate": 0.0002, + "loss": 1.4663, + "step": 4200 + }, + { + "epoch": 3.2673651532790067, + "grad_norm": 0.6974790692329407, + "learning_rate": 0.0002, + "loss": 1.4798, + "step": 4210 + }, + { + "epoch": 3.275126115638339, + "grad_norm": 0.7269543409347534, + "learning_rate": 0.0002, + "loss": 1.5012, + "step": 4220 + }, + { + "epoch": 3.2828870779976715, + "grad_norm": 0.6728787422180176, + "learning_rate": 0.0002, + "loss": 1.3848, + "step": 4230 + }, + { + "epoch": 3.2906480403570044, + "grad_norm": 0.676972508430481, + "learning_rate": 0.0002, + "loss": 1.4112, + "step": 4240 + }, + { + "epoch": 3.298409002716337, + "grad_norm": 0.748309314250946, + "learning_rate": 0.0002, + "loss": 1.4206, + "step": 4250 + }, + { + "epoch": 3.3061699650756693, + "grad_norm": 0.6976589560508728, + "learning_rate": 0.0002, + "loss": 1.4973, + "step": 4260 + }, + { + "epoch": 3.3139309274350017, + "grad_norm": 0.649780809879303, + "learning_rate": 0.0002, + "loss": 1.3967, + "step": 4270 + }, + { + "epoch": 3.3216918897943346, + "grad_norm": 0.6529902815818787, + "learning_rate": 0.0002, + "loss": 1.327, + "step": 4280 + }, + { + "epoch": 3.329452852153667, + "grad_norm": 0.9273163676261902, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 4290 + }, + { + "epoch": 3.3372138145129995, + "grad_norm": 0.717024028301239, + "learning_rate": 0.0002, + "loss": 1.4859, + "step": 4300 + }, + { + "epoch": 3.3449747768723324, + "grad_norm": 0.7914950251579285, + "learning_rate": 0.0002, + "loss": 1.4441, + "step": 4310 + }, + { + "epoch": 3.352735739231665, + "grad_norm": 0.7133203148841858, + "learning_rate": 0.0002, + "loss": 1.432, + "step": 4320 + }, + { + "epoch": 3.3604967015909972, + "grad_norm": 0.7409568428993225, + "learning_rate": 0.0002, + "loss": 1.4662, + "step": 4330 + }, + { + "epoch": 3.3682576639503297, + "grad_norm": 0.6993981003761292, + "learning_rate": 0.0002, + "loss": 1.3992, + "step": 4340 + }, + { + "epoch": 3.3760186263096625, + "grad_norm": 0.7114535570144653, + "learning_rate": 0.0002, + "loss": 1.4261, + "step": 4350 + }, + { + "epoch": 3.383779588668995, + "grad_norm": 0.6790860295295715, + "learning_rate": 0.0002, + "loss": 1.4227, + "step": 4360 + }, + { + "epoch": 3.3915405510283274, + "grad_norm": 0.6507849097251892, + "learning_rate": 0.0002, + "loss": 1.4128, + "step": 4370 + }, + { + "epoch": 3.39930151338766, + "grad_norm": 0.5967804193496704, + "learning_rate": 0.0002, + "loss": 1.4559, + "step": 4380 + }, + { + "epoch": 3.4070624757469927, + "grad_norm": 0.6625847816467285, + "learning_rate": 0.0002, + "loss": 1.3687, + "step": 4390 + }, + { + "epoch": 3.414823438106325, + "grad_norm": 0.6736508011817932, + "learning_rate": 0.0002, + "loss": 1.4193, + "step": 4400 + }, + { + "epoch": 3.4225844004656576, + "grad_norm": 0.7870860695838928, + "learning_rate": 0.0002, + "loss": 1.4363, + "step": 4410 + }, + { + "epoch": 3.4303453628249905, + "grad_norm": 0.7205295562744141, + "learning_rate": 0.0002, + "loss": 1.4114, + "step": 4420 + }, + { + "epoch": 3.438106325184323, + "grad_norm": 0.6634634137153625, + "learning_rate": 0.0002, + "loss": 1.4131, + "step": 4430 + }, + { + "epoch": 3.4458672875436553, + "grad_norm": 0.7562733292579651, + "learning_rate": 0.0002, + "loss": 1.4683, + "step": 4440 + }, + { + "epoch": 3.453628249902988, + "grad_norm": 0.6585879921913147, + "learning_rate": 0.0002, + "loss": 1.3486, + "step": 4450 + }, + { + "epoch": 3.4613892122623207, + "grad_norm": 0.6896792054176331, + "learning_rate": 0.0002, + "loss": 1.4283, + "step": 4460 + }, + { + "epoch": 3.469150174621653, + "grad_norm": 0.6520342230796814, + "learning_rate": 0.0002, + "loss": 1.4208, + "step": 4470 + }, + { + "epoch": 3.4769111369809855, + "grad_norm": 0.6760806441307068, + "learning_rate": 0.0002, + "loss": 1.3423, + "step": 4480 + }, + { + "epoch": 3.484672099340318, + "grad_norm": 0.7539774179458618, + "learning_rate": 0.0002, + "loss": 1.4398, + "step": 4490 + }, + { + "epoch": 3.492433061699651, + "grad_norm": 0.7409411668777466, + "learning_rate": 0.0002, + "loss": 1.4534, + "step": 4500 + }, + { + "epoch": 3.5001940240589833, + "grad_norm": 0.6876253485679626, + "learning_rate": 0.0002, + "loss": 1.4069, + "step": 4510 + }, + { + "epoch": 3.5079549864183157, + "grad_norm": 0.7028461694717407, + "learning_rate": 0.0002, + "loss": 1.4228, + "step": 4520 + }, + { + "epoch": 3.5157159487776486, + "grad_norm": 0.8056529760360718, + "learning_rate": 0.0002, + "loss": 1.4723, + "step": 4530 + }, + { + "epoch": 3.523476911136981, + "grad_norm": 0.711338996887207, + "learning_rate": 0.0002, + "loss": 1.4148, + "step": 4540 + }, + { + "epoch": 3.5312378734963135, + "grad_norm": 0.7343552708625793, + "learning_rate": 0.0002, + "loss": 1.5247, + "step": 4550 + }, + { + "epoch": 3.5389988358556463, + "grad_norm": 0.745479941368103, + "learning_rate": 0.0002, + "loss": 1.4308, + "step": 4560 + }, + { + "epoch": 3.5467597982149788, + "grad_norm": 0.7582294940948486, + "learning_rate": 0.0002, + "loss": 1.4229, + "step": 4570 + }, + { + "epoch": 3.554520760574311, + "grad_norm": 0.6717444658279419, + "learning_rate": 0.0002, + "loss": 1.4127, + "step": 4580 + }, + { + "epoch": 3.5622817229336436, + "grad_norm": 0.7417883276939392, + "learning_rate": 0.0002, + "loss": 1.4368, + "step": 4590 + }, + { + "epoch": 3.570042685292976, + "grad_norm": 0.6385737061500549, + "learning_rate": 0.0002, + "loss": 1.4176, + "step": 4600 + }, + { + "epoch": 3.577803647652309, + "grad_norm": 0.716704249382019, + "learning_rate": 0.0002, + "loss": 1.3981, + "step": 4610 + }, + { + "epoch": 3.5855646100116414, + "grad_norm": 0.6948980093002319, + "learning_rate": 0.0002, + "loss": 1.3889, + "step": 4620 + }, + { + "epoch": 3.593325572370974, + "grad_norm": 0.6961140036582947, + "learning_rate": 0.0002, + "loss": 1.5177, + "step": 4630 + }, + { + "epoch": 3.6010865347303067, + "grad_norm": 0.7493122220039368, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 4640 + }, + { + "epoch": 3.608847497089639, + "grad_norm": 0.7431658506393433, + "learning_rate": 0.0002, + "loss": 1.3987, + "step": 4650 + }, + { + "epoch": 3.6166084594489716, + "grad_norm": 0.8353387713432312, + "learning_rate": 0.0002, + "loss": 1.4551, + "step": 4660 + }, + { + "epoch": 3.6243694218083045, + "grad_norm": 0.7095612287521362, + "learning_rate": 0.0002, + "loss": 1.4533, + "step": 4670 + }, + { + "epoch": 3.632130384167637, + "grad_norm": 0.776620090007782, + "learning_rate": 0.0002, + "loss": 1.4003, + "step": 4680 + }, + { + "epoch": 3.6398913465269693, + "grad_norm": 0.7198925018310547, + "learning_rate": 0.0002, + "loss": 1.4361, + "step": 4690 + }, + { + "epoch": 3.6476523088863018, + "grad_norm": 0.8238834738731384, + "learning_rate": 0.0002, + "loss": 1.4543, + "step": 4700 + }, + { + "epoch": 3.655413271245634, + "grad_norm": 0.6804245710372925, + "learning_rate": 0.0002, + "loss": 1.3958, + "step": 4710 + }, + { + "epoch": 3.663174233604967, + "grad_norm": 0.8444845676422119, + "learning_rate": 0.0002, + "loss": 1.4158, + "step": 4720 + }, + { + "epoch": 3.6709351959642995, + "grad_norm": 0.743797779083252, + "learning_rate": 0.0002, + "loss": 1.3825, + "step": 4730 + }, + { + "epoch": 3.678696158323632, + "grad_norm": 0.8994188904762268, + "learning_rate": 0.0002, + "loss": 1.4213, + "step": 4740 + }, + { + "epoch": 3.686457120682965, + "grad_norm": 0.75416100025177, + "learning_rate": 0.0002, + "loss": 1.4281, + "step": 4750 + }, + { + "epoch": 3.6942180830422973, + "grad_norm": 0.6499266028404236, + "learning_rate": 0.0002, + "loss": 1.4154, + "step": 4760 + }, + { + "epoch": 3.7019790454016297, + "grad_norm": 0.7246791124343872, + "learning_rate": 0.0002, + "loss": 1.4005, + "step": 4770 + }, + { + "epoch": 3.7097400077609626, + "grad_norm": 0.7831124067306519, + "learning_rate": 0.0002, + "loss": 1.426, + "step": 4780 + }, + { + "epoch": 3.717500970120295, + "grad_norm": 0.7130028009414673, + "learning_rate": 0.0002, + "loss": 1.3933, + "step": 4790 + }, + { + "epoch": 3.7252619324796274, + "grad_norm": 0.7501602172851562, + "learning_rate": 0.0002, + "loss": 1.4632, + "step": 4800 + }, + { + "epoch": 3.73302289483896, + "grad_norm": 0.6980932950973511, + "learning_rate": 0.0002, + "loss": 1.4985, + "step": 4810 + }, + { + "epoch": 3.7407838571982923, + "grad_norm": 0.8050530552864075, + "learning_rate": 0.0002, + "loss": 1.4517, + "step": 4820 + }, + { + "epoch": 3.748544819557625, + "grad_norm": 0.6385579705238342, + "learning_rate": 0.0002, + "loss": 1.4703, + "step": 4830 + }, + { + "epoch": 3.7563057819169576, + "grad_norm": 0.6664714813232422, + "learning_rate": 0.0002, + "loss": 1.5281, + "step": 4840 + }, + { + "epoch": 3.76406674427629, + "grad_norm": 0.7125676274299622, + "learning_rate": 0.0002, + "loss": 1.4443, + "step": 4850 + }, + { + "epoch": 3.771827706635623, + "grad_norm": 0.7231866717338562, + "learning_rate": 0.0002, + "loss": 1.3958, + "step": 4860 + }, + { + "epoch": 3.7795886689949554, + "grad_norm": 0.6917183995246887, + "learning_rate": 0.0002, + "loss": 1.4446, + "step": 4870 + }, + { + "epoch": 3.787349631354288, + "grad_norm": 0.665037989616394, + "learning_rate": 0.0002, + "loss": 1.4369, + "step": 4880 + }, + { + "epoch": 3.7951105937136207, + "grad_norm": 0.5837726593017578, + "learning_rate": 0.0002, + "loss": 1.4193, + "step": 4890 + }, + { + "epoch": 3.802871556072953, + "grad_norm": 0.6366701722145081, + "learning_rate": 0.0002, + "loss": 1.4176, + "step": 4900 + }, + { + "epoch": 3.8106325184322856, + "grad_norm": 0.7082223892211914, + "learning_rate": 0.0002, + "loss": 1.46, + "step": 4910 + }, + { + "epoch": 3.818393480791618, + "grad_norm": 0.8101672530174255, + "learning_rate": 0.0002, + "loss": 1.5139, + "step": 4920 + }, + { + "epoch": 3.826154443150951, + "grad_norm": 0.7516148090362549, + "learning_rate": 0.0002, + "loss": 1.3659, + "step": 4930 + }, + { + "epoch": 3.8339154055102833, + "grad_norm": 0.7928489446640015, + "learning_rate": 0.0002, + "loss": 1.3909, + "step": 4940 + }, + { + "epoch": 3.8416763678696157, + "grad_norm": 0.6892234683036804, + "learning_rate": 0.0002, + "loss": 1.4255, + "step": 4950 + }, + { + "epoch": 3.849437330228948, + "grad_norm": 0.6381304264068604, + "learning_rate": 0.0002, + "loss": 1.5024, + "step": 4960 + }, + { + "epoch": 3.857198292588281, + "grad_norm": 0.8068831562995911, + "learning_rate": 0.0002, + "loss": 1.4873, + "step": 4970 + }, + { + "epoch": 3.8649592549476135, + "grad_norm": 0.7289869785308838, + "learning_rate": 0.0002, + "loss": 1.45, + "step": 4980 + }, + { + "epoch": 3.872720217306946, + "grad_norm": 0.7278549075126648, + "learning_rate": 0.0002, + "loss": 1.398, + "step": 4990 + }, + { + "epoch": 3.880481179666279, + "grad_norm": 0.7324236631393433, + "learning_rate": 0.0002, + "loss": 1.4442, + "step": 5000 + }, + { + "epoch": 3.8882421420256112, + "grad_norm": 0.6759871244430542, + "learning_rate": 0.0002, + "loss": 1.4511, + "step": 5010 + }, + { + "epoch": 3.8960031043849437, + "grad_norm": 0.8159207701683044, + "learning_rate": 0.0002, + "loss": 1.4705, + "step": 5020 + }, + { + "epoch": 3.9037640667442766, + "grad_norm": 0.6536211967468262, + "learning_rate": 0.0002, + "loss": 1.4685, + "step": 5030 + }, + { + "epoch": 3.911525029103609, + "grad_norm": 0.6827932000160217, + "learning_rate": 0.0002, + "loss": 1.4335, + "step": 5040 + }, + { + "epoch": 3.9192859914629414, + "grad_norm": 0.6688340306282043, + "learning_rate": 0.0002, + "loss": 1.433, + "step": 5050 + }, + { + "epoch": 3.927046953822274, + "grad_norm": 0.6385695934295654, + "learning_rate": 0.0002, + "loss": 1.4099, + "step": 5060 + }, + { + "epoch": 3.9348079161816063, + "grad_norm": 0.6975107192993164, + "learning_rate": 0.0002, + "loss": 1.4767, + "step": 5070 + }, + { + "epoch": 3.942568878540939, + "grad_norm": 0.6684112548828125, + "learning_rate": 0.0002, + "loss": 1.4893, + "step": 5080 + }, + { + "epoch": 3.9503298409002716, + "grad_norm": 0.8349628448486328, + "learning_rate": 0.0002, + "loss": 1.4732, + "step": 5090 + }, + { + "epoch": 3.958090803259604, + "grad_norm": 0.7146425843238831, + "learning_rate": 0.0002, + "loss": 1.5131, + "step": 5100 + }, + { + "epoch": 3.965851765618937, + "grad_norm": 0.6555036902427673, + "learning_rate": 0.0002, + "loss": 1.4149, + "step": 5110 + }, + { + "epoch": 3.9736127279782694, + "grad_norm": 0.7037415504455566, + "learning_rate": 0.0002, + "loss": 1.4274, + "step": 5120 + }, + { + "epoch": 3.981373690337602, + "grad_norm": 0.7235575914382935, + "learning_rate": 0.0002, + "loss": 1.4292, + "step": 5130 + }, + { + "epoch": 3.9891346526969347, + "grad_norm": 0.7092325687408447, + "learning_rate": 0.0002, + "loss": 1.4455, + "step": 5140 + }, + { + "epoch": 3.996895615056267, + "grad_norm": 0.7490319609642029, + "learning_rate": 0.0002, + "loss": 1.4512, + "step": 5150 + }, + { + "epoch": 4.0, + "eval_loss": 1.9131355285644531, + "eval_runtime": 105.5778, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 0.606, + "step": 5154 + }, + { + "epoch": 4.0046565774155995, + "grad_norm": 0.7075854539871216, + "learning_rate": 0.0002, + "loss": 1.2643, + "step": 5160 + }, + { + "epoch": 4.012417539774932, + "grad_norm": 0.9466007351875305, + "learning_rate": 0.0002, + "loss": 1.209, + "step": 5170 + }, + { + "epoch": 4.020178502134264, + "grad_norm": 1.0297044515609741, + "learning_rate": 0.0002, + "loss": 1.2567, + "step": 5180 + }, + { + "epoch": 4.027939464493597, + "grad_norm": 0.7765059471130371, + "learning_rate": 0.0002, + "loss": 1.1796, + "step": 5190 + }, + { + "epoch": 4.03570042685293, + "grad_norm": 0.995760977268219, + "learning_rate": 0.0002, + "loss": 1.2356, + "step": 5200 + }, + { + "epoch": 4.043461389212262, + "grad_norm": 0.8663829565048218, + "learning_rate": 0.0002, + "loss": 1.1792, + "step": 5210 + }, + { + "epoch": 4.051222351571595, + "grad_norm": 1.0660825967788696, + "learning_rate": 0.0002, + "loss": 1.2471, + "step": 5220 + }, + { + "epoch": 4.058983313930927, + "grad_norm": 0.9858174920082092, + "learning_rate": 0.0002, + "loss": 1.1676, + "step": 5230 + }, + { + "epoch": 4.06674427629026, + "grad_norm": 0.8911338448524475, + "learning_rate": 0.0002, + "loss": 1.2448, + "step": 5240 + }, + { + "epoch": 4.074505238649593, + "grad_norm": 1.0848394632339478, + "learning_rate": 0.0002, + "loss": 1.1858, + "step": 5250 + }, + { + "epoch": 4.082266201008925, + "grad_norm": 1.0849905014038086, + "learning_rate": 0.0002, + "loss": 1.1684, + "step": 5260 + }, + { + "epoch": 4.090027163368258, + "grad_norm": 1.0497841835021973, + "learning_rate": 0.0002, + "loss": 1.2007, + "step": 5270 + }, + { + "epoch": 4.0977881257275905, + "grad_norm": 0.8943053483963013, + "learning_rate": 0.0002, + "loss": 1.2552, + "step": 5280 + }, + { + "epoch": 4.1055490880869225, + "grad_norm": 0.8432527184486389, + "learning_rate": 0.0002, + "loss": 1.1923, + "step": 5290 + }, + { + "epoch": 4.113310050446255, + "grad_norm": 0.9690414667129517, + "learning_rate": 0.0002, + "loss": 1.1634, + "step": 5300 + }, + { + "epoch": 4.121071012805588, + "grad_norm": 0.7790773510932922, + "learning_rate": 0.0002, + "loss": 1.3019, + "step": 5310 + }, + { + "epoch": 4.12883197516492, + "grad_norm": 0.9289211630821228, + "learning_rate": 0.0002, + "loss": 1.1806, + "step": 5320 + }, + { + "epoch": 4.136592937524253, + "grad_norm": 1.0785125494003296, + "learning_rate": 0.0002, + "loss": 1.1458, + "step": 5330 + }, + { + "epoch": 4.144353899883585, + "grad_norm": 0.8559591770172119, + "learning_rate": 0.0002, + "loss": 1.2086, + "step": 5340 + }, + { + "epoch": 4.152114862242918, + "grad_norm": 0.9405956268310547, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 5350 + }, + { + "epoch": 4.159875824602251, + "grad_norm": 0.9942827820777893, + "learning_rate": 0.0002, + "loss": 1.1793, + "step": 5360 + }, + { + "epoch": 4.167636786961583, + "grad_norm": 0.9141933917999268, + "learning_rate": 0.0002, + "loss": 1.1659, + "step": 5370 + }, + { + "epoch": 4.175397749320916, + "grad_norm": 0.8206015229225159, + "learning_rate": 0.0002, + "loss": 1.1647, + "step": 5380 + }, + { + "epoch": 4.183158711680249, + "grad_norm": 0.9340888857841492, + "learning_rate": 0.0002, + "loss": 1.2778, + "step": 5390 + }, + { + "epoch": 4.190919674039581, + "grad_norm": 1.2122114896774292, + "learning_rate": 0.0002, + "loss": 1.2459, + "step": 5400 + }, + { + "epoch": 4.1986806363989135, + "grad_norm": 1.0661298036575317, + "learning_rate": 0.0002, + "loss": 1.2371, + "step": 5410 + }, + { + "epoch": 4.206441598758246, + "grad_norm": 0.9372861385345459, + "learning_rate": 0.0002, + "loss": 1.1978, + "step": 5420 + }, + { + "epoch": 4.214202561117578, + "grad_norm": 0.894012987613678, + "learning_rate": 0.0002, + "loss": 1.2653, + "step": 5430 + }, + { + "epoch": 4.221963523476911, + "grad_norm": 1.0647753477096558, + "learning_rate": 0.0002, + "loss": 1.387, + "step": 5440 + }, + { + "epoch": 4.229724485836243, + "grad_norm": 0.989179790019989, + "learning_rate": 0.0002, + "loss": 1.2231, + "step": 5450 + }, + { + "epoch": 4.237485448195576, + "grad_norm": 1.1601181030273438, + "learning_rate": 0.0002, + "loss": 1.2715, + "step": 5460 + }, + { + "epoch": 4.245246410554909, + "grad_norm": 0.9395585656166077, + "learning_rate": 0.0002, + "loss": 1.2406, + "step": 5470 + }, + { + "epoch": 4.253007372914241, + "grad_norm": 0.9527766108512878, + "learning_rate": 0.0002, + "loss": 1.2779, + "step": 5480 + }, + { + "epoch": 4.260768335273574, + "grad_norm": 1.0319520235061646, + "learning_rate": 0.0002, + "loss": 1.267, + "step": 5490 + }, + { + "epoch": 4.268529297632907, + "grad_norm": 0.8659824728965759, + "learning_rate": 0.0002, + "loss": 1.2633, + "step": 5500 + }, + { + "epoch": 4.276290259992239, + "grad_norm": 1.099211573600769, + "learning_rate": 0.0002, + "loss": 1.1475, + "step": 5510 + }, + { + "epoch": 4.284051222351572, + "grad_norm": 0.9363361597061157, + "learning_rate": 0.0002, + "loss": 1.2508, + "step": 5520 + }, + { + "epoch": 4.2918121847109045, + "grad_norm": 0.8437647223472595, + "learning_rate": 0.0002, + "loss": 1.189, + "step": 5530 + }, + { + "epoch": 4.2995731470702365, + "grad_norm": 0.9181258678436279, + "learning_rate": 0.0002, + "loss": 1.2212, + "step": 5540 + }, + { + "epoch": 4.307334109429569, + "grad_norm": 0.9059357643127441, + "learning_rate": 0.0002, + "loss": 1.2092, + "step": 5550 + }, + { + "epoch": 4.315095071788901, + "grad_norm": 0.9337241649627686, + "learning_rate": 0.0002, + "loss": 1.2189, + "step": 5560 + }, + { + "epoch": 4.322856034148234, + "grad_norm": 0.9428889155387878, + "learning_rate": 0.0002, + "loss": 1.2462, + "step": 5570 + }, + { + "epoch": 4.330616996507567, + "grad_norm": 1.003589153289795, + "learning_rate": 0.0002, + "loss": 1.2675, + "step": 5580 + }, + { + "epoch": 4.338377958866899, + "grad_norm": 1.1249268054962158, + "learning_rate": 0.0002, + "loss": 1.2703, + "step": 5590 + }, + { + "epoch": 4.346138921226232, + "grad_norm": 0.8623469471931458, + "learning_rate": 0.0002, + "loss": 1.2501, + "step": 5600 + }, + { + "epoch": 4.353899883585565, + "grad_norm": 1.1389174461364746, + "learning_rate": 0.0002, + "loss": 1.2404, + "step": 5610 + }, + { + "epoch": 4.361660845944897, + "grad_norm": 1.0136264562606812, + "learning_rate": 0.0002, + "loss": 1.2245, + "step": 5620 + }, + { + "epoch": 4.36942180830423, + "grad_norm": 0.9567070603370667, + "learning_rate": 0.0002, + "loss": 1.3473, + "step": 5630 + }, + { + "epoch": 4.377182770663563, + "grad_norm": 1.0592148303985596, + "learning_rate": 0.0002, + "loss": 1.2988, + "step": 5640 + }, + { + "epoch": 4.384943733022895, + "grad_norm": 1.0110485553741455, + "learning_rate": 0.0002, + "loss": 1.212, + "step": 5650 + }, + { + "epoch": 4.3927046953822275, + "grad_norm": 0.9914907217025757, + "learning_rate": 0.0002, + "loss": 1.2086, + "step": 5660 + }, + { + "epoch": 4.4004656577415595, + "grad_norm": 0.9447247982025146, + "learning_rate": 0.0002, + "loss": 1.2363, + "step": 5670 + }, + { + "epoch": 4.408226620100892, + "grad_norm": 0.9644378423690796, + "learning_rate": 0.0002, + "loss": 1.2617, + "step": 5680 + }, + { + "epoch": 4.415987582460225, + "grad_norm": 0.920676589012146, + "learning_rate": 0.0002, + "loss": 1.2773, + "step": 5690 + }, + { + "epoch": 4.423748544819557, + "grad_norm": 1.060570478439331, + "learning_rate": 0.0002, + "loss": 1.2792, + "step": 5700 + }, + { + "epoch": 4.43150950717889, + "grad_norm": 0.8857738971710205, + "learning_rate": 0.0002, + "loss": 1.2374, + "step": 5710 + }, + { + "epoch": 4.439270469538223, + "grad_norm": 1.0536398887634277, + "learning_rate": 0.0002, + "loss": 1.2588, + "step": 5720 + }, + { + "epoch": 4.447031431897555, + "grad_norm": 0.990847110748291, + "learning_rate": 0.0002, + "loss": 1.2051, + "step": 5730 + }, + { + "epoch": 4.454792394256888, + "grad_norm": 0.9692499041557312, + "learning_rate": 0.0002, + "loss": 1.2469, + "step": 5740 + }, + { + "epoch": 4.462553356616221, + "grad_norm": 1.0376402139663696, + "learning_rate": 0.0002, + "loss": 1.2269, + "step": 5750 + }, + { + "epoch": 4.470314318975553, + "grad_norm": 1.3863259553909302, + "learning_rate": 0.0002, + "loss": 1.1701, + "step": 5760 + }, + { + "epoch": 4.478075281334886, + "grad_norm": 0.978379487991333, + "learning_rate": 0.0002, + "loss": 1.2591, + "step": 5770 + }, + { + "epoch": 4.485836243694218, + "grad_norm": 1.0973085165023804, + "learning_rate": 0.0002, + "loss": 1.2729, + "step": 5780 + }, + { + "epoch": 4.4935972060535505, + "grad_norm": 1.057006597518921, + "learning_rate": 0.0002, + "loss": 1.2404, + "step": 5790 + }, + { + "epoch": 4.501358168412883, + "grad_norm": 0.9247729182243347, + "learning_rate": 0.0002, + "loss": 1.2476, + "step": 5800 + }, + { + "epoch": 4.509119130772215, + "grad_norm": 1.0447787046432495, + "learning_rate": 0.0002, + "loss": 1.2369, + "step": 5810 + }, + { + "epoch": 4.516880093131548, + "grad_norm": 1.1930429935455322, + "learning_rate": 0.0002, + "loss": 1.211, + "step": 5820 + }, + { + "epoch": 4.524641055490881, + "grad_norm": 0.9867590069770813, + "learning_rate": 0.0002, + "loss": 1.2596, + "step": 5830 + }, + { + "epoch": 4.532402017850213, + "grad_norm": 0.9591100215911865, + "learning_rate": 0.0002, + "loss": 1.2766, + "step": 5840 + }, + { + "epoch": 4.540162980209546, + "grad_norm": 0.9950753450393677, + "learning_rate": 0.0002, + "loss": 1.2154, + "step": 5850 + }, + { + "epoch": 4.547923942568879, + "grad_norm": 1.0087506771087646, + "learning_rate": 0.0002, + "loss": 1.2149, + "step": 5860 + }, + { + "epoch": 4.555684904928211, + "grad_norm": 1.0934417247772217, + "learning_rate": 0.0002, + "loss": 1.3165, + "step": 5870 + }, + { + "epoch": 4.563445867287544, + "grad_norm": 1.107987403869629, + "learning_rate": 0.0002, + "loss": 1.3059, + "step": 5880 + }, + { + "epoch": 4.571206829646876, + "grad_norm": 0.9147276878356934, + "learning_rate": 0.0002, + "loss": 1.2184, + "step": 5890 + }, + { + "epoch": 4.578967792006209, + "grad_norm": 1.036780595779419, + "learning_rate": 0.0002, + "loss": 1.24, + "step": 5900 + }, + { + "epoch": 4.5867287543655415, + "grad_norm": 0.9284719824790955, + "learning_rate": 0.0002, + "loss": 1.2209, + "step": 5910 + }, + { + "epoch": 4.5944897167248735, + "grad_norm": 0.9141898155212402, + "learning_rate": 0.0002, + "loss": 1.3693, + "step": 5920 + }, + { + "epoch": 4.602250679084206, + "grad_norm": 1.0447357892990112, + "learning_rate": 0.0002, + "loss": 1.2319, + "step": 5930 + }, + { + "epoch": 4.610011641443539, + "grad_norm": 0.9309114217758179, + "learning_rate": 0.0002, + "loss": 1.2667, + "step": 5940 + }, + { + "epoch": 4.617772603802871, + "grad_norm": 1.2986129522323608, + "learning_rate": 0.0002, + "loss": 1.2827, + "step": 5950 + }, + { + "epoch": 4.625533566162204, + "grad_norm": 0.9221704602241516, + "learning_rate": 0.0002, + "loss": 1.312, + "step": 5960 + }, + { + "epoch": 4.633294528521537, + "grad_norm": 0.9228187799453735, + "learning_rate": 0.0002, + "loss": 1.2769, + "step": 5970 + }, + { + "epoch": 4.641055490880869, + "grad_norm": 0.9483116269111633, + "learning_rate": 0.0002, + "loss": 1.2953, + "step": 5980 + }, + { + "epoch": 4.648816453240202, + "grad_norm": 1.0218974351882935, + "learning_rate": 0.0002, + "loss": 1.3437, + "step": 5990 + }, + { + "epoch": 4.656577415599534, + "grad_norm": 0.9764600396156311, + "learning_rate": 0.0002, + "loss": 1.3085, + "step": 6000 + }, + { + "epoch": 4.664338377958867, + "grad_norm": 0.9115710258483887, + "learning_rate": 0.0002, + "loss": 1.197, + "step": 6010 + }, + { + "epoch": 4.6720993403182, + "grad_norm": 0.9245651364326477, + "learning_rate": 0.0002, + "loss": 1.1917, + "step": 6020 + }, + { + "epoch": 4.6798603026775325, + "grad_norm": 0.9686311483383179, + "learning_rate": 0.0002, + "loss": 1.2969, + "step": 6030 + }, + { + "epoch": 4.6876212650368645, + "grad_norm": 1.1807392835617065, + "learning_rate": 0.0002, + "loss": 1.2702, + "step": 6040 + }, + { + "epoch": 4.695382227396197, + "grad_norm": 1.0358641147613525, + "learning_rate": 0.0002, + "loss": 1.328, + "step": 6050 + }, + { + "epoch": 4.703143189755529, + "grad_norm": 0.987332284450531, + "learning_rate": 0.0002, + "loss": 1.3281, + "step": 6060 + }, + { + "epoch": 4.710904152114862, + "grad_norm": 1.0526494979858398, + "learning_rate": 0.0002, + "loss": 1.2514, + "step": 6070 + }, + { + "epoch": 4.718665114474195, + "grad_norm": 1.0276758670806885, + "learning_rate": 0.0002, + "loss": 1.2246, + "step": 6080 + }, + { + "epoch": 4.726426076833527, + "grad_norm": 0.9904406666755676, + "learning_rate": 0.0002, + "loss": 1.3367, + "step": 6090 + }, + { + "epoch": 4.73418703919286, + "grad_norm": 1.0084882974624634, + "learning_rate": 0.0002, + "loss": 1.2797, + "step": 6100 + }, + { + "epoch": 4.741948001552192, + "grad_norm": 0.8646450638771057, + "learning_rate": 0.0002, + "loss": 1.2656, + "step": 6110 + }, + { + "epoch": 4.749708963911525, + "grad_norm": 0.9233377575874329, + "learning_rate": 0.0002, + "loss": 1.3063, + "step": 6120 + }, + { + "epoch": 4.757469926270858, + "grad_norm": 0.9675140976905823, + "learning_rate": 0.0002, + "loss": 1.2642, + "step": 6130 + }, + { + "epoch": 4.765230888630191, + "grad_norm": 0.9639796018600464, + "learning_rate": 0.0002, + "loss": 1.3367, + "step": 6140 + }, + { + "epoch": 4.772991850989523, + "grad_norm": 0.925199568271637, + "learning_rate": 0.0002, + "loss": 1.276, + "step": 6150 + }, + { + "epoch": 4.7807528133488555, + "grad_norm": 1.050901174545288, + "learning_rate": 0.0002, + "loss": 1.2441, + "step": 6160 + }, + { + "epoch": 4.7885137757081875, + "grad_norm": 0.8920623660087585, + "learning_rate": 0.0002, + "loss": 1.301, + "step": 6170 + }, + { + "epoch": 4.79627473806752, + "grad_norm": 0.8964757919311523, + "learning_rate": 0.0002, + "loss": 1.263, + "step": 6180 + }, + { + "epoch": 4.804035700426853, + "grad_norm": 1.0839070081710815, + "learning_rate": 0.0002, + "loss": 1.2787, + "step": 6190 + }, + { + "epoch": 4.811796662786185, + "grad_norm": 0.8809942007064819, + "learning_rate": 0.0002, + "loss": 1.2664, + "step": 6200 + }, + { + "epoch": 4.819557625145518, + "grad_norm": 1.0216195583343506, + "learning_rate": 0.0002, + "loss": 1.321, + "step": 6210 + }, + { + "epoch": 4.827318587504851, + "grad_norm": 0.892005980014801, + "learning_rate": 0.0002, + "loss": 1.3033, + "step": 6220 + }, + { + "epoch": 4.835079549864183, + "grad_norm": 0.9957166910171509, + "learning_rate": 0.0002, + "loss": 1.2602, + "step": 6230 + }, + { + "epoch": 4.842840512223516, + "grad_norm": 0.9720533490180969, + "learning_rate": 0.0002, + "loss": 1.3562, + "step": 6240 + }, + { + "epoch": 4.850601474582849, + "grad_norm": 0.9336182475090027, + "learning_rate": 0.0002, + "loss": 1.2651, + "step": 6250 + }, + { + "epoch": 4.858362436942181, + "grad_norm": 1.2611457109451294, + "learning_rate": 0.0002, + "loss": 1.3136, + "step": 6260 + }, + { + "epoch": 4.866123399301514, + "grad_norm": 0.8927203416824341, + "learning_rate": 0.0002, + "loss": 1.2234, + "step": 6270 + }, + { + "epoch": 4.873884361660846, + "grad_norm": 0.9706710577011108, + "learning_rate": 0.0002, + "loss": 1.3463, + "step": 6280 + }, + { + "epoch": 4.8816453240201785, + "grad_norm": 1.1461690664291382, + "learning_rate": 0.0002, + "loss": 1.3209, + "step": 6290 + }, + { + "epoch": 4.889406286379511, + "grad_norm": 0.9930381178855896, + "learning_rate": 0.0002, + "loss": 1.2566, + "step": 6300 + }, + { + "epoch": 4.897167248738843, + "grad_norm": 0.91451096534729, + "learning_rate": 0.0002, + "loss": 1.2568, + "step": 6310 + }, + { + "epoch": 4.904928211098176, + "grad_norm": 1.0319571495056152, + "learning_rate": 0.0002, + "loss": 1.2836, + "step": 6320 + }, + { + "epoch": 4.912689173457509, + "grad_norm": 0.990140438079834, + "learning_rate": 0.0002, + "loss": 1.2908, + "step": 6330 + }, + { + "epoch": 4.920450135816841, + "grad_norm": 1.2466117143630981, + "learning_rate": 0.0002, + "loss": 1.3299, + "step": 6340 + }, + { + "epoch": 4.928211098176174, + "grad_norm": 1.0316979885101318, + "learning_rate": 0.0002, + "loss": 1.2659, + "step": 6350 + }, + { + "epoch": 4.935972060535507, + "grad_norm": 1.0643759965896606, + "learning_rate": 0.0002, + "loss": 1.3292, + "step": 6360 + }, + { + "epoch": 4.943733022894839, + "grad_norm": 0.9703279733657837, + "learning_rate": 0.0002, + "loss": 1.2559, + "step": 6370 + }, + { + "epoch": 4.951493985254172, + "grad_norm": 0.9767927527427673, + "learning_rate": 0.0002, + "loss": 1.2155, + "step": 6380 + }, + { + "epoch": 4.959254947613504, + "grad_norm": 0.960854172706604, + "learning_rate": 0.0002, + "loss": 1.2437, + "step": 6390 + }, + { + "epoch": 4.967015909972837, + "grad_norm": 0.9922910332679749, + "learning_rate": 0.0002, + "loss": 1.3314, + "step": 6400 + }, + { + "epoch": 4.9747768723321695, + "grad_norm": 0.956470787525177, + "learning_rate": 0.0002, + "loss": 1.3018, + "step": 6410 + }, + { + "epoch": 4.9825378346915015, + "grad_norm": 0.9637242555618286, + "learning_rate": 0.0002, + "loss": 1.2794, + "step": 6420 + }, + { + "epoch": 4.990298797050834, + "grad_norm": 1.0855202674865723, + "learning_rate": 0.0002, + "loss": 1.3236, + "step": 6430 + }, + { + "epoch": 4.998059759410167, + "grad_norm": 0.9655316472053528, + "learning_rate": 0.0002, + "loss": 1.3015, + "step": 6440 + }, + { + "epoch": 4.9996119518820334, + "eval_loss": 2.0410802364349365, + "eval_runtime": 113.04, + "eval_samples_per_second": 4.485, + "eval_steps_per_second": 0.566, + "step": 6442 + } + ], + "logging_steps": 10, + "max_steps": 10304, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.309109312094208e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..8ca6e2c3ac58fa2af9f99747566f932f41a5a4d5 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-6442/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f7feb06ff53d5bf79374054a25b662309e705a2ca08dfa3b0bce7b8b4632fae +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e98db163734cc03f7a8f8b3f720d3a2befdf7453 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..581ba80957e863c91c09289d332dfb5bd46a2cf2 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79a559981a68c2cfc898167d3bfe19ceee9c9b2fa121024a67535beb73418906 +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e82a3732717d575e777685b34150335617cf6e7 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7abbfd795613bb333de7a6ae2fcb4c78425f14d5e88acafbdbd037807c472e40 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d0af753f1792b5cc65461c7784b669deedf3fe78 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b77f041a06d5be44bc9d0380f05f012100f3636dcdba5d24755612144d65a1a +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ceb65a5bd14c886ea05ef379f95cb3f922456a34 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2547863cfabcec9c6cd9f2b203f3c8a6cc62dc0289e47e5524ad0ba794730015 +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..a4a305d1de4d8f47c0252b4d7fe65a10dd8e2c22 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7eee611703c5ce5d1eee32d9cdcfe465647b8aff0c1dfb3bed7ad7dbb05060 +size 34362873 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9796c0359b3c23156ed00fa8a9325539c4a7ebec --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/trainer_state.json @@ -0,0 +1,5492 @@ +{ + "best_metric": 1.8068748712539673, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 7731, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007760962359332557, + "grad_norm": 1.0751162767410278, + "learning_rate": 0.0002, + "loss": 3.0855, + "step": 10 + }, + { + "epoch": 0.015521924718665115, + "grad_norm": 0.4697345793247223, + "learning_rate": 0.0002, + "loss": 2.4744, + "step": 20 + }, + { + "epoch": 0.023282887077997673, + "grad_norm": 0.5370839238166809, + "learning_rate": 0.0002, + "loss": 2.193, + "step": 30 + }, + { + "epoch": 0.03104384943733023, + "grad_norm": 0.46794816851615906, + "learning_rate": 0.0002, + "loss": 2.0599, + "step": 40 + }, + { + "epoch": 0.038804811796662786, + "grad_norm": 0.44624820351600647, + "learning_rate": 0.0002, + "loss": 1.9354, + "step": 50 + }, + { + "epoch": 0.046565774155995346, + "grad_norm": 0.3953201472759247, + "learning_rate": 0.0002, + "loss": 1.9319, + "step": 60 + }, + { + "epoch": 0.0543267365153279, + "grad_norm": 0.3935912549495697, + "learning_rate": 0.0002, + "loss": 1.9099, + "step": 70 + }, + { + "epoch": 0.06208769887466046, + "grad_norm": 0.4520699381828308, + "learning_rate": 0.0002, + "loss": 1.8795, + "step": 80 + }, + { + "epoch": 0.06984866123399301, + "grad_norm": 0.3801847994327545, + "learning_rate": 0.0002, + "loss": 1.8354, + "step": 90 + }, + { + "epoch": 0.07760962359332557, + "grad_norm": 0.4020165205001831, + "learning_rate": 0.0002, + "loss": 1.9053, + "step": 100 + }, + { + "epoch": 0.08537058595265813, + "grad_norm": 0.3860672116279602, + "learning_rate": 0.0002, + "loss": 1.8779, + "step": 110 + }, + { + "epoch": 0.09313154831199069, + "grad_norm": 0.3681113123893738, + "learning_rate": 0.0002, + "loss": 1.8731, + "step": 120 + }, + { + "epoch": 0.10089251067132324, + "grad_norm": 0.3594866991043091, + "learning_rate": 0.0002, + "loss": 1.8157, + "step": 130 + }, + { + "epoch": 0.1086534730306558, + "grad_norm": 0.3879193663597107, + "learning_rate": 0.0002, + "loss": 1.8266, + "step": 140 + }, + { + "epoch": 0.11641443538998836, + "grad_norm": 0.3270505666732788, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 150 + }, + { + "epoch": 0.12417539774932092, + "grad_norm": 0.36824458837509155, + "learning_rate": 0.0002, + "loss": 1.87, + "step": 160 + }, + { + "epoch": 0.13193636010865348, + "grad_norm": 0.383882075548172, + "learning_rate": 0.0002, + "loss": 1.8305, + "step": 170 + }, + { + "epoch": 0.13969732246798602, + "grad_norm": 0.3368665874004364, + "learning_rate": 0.0002, + "loss": 1.8584, + "step": 180 + }, + { + "epoch": 0.1474582848273186, + "grad_norm": 0.35961097478866577, + "learning_rate": 0.0002, + "loss": 1.7882, + "step": 190 + }, + { + "epoch": 0.15521924718665114, + "grad_norm": 0.3415963351726532, + "learning_rate": 0.0002, + "loss": 1.8467, + "step": 200 + }, + { + "epoch": 0.1629802095459837, + "grad_norm": 0.4100632071495056, + "learning_rate": 0.0002, + "loss": 1.8543, + "step": 210 + }, + { + "epoch": 0.17074117190531626, + "grad_norm": 0.3516307473182678, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 220 + }, + { + "epoch": 0.1785021342646488, + "grad_norm": 0.37919050455093384, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 230 + }, + { + "epoch": 0.18626309662398138, + "grad_norm": 0.33270683884620667, + "learning_rate": 0.0002, + "loss": 1.7937, + "step": 240 + }, + { + "epoch": 0.19402405898331393, + "grad_norm": 0.3348783254623413, + "learning_rate": 0.0002, + "loss": 1.7925, + "step": 250 + }, + { + "epoch": 0.20178502134264648, + "grad_norm": 0.3888475298881531, + "learning_rate": 0.0002, + "loss": 1.7774, + "step": 260 + }, + { + "epoch": 0.20954598370197905, + "grad_norm": 0.3554602861404419, + "learning_rate": 0.0002, + "loss": 1.8381, + "step": 270 + }, + { + "epoch": 0.2173069460613116, + "grad_norm": 0.33277708292007446, + "learning_rate": 0.0002, + "loss": 1.8359, + "step": 280 + }, + { + "epoch": 0.22506790842064417, + "grad_norm": 0.3281584680080414, + "learning_rate": 0.0002, + "loss": 1.7713, + "step": 290 + }, + { + "epoch": 0.23282887077997672, + "grad_norm": 0.3185969591140747, + "learning_rate": 0.0002, + "loss": 1.8181, + "step": 300 + }, + { + "epoch": 0.24058983313930926, + "grad_norm": 0.35335442423820496, + "learning_rate": 0.0002, + "loss": 1.8595, + "step": 310 + }, + { + "epoch": 0.24835079549864184, + "grad_norm": 0.3119595944881439, + "learning_rate": 0.0002, + "loss": 1.87, + "step": 320 + }, + { + "epoch": 0.2561117578579744, + "grad_norm": 0.36424458026885986, + "learning_rate": 0.0002, + "loss": 1.8357, + "step": 330 + }, + { + "epoch": 0.26387272021730696, + "grad_norm": 0.3618951141834259, + "learning_rate": 0.0002, + "loss": 1.8003, + "step": 340 + }, + { + "epoch": 0.2716336825766395, + "grad_norm": 0.312757670879364, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 350 + }, + { + "epoch": 0.27939464493597205, + "grad_norm": 0.326016366481781, + "learning_rate": 0.0002, + "loss": 1.9031, + "step": 360 + }, + { + "epoch": 0.2871556072953046, + "grad_norm": 0.34093883633613586, + "learning_rate": 0.0002, + "loss": 1.8214, + "step": 370 + }, + { + "epoch": 0.2949165696546372, + "grad_norm": 0.32325029373168945, + "learning_rate": 0.0002, + "loss": 1.7733, + "step": 380 + }, + { + "epoch": 0.30267753201396974, + "grad_norm": 0.34105437994003296, + "learning_rate": 0.0002, + "loss": 1.842, + "step": 390 + }, + { + "epoch": 0.3104384943733023, + "grad_norm": 0.32565295696258545, + "learning_rate": 0.0002, + "loss": 1.7926, + "step": 400 + }, + { + "epoch": 0.31819945673263483, + "grad_norm": 0.32742050290107727, + "learning_rate": 0.0002, + "loss": 1.8031, + "step": 410 + }, + { + "epoch": 0.3259604190919674, + "grad_norm": 0.30233046412467957, + "learning_rate": 0.0002, + "loss": 1.907, + "step": 420 + }, + { + "epoch": 0.3337213814513, + "grad_norm": 0.32419222593307495, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 430 + }, + { + "epoch": 0.3414823438106325, + "grad_norm": 0.3653007745742798, + "learning_rate": 0.0002, + "loss": 1.865, + "step": 440 + }, + { + "epoch": 0.3492433061699651, + "grad_norm": 0.31617099046707153, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 450 + }, + { + "epoch": 0.3570042685292976, + "grad_norm": 0.3305962085723877, + "learning_rate": 0.0002, + "loss": 1.7677, + "step": 460 + }, + { + "epoch": 0.36476523088863017, + "grad_norm": 0.3178933262825012, + "learning_rate": 0.0002, + "loss": 1.8155, + "step": 470 + }, + { + "epoch": 0.37252619324796277, + "grad_norm": 0.37163782119750977, + "learning_rate": 0.0002, + "loss": 1.7485, + "step": 480 + }, + { + "epoch": 0.3802871556072953, + "grad_norm": 0.469844788312912, + "learning_rate": 0.0002, + "loss": 1.8804, + "step": 490 + }, + { + "epoch": 0.38804811796662786, + "grad_norm": 0.3409338593482971, + "learning_rate": 0.0002, + "loss": 1.8343, + "step": 500 + }, + { + "epoch": 0.3958090803259604, + "grad_norm": 0.31943467259407043, + "learning_rate": 0.0002, + "loss": 1.8433, + "step": 510 + }, + { + "epoch": 0.40357004268529295, + "grad_norm": 0.32293614745140076, + "learning_rate": 0.0002, + "loss": 1.7873, + "step": 520 + }, + { + "epoch": 0.41133100504462555, + "grad_norm": 0.2994382977485657, + "learning_rate": 0.0002, + "loss": 1.8584, + "step": 530 + }, + { + "epoch": 0.4190919674039581, + "grad_norm": 0.3273141384124756, + "learning_rate": 0.0002, + "loss": 1.8153, + "step": 540 + }, + { + "epoch": 0.42685292976329064, + "grad_norm": 0.3020550012588501, + "learning_rate": 0.0002, + "loss": 1.8097, + "step": 550 + }, + { + "epoch": 0.4346138921226232, + "grad_norm": 0.30113112926483154, + "learning_rate": 0.0002, + "loss": 1.8551, + "step": 560 + }, + { + "epoch": 0.44237485448195574, + "grad_norm": 0.30274903774261475, + "learning_rate": 0.0002, + "loss": 1.8084, + "step": 570 + }, + { + "epoch": 0.45013581684128834, + "grad_norm": 0.3231128454208374, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 580 + }, + { + "epoch": 0.4578967792006209, + "grad_norm": 0.3255121409893036, + "learning_rate": 0.0002, + "loss": 1.7848, + "step": 590 + }, + { + "epoch": 0.46565774155995343, + "grad_norm": 0.30147507786750793, + "learning_rate": 0.0002, + "loss": 1.8227, + "step": 600 + }, + { + "epoch": 0.473418703919286, + "grad_norm": 0.29781386256217957, + "learning_rate": 0.0002, + "loss": 1.7572, + "step": 610 + }, + { + "epoch": 0.4811796662786185, + "grad_norm": 0.30914685130119324, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 620 + }, + { + "epoch": 0.4889406286379511, + "grad_norm": 0.3110593855381012, + "learning_rate": 0.0002, + "loss": 1.805, + "step": 630 + }, + { + "epoch": 0.49670159099728367, + "grad_norm": 0.3298132121562958, + "learning_rate": 0.0002, + "loss": 1.8228, + "step": 640 + }, + { + "epoch": 0.5044625533566163, + "grad_norm": 0.322122186422348, + "learning_rate": 0.0002, + "loss": 1.7816, + "step": 650 + }, + { + "epoch": 0.5122235157159488, + "grad_norm": 0.3504371643066406, + "learning_rate": 0.0002, + "loss": 1.8001, + "step": 660 + }, + { + "epoch": 0.5199844780752814, + "grad_norm": 0.3102182149887085, + "learning_rate": 0.0002, + "loss": 1.8682, + "step": 670 + }, + { + "epoch": 0.5277454404346139, + "grad_norm": 0.6113658547401428, + "learning_rate": 0.0002, + "loss": 1.7494, + "step": 680 + }, + { + "epoch": 0.5355064027939465, + "grad_norm": 0.31841862201690674, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 690 + }, + { + "epoch": 0.543267365153279, + "grad_norm": 0.2830526530742645, + "learning_rate": 0.0002, + "loss": 1.7587, + "step": 700 + }, + { + "epoch": 0.5510283275126115, + "grad_norm": 0.3048769533634186, + "learning_rate": 0.0002, + "loss": 1.7887, + "step": 710 + }, + { + "epoch": 0.5587892898719441, + "grad_norm": 0.2719033658504486, + "learning_rate": 0.0002, + "loss": 1.8416, + "step": 720 + }, + { + "epoch": 0.5665502522312766, + "grad_norm": 0.3176722526550293, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 730 + }, + { + "epoch": 0.5743112145906092, + "grad_norm": 0.32491734623908997, + "learning_rate": 0.0002, + "loss": 1.7127, + "step": 740 + }, + { + "epoch": 0.5820721769499418, + "grad_norm": 0.32746851444244385, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 750 + }, + { + "epoch": 0.5898331393092744, + "grad_norm": 0.3055773973464966, + "learning_rate": 0.0002, + "loss": 1.7811, + "step": 760 + }, + { + "epoch": 0.5975941016686069, + "grad_norm": 0.30671584606170654, + "learning_rate": 0.0002, + "loss": 1.8597, + "step": 770 + }, + { + "epoch": 0.6053550640279395, + "grad_norm": 0.28770264983177185, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 780 + }, + { + "epoch": 0.613116026387272, + "grad_norm": 0.2814285457134247, + "learning_rate": 0.0002, + "loss": 1.7025, + "step": 790 + }, + { + "epoch": 0.6208769887466046, + "grad_norm": 0.31554412841796875, + "learning_rate": 0.0002, + "loss": 1.819, + "step": 800 + }, + { + "epoch": 0.6286379511059371, + "grad_norm": 0.2984226942062378, + "learning_rate": 0.0002, + "loss": 1.8335, + "step": 810 + }, + { + "epoch": 0.6363989134652697, + "grad_norm": 0.2859906554222107, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 820 + }, + { + "epoch": 0.6441598758246022, + "grad_norm": 0.2887928783893585, + "learning_rate": 0.0002, + "loss": 1.7408, + "step": 830 + }, + { + "epoch": 0.6519208381839348, + "grad_norm": 0.31287339329719543, + "learning_rate": 0.0002, + "loss": 1.8071, + "step": 840 + }, + { + "epoch": 0.6596818005432674, + "grad_norm": 0.32064181566238403, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 850 + }, + { + "epoch": 0.6674427629026, + "grad_norm": 0.290981650352478, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 860 + }, + { + "epoch": 0.6752037252619325, + "grad_norm": 0.33060121536254883, + "learning_rate": 0.0002, + "loss": 1.8089, + "step": 870 + }, + { + "epoch": 0.682964687621265, + "grad_norm": 0.27032899856567383, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 880 + }, + { + "epoch": 0.6907256499805976, + "grad_norm": 0.29031234979629517, + "learning_rate": 0.0002, + "loss": 1.8423, + "step": 890 + }, + { + "epoch": 0.6984866123399301, + "grad_norm": 0.2845142185688019, + "learning_rate": 0.0002, + "loss": 1.835, + "step": 900 + }, + { + "epoch": 0.7062475746992627, + "grad_norm": 0.8638312816619873, + "learning_rate": 0.0002, + "loss": 1.77, + "step": 910 + }, + { + "epoch": 0.7140085370585952, + "grad_norm": 0.3086668848991394, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 920 + }, + { + "epoch": 0.7217694994179278, + "grad_norm": 0.2724177837371826, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 930 + }, + { + "epoch": 0.7295304617772603, + "grad_norm": 0.289559006690979, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 940 + }, + { + "epoch": 0.737291424136593, + "grad_norm": 0.3000658452510834, + "learning_rate": 0.0002, + "loss": 1.7654, + "step": 950 + }, + { + "epoch": 0.7450523864959255, + "grad_norm": 0.33544042706489563, + "learning_rate": 0.0002, + "loss": 1.7736, + "step": 960 + }, + { + "epoch": 0.7528133488552581, + "grad_norm": 0.28593236207962036, + "learning_rate": 0.0002, + "loss": 1.6979, + "step": 970 + }, + { + "epoch": 0.7605743112145906, + "grad_norm": 0.313634991645813, + "learning_rate": 0.0002, + "loss": 1.8583, + "step": 980 + }, + { + "epoch": 0.7683352735739232, + "grad_norm": 0.2949385941028595, + "learning_rate": 0.0002, + "loss": 1.7473, + "step": 990 + }, + { + "epoch": 0.7760962359332557, + "grad_norm": 0.2920108437538147, + "learning_rate": 0.0002, + "loss": 1.8689, + "step": 1000 + }, + { + "epoch": 0.7838571982925883, + "grad_norm": 0.3245100677013397, + "learning_rate": 0.0002, + "loss": 1.8401, + "step": 1010 + }, + { + "epoch": 0.7916181606519208, + "grad_norm": 0.3007619380950928, + "learning_rate": 0.0002, + "loss": 1.7109, + "step": 1020 + }, + { + "epoch": 0.7993791230112534, + "grad_norm": 0.3630852997303009, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 1030 + }, + { + "epoch": 0.8071400853705859, + "grad_norm": 0.2856379747390747, + "learning_rate": 0.0002, + "loss": 1.7655, + "step": 1040 + }, + { + "epoch": 0.8149010477299186, + "grad_norm": 0.32476478815078735, + "learning_rate": 0.0002, + "loss": 1.8371, + "step": 1050 + }, + { + "epoch": 0.8226620100892511, + "grad_norm": 0.5162565112113953, + "learning_rate": 0.0002, + "loss": 1.8039, + "step": 1060 + }, + { + "epoch": 0.8304229724485837, + "grad_norm": 0.316496342420578, + "learning_rate": 0.0002, + "loss": 1.8862, + "step": 1070 + }, + { + "epoch": 0.8381839348079162, + "grad_norm": 0.31977516412734985, + "learning_rate": 0.0002, + "loss": 1.8023, + "step": 1080 + }, + { + "epoch": 0.8459448971672487, + "grad_norm": 0.269509494304657, + "learning_rate": 0.0002, + "loss": 1.8547, + "step": 1090 + }, + { + "epoch": 0.8537058595265813, + "grad_norm": 0.31621453166007996, + "learning_rate": 0.0002, + "loss": 1.7811, + "step": 1100 + }, + { + "epoch": 0.8614668218859138, + "grad_norm": 0.2946535050868988, + "learning_rate": 0.0002, + "loss": 1.739, + "step": 1110 + }, + { + "epoch": 0.8692277842452464, + "grad_norm": 0.3088909983634949, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 1120 + }, + { + "epoch": 0.8769887466045789, + "grad_norm": 0.33033716678619385, + "learning_rate": 0.0002, + "loss": 1.8228, + "step": 1130 + }, + { + "epoch": 0.8847497089639115, + "grad_norm": 0.2954833507537842, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 1140 + }, + { + "epoch": 0.8925106713232441, + "grad_norm": 0.2950248122215271, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 1150 + }, + { + "epoch": 0.9002716336825767, + "grad_norm": 0.296661913394928, + "learning_rate": 0.0002, + "loss": 1.7068, + "step": 1160 + }, + { + "epoch": 0.9080325960419092, + "grad_norm": 0.35451310873031616, + "learning_rate": 0.0002, + "loss": 1.7967, + "step": 1170 + }, + { + "epoch": 0.9157935584012418, + "grad_norm": 0.32705947756767273, + "learning_rate": 0.0002, + "loss": 1.8202, + "step": 1180 + }, + { + "epoch": 0.9235545207605743, + "grad_norm": 0.3333960771560669, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 1190 + }, + { + "epoch": 0.9313154831199069, + "grad_norm": 0.3042232096195221, + "learning_rate": 0.0002, + "loss": 1.7801, + "step": 1200 + }, + { + "epoch": 0.9390764454792394, + "grad_norm": 0.281553715467453, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 1210 + }, + { + "epoch": 0.946837407838572, + "grad_norm": 0.3096391558647156, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 1220 + }, + { + "epoch": 0.9545983701979045, + "grad_norm": 0.2866271734237671, + "learning_rate": 0.0002, + "loss": 1.7401, + "step": 1230 + }, + { + "epoch": 0.962359332557237, + "grad_norm": 0.28394097089767456, + "learning_rate": 0.0002, + "loss": 1.7211, + "step": 1240 + }, + { + "epoch": 0.9701202949165697, + "grad_norm": 0.3249266743659973, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 1250 + }, + { + "epoch": 0.9778812572759022, + "grad_norm": 0.2896869480609894, + "learning_rate": 0.0002, + "loss": 1.7563, + "step": 1260 + }, + { + "epoch": 0.9856422196352348, + "grad_norm": 0.29224586486816406, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 1270 + }, + { + "epoch": 0.9934031819945673, + "grad_norm": 0.2820223569869995, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 1280 + }, + { + "epoch": 0.9996119518820333, + "eval_loss": 1.8081045150756836, + "eval_runtime": 102.3056, + "eval_samples_per_second": 4.956, + "eval_steps_per_second": 0.626, + "step": 1288 + }, + { + "epoch": 1.0011641443538999, + "grad_norm": 0.3282551169395447, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 1290 + }, + { + "epoch": 1.0089251067132325, + "grad_norm": 0.30217495560646057, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 1300 + }, + { + "epoch": 1.016686069072565, + "grad_norm": 0.30801767110824585, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 1310 + }, + { + "epoch": 1.0244470314318976, + "grad_norm": 0.31816792488098145, + "learning_rate": 0.0002, + "loss": 1.7756, + "step": 1320 + }, + { + "epoch": 1.03220799379123, + "grad_norm": 0.27794334292411804, + "learning_rate": 0.0002, + "loss": 1.6986, + "step": 1330 + }, + { + "epoch": 1.0399689561505627, + "grad_norm": 0.3018926680088043, + "learning_rate": 0.0002, + "loss": 1.6931, + "step": 1340 + }, + { + "epoch": 1.0477299185098952, + "grad_norm": 0.3552975356578827, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1350 + }, + { + "epoch": 1.0554908808692278, + "grad_norm": 0.32590144872665405, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 1360 + }, + { + "epoch": 1.0632518432285603, + "grad_norm": 0.3435460925102234, + "learning_rate": 0.0002, + "loss": 1.6479, + "step": 1370 + }, + { + "epoch": 1.071012805587893, + "grad_norm": 0.35037797689437866, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 1380 + }, + { + "epoch": 1.0787737679472253, + "grad_norm": 0.31398263573646545, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 1390 + }, + { + "epoch": 1.086534730306558, + "grad_norm": 0.3134010434150696, + "learning_rate": 0.0002, + "loss": 1.6729, + "step": 1400 + }, + { + "epoch": 1.0942956926658907, + "grad_norm": 0.4599704444408417, + "learning_rate": 0.0002, + "loss": 1.751, + "step": 1410 + }, + { + "epoch": 1.102056655025223, + "grad_norm": 0.35852891206741333, + "learning_rate": 0.0002, + "loss": 1.6871, + "step": 1420 + }, + { + "epoch": 1.1098176173845558, + "grad_norm": 0.35628634691238403, + "learning_rate": 0.0002, + "loss": 1.7083, + "step": 1430 + }, + { + "epoch": 1.1175785797438882, + "grad_norm": 0.3769161105155945, + "learning_rate": 0.0002, + "loss": 1.6166, + "step": 1440 + }, + { + "epoch": 1.1253395421032208, + "grad_norm": 1.3712416887283325, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1450 + }, + { + "epoch": 1.1331005044625533, + "grad_norm": 0.38406670093536377, + "learning_rate": 0.0002, + "loss": 1.6542, + "step": 1460 + }, + { + "epoch": 1.140861466821886, + "grad_norm": 0.3402116000652313, + "learning_rate": 0.0002, + "loss": 1.7104, + "step": 1470 + }, + { + "epoch": 1.1486224291812184, + "grad_norm": 0.341189444065094, + "learning_rate": 0.0002, + "loss": 1.7074, + "step": 1480 + }, + { + "epoch": 1.156383391540551, + "grad_norm": 0.36629995703697205, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 1490 + }, + { + "epoch": 1.1641443538998835, + "grad_norm": 0.3499569296836853, + "learning_rate": 0.0002, + "loss": 1.6952, + "step": 1500 + }, + { + "epoch": 1.1719053162592161, + "grad_norm": 0.3663063943386078, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 1510 + }, + { + "epoch": 1.1796662786185488, + "grad_norm": 0.34851500391960144, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 1520 + }, + { + "epoch": 1.1874272409778812, + "grad_norm": 0.35071656107902527, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 1530 + }, + { + "epoch": 1.1951882033372139, + "grad_norm": 0.42783796787261963, + "learning_rate": 0.0002, + "loss": 1.7206, + "step": 1540 + }, + { + "epoch": 1.2029491656965463, + "grad_norm": 0.31830692291259766, + "learning_rate": 0.0002, + "loss": 1.7499, + "step": 1550 + }, + { + "epoch": 1.210710128055879, + "grad_norm": 0.3597424626350403, + "learning_rate": 0.0002, + "loss": 1.7372, + "step": 1560 + }, + { + "epoch": 1.2184710904152114, + "grad_norm": 0.35233765840530396, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 1570 + }, + { + "epoch": 1.226232052774544, + "grad_norm": 0.35942912101745605, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 1580 + }, + { + "epoch": 1.2339930151338767, + "grad_norm": 0.36159393191337585, + "learning_rate": 0.0002, + "loss": 1.6598, + "step": 1590 + }, + { + "epoch": 1.2417539774932091, + "grad_norm": 0.3328469693660736, + "learning_rate": 0.0002, + "loss": 1.6697, + "step": 1600 + }, + { + "epoch": 1.2495149398525418, + "grad_norm": 0.3089476525783539, + "learning_rate": 0.0002, + "loss": 1.7594, + "step": 1610 + }, + { + "epoch": 1.2572759022118742, + "grad_norm": 0.30947765707969666, + "learning_rate": 0.0002, + "loss": 1.6805, + "step": 1620 + }, + { + "epoch": 1.265036864571207, + "grad_norm": 0.32154011726379395, + "learning_rate": 0.0002, + "loss": 1.6899, + "step": 1630 + }, + { + "epoch": 1.2727978269305393, + "grad_norm": 0.3480297923088074, + "learning_rate": 0.0002, + "loss": 1.6621, + "step": 1640 + }, + { + "epoch": 1.280558789289872, + "grad_norm": 0.39471694827079773, + "learning_rate": 0.0002, + "loss": 1.7087, + "step": 1650 + }, + { + "epoch": 1.2883197516492044, + "grad_norm": 0.35728853940963745, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 1660 + }, + { + "epoch": 1.296080714008537, + "grad_norm": 0.35223081707954407, + "learning_rate": 0.0002, + "loss": 1.7008, + "step": 1670 + }, + { + "epoch": 1.3038416763678695, + "grad_norm": 0.3588867485523224, + "learning_rate": 0.0002, + "loss": 1.7253, + "step": 1680 + }, + { + "epoch": 1.3116026387272022, + "grad_norm": 0.3528042733669281, + "learning_rate": 0.0002, + "loss": 1.6505, + "step": 1690 + }, + { + "epoch": 1.3193636010865348, + "grad_norm": 0.35975801944732666, + "learning_rate": 0.0002, + "loss": 1.6945, + "step": 1700 + }, + { + "epoch": 1.3271245634458673, + "grad_norm": 0.36691880226135254, + "learning_rate": 0.0002, + "loss": 1.6631, + "step": 1710 + }, + { + "epoch": 1.3348855258052, + "grad_norm": 0.3787977695465088, + "learning_rate": 0.0002, + "loss": 1.7593, + "step": 1720 + }, + { + "epoch": 1.3426464881645324, + "grad_norm": 0.36614933609962463, + "learning_rate": 0.0002, + "loss": 1.7697, + "step": 1730 + }, + { + "epoch": 1.350407450523865, + "grad_norm": 0.3484745919704437, + "learning_rate": 0.0002, + "loss": 1.6487, + "step": 1740 + }, + { + "epoch": 1.3581684128831975, + "grad_norm": 0.36905673146247864, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 1750 + }, + { + "epoch": 1.36592937524253, + "grad_norm": 0.41564738750457764, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 1760 + }, + { + "epoch": 1.3736903376018628, + "grad_norm": 0.3345205783843994, + "learning_rate": 0.0002, + "loss": 1.6634, + "step": 1770 + }, + { + "epoch": 1.3814512999611952, + "grad_norm": 0.34926071763038635, + "learning_rate": 0.0002, + "loss": 1.7275, + "step": 1780 + }, + { + "epoch": 1.3892122623205276, + "grad_norm": 0.42004233598709106, + "learning_rate": 0.0002, + "loss": 1.685, + "step": 1790 + }, + { + "epoch": 1.3969732246798603, + "grad_norm": 0.3576236963272095, + "learning_rate": 0.0002, + "loss": 1.666, + "step": 1800 + }, + { + "epoch": 1.404734187039193, + "grad_norm": 0.3586704432964325, + "learning_rate": 0.0002, + "loss": 1.8516, + "step": 1810 + }, + { + "epoch": 1.4124951493985254, + "grad_norm": 0.3943439722061157, + "learning_rate": 0.0002, + "loss": 1.6171, + "step": 1820 + }, + { + "epoch": 1.420256111757858, + "grad_norm": 0.3484877049922943, + "learning_rate": 0.0002, + "loss": 1.6865, + "step": 1830 + }, + { + "epoch": 1.4280170741171905, + "grad_norm": 0.3344518840312958, + "learning_rate": 0.0002, + "loss": 1.7205, + "step": 1840 + }, + { + "epoch": 1.4357780364765231, + "grad_norm": 0.4345698356628418, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1850 + }, + { + "epoch": 1.4435389988358556, + "grad_norm": 0.5525162220001221, + "learning_rate": 0.0002, + "loss": 1.6855, + "step": 1860 + }, + { + "epoch": 1.4512999611951882, + "grad_norm": 0.37194496393203735, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1870 + }, + { + "epoch": 1.4590609235545209, + "grad_norm": 0.34570157527923584, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 1880 + }, + { + "epoch": 1.4668218859138533, + "grad_norm": 0.3512282073497772, + "learning_rate": 0.0002, + "loss": 1.7, + "step": 1890 + }, + { + "epoch": 1.4745828482731858, + "grad_norm": 0.3443922996520996, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1900 + }, + { + "epoch": 1.4823438106325184, + "grad_norm": 0.3812018036842346, + "learning_rate": 0.0002, + "loss": 1.7393, + "step": 1910 + }, + { + "epoch": 1.490104772991851, + "grad_norm": 0.39263492822647095, + "learning_rate": 0.0002, + "loss": 1.7277, + "step": 1920 + }, + { + "epoch": 1.4978657353511835, + "grad_norm": 0.3146156072616577, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 1930 + }, + { + "epoch": 1.505626697710516, + "grad_norm": 0.3653988540172577, + "learning_rate": 0.0002, + "loss": 1.6881, + "step": 1940 + }, + { + "epoch": 1.5133876600698488, + "grad_norm": 0.3966596722602844, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 1950 + }, + { + "epoch": 1.5211486224291813, + "grad_norm": 0.3441697359085083, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 1960 + }, + { + "epoch": 1.5289095847885137, + "grad_norm": 0.3328564465045929, + "learning_rate": 0.0002, + "loss": 1.7175, + "step": 1970 + }, + { + "epoch": 1.5366705471478463, + "grad_norm": 0.34068772196769714, + "learning_rate": 0.0002, + "loss": 1.7394, + "step": 1980 + }, + { + "epoch": 1.544431509507179, + "grad_norm": 0.3559795916080475, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 1990 + }, + { + "epoch": 1.5521924718665114, + "grad_norm": 0.37888768315315247, + "learning_rate": 0.0002, + "loss": 1.7102, + "step": 2000 + }, + { + "epoch": 1.5599534342258439, + "grad_norm": 0.36128363013267517, + "learning_rate": 0.0002, + "loss": 1.7094, + "step": 2010 + }, + { + "epoch": 1.5677143965851765, + "grad_norm": 0.3643714487552643, + "learning_rate": 0.0002, + "loss": 1.6407, + "step": 2020 + }, + { + "epoch": 1.5754753589445092, + "grad_norm": 0.3863612115383148, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 2030 + }, + { + "epoch": 1.5832363213038416, + "grad_norm": 0.32831457257270813, + "learning_rate": 0.0002, + "loss": 1.6575, + "step": 2040 + }, + { + "epoch": 1.5909972836631743, + "grad_norm": 0.36098113656044006, + "learning_rate": 0.0002, + "loss": 1.7404, + "step": 2050 + }, + { + "epoch": 1.598758246022507, + "grad_norm": 1.1079334020614624, + "learning_rate": 0.0002, + "loss": 1.7065, + "step": 2060 + }, + { + "epoch": 1.6065192083818394, + "grad_norm": 0.35615381598472595, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 2070 + }, + { + "epoch": 1.6142801707411718, + "grad_norm": 0.369711309671402, + "learning_rate": 0.0002, + "loss": 1.7262, + "step": 2080 + }, + { + "epoch": 1.6220411331005045, + "grad_norm": 0.390658438205719, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 2090 + }, + { + "epoch": 1.6298020954598371, + "grad_norm": 0.3422999382019043, + "learning_rate": 0.0002, + "loss": 1.6996, + "step": 2100 + }, + { + "epoch": 1.6375630578191696, + "grad_norm": 0.372475266456604, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 2110 + }, + { + "epoch": 1.645324020178502, + "grad_norm": 0.35660576820373535, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 2120 + }, + { + "epoch": 1.6530849825378346, + "grad_norm": 0.35754942893981934, + "learning_rate": 0.0002, + "loss": 1.6991, + "step": 2130 + }, + { + "epoch": 1.6608459448971673, + "grad_norm": 0.34572410583496094, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 2140 + }, + { + "epoch": 1.6686069072564997, + "grad_norm": 0.42059701681137085, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 2150 + }, + { + "epoch": 1.6763678696158324, + "grad_norm": 0.35200759768486023, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 2160 + }, + { + "epoch": 1.684128831975165, + "grad_norm": 0.3704029321670532, + "learning_rate": 0.0002, + "loss": 1.6869, + "step": 2170 + }, + { + "epoch": 1.6918897943344975, + "grad_norm": 0.40450501441955566, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 2180 + }, + { + "epoch": 1.69965075669383, + "grad_norm": 0.362966924905777, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 2190 + }, + { + "epoch": 1.7074117190531626, + "grad_norm": 0.36586204171180725, + "learning_rate": 0.0002, + "loss": 1.6935, + "step": 2200 + }, + { + "epoch": 1.7151726814124952, + "grad_norm": 0.3295372426509857, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 2210 + }, + { + "epoch": 1.7229336437718277, + "grad_norm": 0.3892575800418854, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 2220 + }, + { + "epoch": 1.73069460613116, + "grad_norm": 0.34712135791778564, + "learning_rate": 0.0002, + "loss": 1.7805, + "step": 2230 + }, + { + "epoch": 1.738455568490493, + "grad_norm": 0.34801796078681946, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 2240 + }, + { + "epoch": 1.7462165308498254, + "grad_norm": 0.3822397291660309, + "learning_rate": 0.0002, + "loss": 1.7009, + "step": 2250 + }, + { + "epoch": 1.7539774932091579, + "grad_norm": 0.38933250308036804, + "learning_rate": 0.0002, + "loss": 1.6546, + "step": 2260 + }, + { + "epoch": 1.7617384555684905, + "grad_norm": 0.3798373341560364, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 2270 + }, + { + "epoch": 1.7694994179278232, + "grad_norm": 0.35151317715644836, + "learning_rate": 0.0002, + "loss": 1.6508, + "step": 2280 + }, + { + "epoch": 1.7772603802871556, + "grad_norm": 0.44981494545936584, + "learning_rate": 0.0002, + "loss": 1.6894, + "step": 2290 + }, + { + "epoch": 1.785021342646488, + "grad_norm": 0.3992624580860138, + "learning_rate": 0.0002, + "loss": 1.7271, + "step": 2300 + }, + { + "epoch": 1.7927823050058207, + "grad_norm": 0.3772512376308441, + "learning_rate": 0.0002, + "loss": 1.7252, + "step": 2310 + }, + { + "epoch": 1.8005432673651534, + "grad_norm": 0.3511589467525482, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 2320 + }, + { + "epoch": 1.8083042297244858, + "grad_norm": 0.3805285394191742, + "learning_rate": 0.0002, + "loss": 1.764, + "step": 2330 + }, + { + "epoch": 1.8160651920838184, + "grad_norm": 0.3792071044445038, + "learning_rate": 0.0002, + "loss": 1.6986, + "step": 2340 + }, + { + "epoch": 1.823826154443151, + "grad_norm": 0.36430829763412476, + "learning_rate": 0.0002, + "loss": 1.7759, + "step": 2350 + }, + { + "epoch": 1.8315871168024835, + "grad_norm": 0.36502477526664734, + "learning_rate": 0.0002, + "loss": 1.6773, + "step": 2360 + }, + { + "epoch": 1.839348079161816, + "grad_norm": 0.35015153884887695, + "learning_rate": 0.0002, + "loss": 1.8072, + "step": 2370 + }, + { + "epoch": 1.8471090415211486, + "grad_norm": 0.3710903823375702, + "learning_rate": 0.0002, + "loss": 1.7734, + "step": 2380 + }, + { + "epoch": 1.8548700038804813, + "grad_norm": 0.3542828857898712, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 2390 + }, + { + "epoch": 1.8626309662398137, + "grad_norm": 0.35467568039894104, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 2400 + }, + { + "epoch": 1.8703919285991462, + "grad_norm": 0.3638560473918915, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 2410 + }, + { + "epoch": 1.8781528909584788, + "grad_norm": 0.3823298215866089, + "learning_rate": 0.0002, + "loss": 1.7019, + "step": 2420 + }, + { + "epoch": 1.8859138533178115, + "grad_norm": 0.3926416337490082, + "learning_rate": 0.0002, + "loss": 1.6935, + "step": 2430 + }, + { + "epoch": 1.893674815677144, + "grad_norm": 0.3608079254627228, + "learning_rate": 0.0002, + "loss": 1.71, + "step": 2440 + }, + { + "epoch": 1.9014357780364766, + "grad_norm": 0.3426613509654999, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 2450 + }, + { + "epoch": 1.9091967403958092, + "grad_norm": 0.3522338569164276, + "learning_rate": 0.0002, + "loss": 1.6892, + "step": 2460 + }, + { + "epoch": 1.9169577027551417, + "grad_norm": 0.3608049154281616, + "learning_rate": 0.0002, + "loss": 1.7307, + "step": 2470 + }, + { + "epoch": 1.924718665114474, + "grad_norm": 0.3849755525588989, + "learning_rate": 0.0002, + "loss": 1.6823, + "step": 2480 + }, + { + "epoch": 1.9324796274738067, + "grad_norm": 0.4154011011123657, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 2490 + }, + { + "epoch": 1.9402405898331394, + "grad_norm": 0.3602796792984009, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 2500 + }, + { + "epoch": 1.9480015521924718, + "grad_norm": 0.3702992796897888, + "learning_rate": 0.0002, + "loss": 1.7843, + "step": 2510 + }, + { + "epoch": 1.9557625145518043, + "grad_norm": 0.3657735288143158, + "learning_rate": 0.0002, + "loss": 1.6669, + "step": 2520 + }, + { + "epoch": 1.963523476911137, + "grad_norm": 0.41031739115715027, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 2530 + }, + { + "epoch": 1.9712844392704696, + "grad_norm": 0.34578680992126465, + "learning_rate": 0.0002, + "loss": 1.6745, + "step": 2540 + }, + { + "epoch": 1.979045401629802, + "grad_norm": 0.3361521065235138, + "learning_rate": 0.0002, + "loss": 1.723, + "step": 2550 + }, + { + "epoch": 1.9868063639891347, + "grad_norm": 0.34342363476753235, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 2560 + }, + { + "epoch": 1.9945673263484673, + "grad_norm": 0.32954007387161255, + "learning_rate": 0.0002, + "loss": 1.6577, + "step": 2570 + }, + { + "epoch": 2.0, + "eval_loss": 1.8068748712539673, + "eval_runtime": 105.5885, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 0.606, + "step": 2577 + }, + { + "epoch": 2.0023282887077998, + "grad_norm": 0.336302250623703, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 2580 + }, + { + "epoch": 2.010089251067132, + "grad_norm": 0.3627048432826996, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 2590 + }, + { + "epoch": 2.017850213426465, + "grad_norm": 0.38406702876091003, + "learning_rate": 0.0002, + "loss": 1.4908, + "step": 2600 + }, + { + "epoch": 2.0256111757857975, + "grad_norm": 0.5326781272888184, + "learning_rate": 0.0002, + "loss": 1.5368, + "step": 2610 + }, + { + "epoch": 2.03337213814513, + "grad_norm": 0.4774554967880249, + "learning_rate": 0.0002, + "loss": 1.5727, + "step": 2620 + }, + { + "epoch": 2.0411331005044624, + "grad_norm": 0.4251810312271118, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 2630 + }, + { + "epoch": 2.0488940628637953, + "grad_norm": 0.4693007171154022, + "learning_rate": 0.0002, + "loss": 1.5152, + "step": 2640 + }, + { + "epoch": 2.0566550252231277, + "grad_norm": 0.46371519565582275, + "learning_rate": 0.0002, + "loss": 1.6137, + "step": 2650 + }, + { + "epoch": 2.06441598758246, + "grad_norm": 0.46652570366859436, + "learning_rate": 0.0002, + "loss": 1.6304, + "step": 2660 + }, + { + "epoch": 2.0721769499417926, + "grad_norm": 0.45200315117836, + "learning_rate": 0.0002, + "loss": 1.6022, + "step": 2670 + }, + { + "epoch": 2.0799379123011255, + "grad_norm": 0.42905205488204956, + "learning_rate": 0.0002, + "loss": 1.5358, + "step": 2680 + }, + { + "epoch": 2.087698874660458, + "grad_norm": 0.44509148597717285, + "learning_rate": 0.0002, + "loss": 1.5401, + "step": 2690 + }, + { + "epoch": 2.0954598370197903, + "grad_norm": 0.4445319175720215, + "learning_rate": 0.0002, + "loss": 1.5303, + "step": 2700 + }, + { + "epoch": 2.103220799379123, + "grad_norm": 0.46825504302978516, + "learning_rate": 0.0002, + "loss": 1.5701, + "step": 2710 + }, + { + "epoch": 2.1109817617384556, + "grad_norm": 0.4623856842517853, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 2720 + }, + { + "epoch": 2.118742724097788, + "grad_norm": 0.4833452105522156, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 2730 + }, + { + "epoch": 2.1265036864571205, + "grad_norm": 0.4582686722278595, + "learning_rate": 0.0002, + "loss": 1.5997, + "step": 2740 + }, + { + "epoch": 2.1342646488164534, + "grad_norm": 0.47587934136390686, + "learning_rate": 0.0002, + "loss": 1.5801, + "step": 2750 + }, + { + "epoch": 2.142025611175786, + "grad_norm": 0.4602217972278595, + "learning_rate": 0.0002, + "loss": 1.594, + "step": 2760 + }, + { + "epoch": 2.1497865735351183, + "grad_norm": 0.47501352429389954, + "learning_rate": 0.0002, + "loss": 1.5271, + "step": 2770 + }, + { + "epoch": 2.1575475358944507, + "grad_norm": 0.5078499913215637, + "learning_rate": 0.0002, + "loss": 1.4862, + "step": 2780 + }, + { + "epoch": 2.1653084982537836, + "grad_norm": 0.497704416513443, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 2790 + }, + { + "epoch": 2.173069460613116, + "grad_norm": 0.5435971617698669, + "learning_rate": 0.0002, + "loss": 1.5597, + "step": 2800 + }, + { + "epoch": 2.1808304229724484, + "grad_norm": 0.5172356367111206, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 2810 + }, + { + "epoch": 2.1885913853317813, + "grad_norm": 0.44063422083854675, + "learning_rate": 0.0002, + "loss": 1.5202, + "step": 2820 + }, + { + "epoch": 2.1963523476911138, + "grad_norm": 0.5079569220542908, + "learning_rate": 0.0002, + "loss": 1.6041, + "step": 2830 + }, + { + "epoch": 2.204113310050446, + "grad_norm": 0.45658132433891296, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 2840 + }, + { + "epoch": 2.2118742724097786, + "grad_norm": 0.5103023648262024, + "learning_rate": 0.0002, + "loss": 1.5546, + "step": 2850 + }, + { + "epoch": 2.2196352347691115, + "grad_norm": 0.4882226288318634, + "learning_rate": 0.0002, + "loss": 1.6197, + "step": 2860 + }, + { + "epoch": 2.227396197128444, + "grad_norm": 0.5087296962738037, + "learning_rate": 0.0002, + "loss": 1.5996, + "step": 2870 + }, + { + "epoch": 2.2351571594877764, + "grad_norm": 0.45293712615966797, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 2880 + }, + { + "epoch": 2.242918121847109, + "grad_norm": 0.5120379328727722, + "learning_rate": 0.0002, + "loss": 1.6214, + "step": 2890 + }, + { + "epoch": 2.2506790842064417, + "grad_norm": 0.47126415371894836, + "learning_rate": 0.0002, + "loss": 1.5273, + "step": 2900 + }, + { + "epoch": 2.258440046565774, + "grad_norm": 0.44005846977233887, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 2910 + }, + { + "epoch": 2.2662010089251066, + "grad_norm": 0.46476176381111145, + "learning_rate": 0.0002, + "loss": 1.6023, + "step": 2920 + }, + { + "epoch": 2.2739619712844394, + "grad_norm": 0.48051515221595764, + "learning_rate": 0.0002, + "loss": 1.6417, + "step": 2930 + }, + { + "epoch": 2.281722933643772, + "grad_norm": 0.480069637298584, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 2940 + }, + { + "epoch": 2.2894838960031043, + "grad_norm": 0.5122102499008179, + "learning_rate": 0.0002, + "loss": 1.5747, + "step": 2950 + }, + { + "epoch": 2.2972448583624367, + "grad_norm": 0.48879891633987427, + "learning_rate": 0.0002, + "loss": 1.5183, + "step": 2960 + }, + { + "epoch": 2.3050058207217696, + "grad_norm": 0.4973136782646179, + "learning_rate": 0.0002, + "loss": 1.5483, + "step": 2970 + }, + { + "epoch": 2.312766783081102, + "grad_norm": 0.5522695183753967, + "learning_rate": 0.0002, + "loss": 1.677, + "step": 2980 + }, + { + "epoch": 2.3205277454404345, + "grad_norm": 0.5220217704772949, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 2990 + }, + { + "epoch": 2.328288707799767, + "grad_norm": 0.4978662431240082, + "learning_rate": 0.0002, + "loss": 1.6299, + "step": 3000 + }, + { + "epoch": 2.3360496701591, + "grad_norm": 0.554053544998169, + "learning_rate": 0.0002, + "loss": 1.5498, + "step": 3010 + }, + { + "epoch": 2.3438106325184322, + "grad_norm": 0.4703886806964874, + "learning_rate": 0.0002, + "loss": 1.5356, + "step": 3020 + }, + { + "epoch": 2.3515715948777647, + "grad_norm": 0.5074123740196228, + "learning_rate": 0.0002, + "loss": 1.5418, + "step": 3030 + }, + { + "epoch": 2.3593325572370976, + "grad_norm": 0.5088278651237488, + "learning_rate": 0.0002, + "loss": 1.6873, + "step": 3040 + }, + { + "epoch": 2.36709351959643, + "grad_norm": 0.4752114415168762, + "learning_rate": 0.0002, + "loss": 1.5249, + "step": 3050 + }, + { + "epoch": 2.3748544819557624, + "grad_norm": 0.5121659636497498, + "learning_rate": 0.0002, + "loss": 1.5353, + "step": 3060 + }, + { + "epoch": 2.3826154443150953, + "grad_norm": 0.48649218678474426, + "learning_rate": 0.0002, + "loss": 1.6426, + "step": 3070 + }, + { + "epoch": 2.3903764066744277, + "grad_norm": 0.5209488868713379, + "learning_rate": 0.0002, + "loss": 1.6136, + "step": 3080 + }, + { + "epoch": 2.39813736903376, + "grad_norm": 0.5110517740249634, + "learning_rate": 0.0002, + "loss": 1.597, + "step": 3090 + }, + { + "epoch": 2.4058983313930926, + "grad_norm": 0.5609337091445923, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 3100 + }, + { + "epoch": 2.4136592937524255, + "grad_norm": 0.5191826224327087, + "learning_rate": 0.0002, + "loss": 1.5438, + "step": 3110 + }, + { + "epoch": 2.421420256111758, + "grad_norm": 0.4876069724559784, + "learning_rate": 0.0002, + "loss": 1.6347, + "step": 3120 + }, + { + "epoch": 2.4291812184710904, + "grad_norm": 0.4713933765888214, + "learning_rate": 0.0002, + "loss": 1.5565, + "step": 3130 + }, + { + "epoch": 2.436942180830423, + "grad_norm": 0.5102227330207825, + "learning_rate": 0.0002, + "loss": 1.6388, + "step": 3140 + }, + { + "epoch": 2.4447031431897557, + "grad_norm": 0.44546666741371155, + "learning_rate": 0.0002, + "loss": 1.5667, + "step": 3150 + }, + { + "epoch": 2.452464105549088, + "grad_norm": 0.5167558193206787, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 3160 + }, + { + "epoch": 2.4602250679084205, + "grad_norm": 0.5226958990097046, + "learning_rate": 0.0002, + "loss": 1.5673, + "step": 3170 + }, + { + "epoch": 2.4679860302677534, + "grad_norm": 0.4751799702644348, + "learning_rate": 0.0002, + "loss": 1.5758, + "step": 3180 + }, + { + "epoch": 2.475746992627086, + "grad_norm": 0.4744729697704315, + "learning_rate": 0.0002, + "loss": 1.6234, + "step": 3190 + }, + { + "epoch": 2.4835079549864183, + "grad_norm": 0.5203230381011963, + "learning_rate": 0.0002, + "loss": 1.5661, + "step": 3200 + }, + { + "epoch": 2.4912689173457507, + "grad_norm": 0.47209781408309937, + "learning_rate": 0.0002, + "loss": 1.493, + "step": 3210 + }, + { + "epoch": 2.4990298797050836, + "grad_norm": 0.5241674780845642, + "learning_rate": 0.0002, + "loss": 1.6415, + "step": 3220 + }, + { + "epoch": 2.506790842064416, + "grad_norm": 0.5152244567871094, + "learning_rate": 0.0002, + "loss": 1.6324, + "step": 3230 + }, + { + "epoch": 2.5145518044237485, + "grad_norm": 0.5216741561889648, + "learning_rate": 0.0002, + "loss": 1.6248, + "step": 3240 + }, + { + "epoch": 2.522312766783081, + "grad_norm": 0.4953259527683258, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 3250 + }, + { + "epoch": 2.530073729142414, + "grad_norm": 0.5973829030990601, + "learning_rate": 0.0002, + "loss": 1.666, + "step": 3260 + }, + { + "epoch": 2.5378346915017462, + "grad_norm": 0.48804202675819397, + "learning_rate": 0.0002, + "loss": 1.5295, + "step": 3270 + }, + { + "epoch": 2.5455956538610787, + "grad_norm": 0.5334644317626953, + "learning_rate": 0.0002, + "loss": 1.4954, + "step": 3280 + }, + { + "epoch": 2.5533566162204115, + "grad_norm": 0.46873313188552856, + "learning_rate": 0.0002, + "loss": 1.5814, + "step": 3290 + }, + { + "epoch": 2.561117578579744, + "grad_norm": 0.4282589554786682, + "learning_rate": 0.0002, + "loss": 1.5362, + "step": 3300 + }, + { + "epoch": 2.5688785409390764, + "grad_norm": 0.4848293960094452, + "learning_rate": 0.0002, + "loss": 1.6278, + "step": 3310 + }, + { + "epoch": 2.576639503298409, + "grad_norm": 0.5093745589256287, + "learning_rate": 0.0002, + "loss": 1.6308, + "step": 3320 + }, + { + "epoch": 2.5844004656577413, + "grad_norm": 0.5084842443466187, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 3330 + }, + { + "epoch": 2.592161428017074, + "grad_norm": 0.4696281850337982, + "learning_rate": 0.0002, + "loss": 1.6168, + "step": 3340 + }, + { + "epoch": 2.5999223903764066, + "grad_norm": 0.5767765641212463, + "learning_rate": 0.0002, + "loss": 1.5359, + "step": 3350 + }, + { + "epoch": 2.607683352735739, + "grad_norm": 0.47300875186920166, + "learning_rate": 0.0002, + "loss": 1.6097, + "step": 3360 + }, + { + "epoch": 2.615444315095072, + "grad_norm": 0.4809158146381378, + "learning_rate": 0.0002, + "loss": 1.6138, + "step": 3370 + }, + { + "epoch": 2.6232052774544043, + "grad_norm": 0.5141063928604126, + "learning_rate": 0.0002, + "loss": 1.4952, + "step": 3380 + }, + { + "epoch": 2.630966239813737, + "grad_norm": 0.4832935035228729, + "learning_rate": 0.0002, + "loss": 1.5784, + "step": 3390 + }, + { + "epoch": 2.6387272021730697, + "grad_norm": 0.5044625401496887, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 3400 + }, + { + "epoch": 2.646488164532402, + "grad_norm": 0.5287680625915527, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 3410 + }, + { + "epoch": 2.6542491268917345, + "grad_norm": 0.5306379795074463, + "learning_rate": 0.0002, + "loss": 1.5423, + "step": 3420 + }, + { + "epoch": 2.662010089251067, + "grad_norm": 0.5849291682243347, + "learning_rate": 0.0002, + "loss": 1.5264, + "step": 3430 + }, + { + "epoch": 2.6697710516104, + "grad_norm": 0.7951080799102783, + "learning_rate": 0.0002, + "loss": 1.5937, + "step": 3440 + }, + { + "epoch": 2.6775320139697323, + "grad_norm": 0.48087653517723083, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 3450 + }, + { + "epoch": 2.6852929763290647, + "grad_norm": 0.5396431684494019, + "learning_rate": 0.0002, + "loss": 1.6769, + "step": 3460 + }, + { + "epoch": 2.693053938688397, + "grad_norm": 0.5481634736061096, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 3470 + }, + { + "epoch": 2.70081490104773, + "grad_norm": 0.5068731307983398, + "learning_rate": 0.0002, + "loss": 1.6436, + "step": 3480 + }, + { + "epoch": 2.7085758634070625, + "grad_norm": 0.5759826898574829, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 3490 + }, + { + "epoch": 2.716336825766395, + "grad_norm": 0.7253932952880859, + "learning_rate": 0.0002, + "loss": 1.596, + "step": 3500 + }, + { + "epoch": 2.724097788125728, + "grad_norm": 0.527745246887207, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 3510 + }, + { + "epoch": 2.73185875048506, + "grad_norm": 0.5279242396354675, + "learning_rate": 0.0002, + "loss": 1.5874, + "step": 3520 + }, + { + "epoch": 2.7396197128443927, + "grad_norm": 0.5047839283943176, + "learning_rate": 0.0002, + "loss": 1.6768, + "step": 3530 + }, + { + "epoch": 2.7473806752037255, + "grad_norm": 0.5430883169174194, + "learning_rate": 0.0002, + "loss": 1.5517, + "step": 3540 + }, + { + "epoch": 2.755141637563058, + "grad_norm": 0.4496723711490631, + "learning_rate": 0.0002, + "loss": 1.5624, + "step": 3550 + }, + { + "epoch": 2.7629025999223904, + "grad_norm": 0.5063338875770569, + "learning_rate": 0.0002, + "loss": 1.5789, + "step": 3560 + }, + { + "epoch": 2.770663562281723, + "grad_norm": 0.4619026780128479, + "learning_rate": 0.0002, + "loss": 1.52, + "step": 3570 + }, + { + "epoch": 2.7784245246410553, + "grad_norm": 0.4753304123878479, + "learning_rate": 0.0002, + "loss": 1.5793, + "step": 3580 + }, + { + "epoch": 2.786185487000388, + "grad_norm": 0.5422708988189697, + "learning_rate": 0.0002, + "loss": 1.5715, + "step": 3590 + }, + { + "epoch": 2.7939464493597206, + "grad_norm": 0.4756578803062439, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 3600 + }, + { + "epoch": 2.801707411719053, + "grad_norm": 0.5057567358016968, + "learning_rate": 0.0002, + "loss": 1.5358, + "step": 3610 + }, + { + "epoch": 2.809468374078386, + "grad_norm": 0.5410919785499573, + "learning_rate": 0.0002, + "loss": 1.6131, + "step": 3620 + }, + { + "epoch": 2.8172293364377183, + "grad_norm": 0.4958136975765228, + "learning_rate": 0.0002, + "loss": 1.5573, + "step": 3630 + }, + { + "epoch": 2.8249902987970508, + "grad_norm": 0.454527348279953, + "learning_rate": 0.0002, + "loss": 1.6324, + "step": 3640 + }, + { + "epoch": 2.8327512611563836, + "grad_norm": 0.5092706084251404, + "learning_rate": 0.0002, + "loss": 1.5582, + "step": 3650 + }, + { + "epoch": 2.840512223515716, + "grad_norm": 0.5314022302627563, + "learning_rate": 0.0002, + "loss": 1.5893, + "step": 3660 + }, + { + "epoch": 2.8482731858750485, + "grad_norm": 0.5028239488601685, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 3670 + }, + { + "epoch": 2.856034148234381, + "grad_norm": 0.5127444863319397, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 3680 + }, + { + "epoch": 2.8637951105937134, + "grad_norm": 0.5045645236968994, + "learning_rate": 0.0002, + "loss": 1.6018, + "step": 3690 + }, + { + "epoch": 2.8715560729530463, + "grad_norm": 0.5560781955718994, + "learning_rate": 0.0002, + "loss": 1.5788, + "step": 3700 + }, + { + "epoch": 2.8793170353123787, + "grad_norm": 0.5177600383758545, + "learning_rate": 0.0002, + "loss": 1.5988, + "step": 3710 + }, + { + "epoch": 2.887077997671711, + "grad_norm": 0.45830899477005005, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 3720 + }, + { + "epoch": 2.894838960031044, + "grad_norm": 0.4828629195690155, + "learning_rate": 0.0002, + "loss": 1.6344, + "step": 3730 + }, + { + "epoch": 2.9025999223903765, + "grad_norm": 0.48241183161735535, + "learning_rate": 0.0002, + "loss": 1.6758, + "step": 3740 + }, + { + "epoch": 2.910360884749709, + "grad_norm": 0.4909592568874359, + "learning_rate": 0.0002, + "loss": 1.5649, + "step": 3750 + }, + { + "epoch": 2.9181218471090418, + "grad_norm": 0.44677025079727173, + "learning_rate": 0.0002, + "loss": 1.4927, + "step": 3760 + }, + { + "epoch": 2.925882809468374, + "grad_norm": 0.4928834140300751, + "learning_rate": 0.0002, + "loss": 1.5067, + "step": 3770 + }, + { + "epoch": 2.9336437718277066, + "grad_norm": 0.5673553347587585, + "learning_rate": 0.0002, + "loss": 1.5843, + "step": 3780 + }, + { + "epoch": 2.941404734187039, + "grad_norm": 0.548190712928772, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 3790 + }, + { + "epoch": 2.9491656965463715, + "grad_norm": 0.48979803919792175, + "learning_rate": 0.0002, + "loss": 1.5892, + "step": 3800 + }, + { + "epoch": 2.9569266589057044, + "grad_norm": 0.533191978931427, + "learning_rate": 0.0002, + "loss": 1.5589, + "step": 3810 + }, + { + "epoch": 2.964687621265037, + "grad_norm": 0.5362946391105652, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 3820 + }, + { + "epoch": 2.9724485836243693, + "grad_norm": 0.4724906384944916, + "learning_rate": 0.0002, + "loss": 1.6602, + "step": 3830 + }, + { + "epoch": 2.980209545983702, + "grad_norm": 0.5468461513519287, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 3840 + }, + { + "epoch": 2.9879705083430346, + "grad_norm": 0.4697108864784241, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 3850 + }, + { + "epoch": 2.995731470702367, + "grad_norm": 0.4780906140804291, + "learning_rate": 0.0002, + "loss": 1.6312, + "step": 3860 + }, + { + "epoch": 2.9996119518820334, + "eval_loss": 1.8472607135772705, + "eval_runtime": 106.5541, + "eval_samples_per_second": 4.758, + "eval_steps_per_second": 0.601, + "step": 3865 + }, + { + "epoch": 3.0034924330616994, + "grad_norm": 0.5645653605461121, + "learning_rate": 0.0002, + "loss": 1.4983, + "step": 3870 + }, + { + "epoch": 3.0112533954210323, + "grad_norm": 0.6457151174545288, + "learning_rate": 0.0002, + "loss": 1.4334, + "step": 3880 + }, + { + "epoch": 3.0190143577803648, + "grad_norm": 0.583838164806366, + "learning_rate": 0.0002, + "loss": 1.3899, + "step": 3890 + }, + { + "epoch": 3.026775320139697, + "grad_norm": 0.6819260120391846, + "learning_rate": 0.0002, + "loss": 1.3258, + "step": 3900 + }, + { + "epoch": 3.03453628249903, + "grad_norm": 0.6692903637886047, + "learning_rate": 0.0002, + "loss": 1.3458, + "step": 3910 + }, + { + "epoch": 3.0422972448583625, + "grad_norm": 0.6101024746894836, + "learning_rate": 0.0002, + "loss": 1.4356, + "step": 3920 + }, + { + "epoch": 3.050058207217695, + "grad_norm": 0.7014093399047852, + "learning_rate": 0.0002, + "loss": 1.394, + "step": 3930 + }, + { + "epoch": 3.0578191695770274, + "grad_norm": 0.7380381226539612, + "learning_rate": 0.0002, + "loss": 1.3885, + "step": 3940 + }, + { + "epoch": 3.0655801319363603, + "grad_norm": 0.6607900857925415, + "learning_rate": 0.0002, + "loss": 1.4206, + "step": 3950 + }, + { + "epoch": 3.0733410942956927, + "grad_norm": 0.735263466835022, + "learning_rate": 0.0002, + "loss": 1.4293, + "step": 3960 + }, + { + "epoch": 3.081102056655025, + "grad_norm": 0.6788513660430908, + "learning_rate": 0.0002, + "loss": 1.3966, + "step": 3970 + }, + { + "epoch": 3.088863019014358, + "grad_norm": 0.6347652673721313, + "learning_rate": 0.0002, + "loss": 1.3435, + "step": 3980 + }, + { + "epoch": 3.0966239813736904, + "grad_norm": 0.7056642770767212, + "learning_rate": 0.0002, + "loss": 1.4518, + "step": 3990 + }, + { + "epoch": 3.104384943733023, + "grad_norm": 0.6387075185775757, + "learning_rate": 0.0002, + "loss": 1.4474, + "step": 4000 + }, + { + "epoch": 3.1121459060923553, + "grad_norm": 0.6701116561889648, + "learning_rate": 0.0002, + "loss": 1.3833, + "step": 4010 + }, + { + "epoch": 3.119906868451688, + "grad_norm": 0.7558449506759644, + "learning_rate": 0.0002, + "loss": 1.404, + "step": 4020 + }, + { + "epoch": 3.1276678308110206, + "grad_norm": 0.6612881422042847, + "learning_rate": 0.0002, + "loss": 1.3294, + "step": 4030 + }, + { + "epoch": 3.135428793170353, + "grad_norm": 0.7474587559700012, + "learning_rate": 0.0002, + "loss": 1.439, + "step": 4040 + }, + { + "epoch": 3.1431897555296855, + "grad_norm": 0.7292373776435852, + "learning_rate": 0.0002, + "loss": 1.4616, + "step": 4050 + }, + { + "epoch": 3.1509507178890184, + "grad_norm": 0.7432886958122253, + "learning_rate": 0.0002, + "loss": 1.3908, + "step": 4060 + }, + { + "epoch": 3.158711680248351, + "grad_norm": 0.6366098523139954, + "learning_rate": 0.0002, + "loss": 1.4214, + "step": 4070 + }, + { + "epoch": 3.1664726426076832, + "grad_norm": 0.6837611794471741, + "learning_rate": 0.0002, + "loss": 1.5044, + "step": 4080 + }, + { + "epoch": 3.174233604967016, + "grad_norm": 0.7194393277168274, + "learning_rate": 0.0002, + "loss": 1.4332, + "step": 4090 + }, + { + "epoch": 3.1819945673263486, + "grad_norm": 0.6963607668876648, + "learning_rate": 0.0002, + "loss": 1.3628, + "step": 4100 + }, + { + "epoch": 3.189755529685681, + "grad_norm": 0.6404902935028076, + "learning_rate": 0.0002, + "loss": 1.4127, + "step": 4110 + }, + { + "epoch": 3.1975164920450134, + "grad_norm": 0.7172070741653442, + "learning_rate": 0.0002, + "loss": 1.4394, + "step": 4120 + }, + { + "epoch": 3.2052774544043463, + "grad_norm": 0.6577759385108948, + "learning_rate": 0.0002, + "loss": 1.4658, + "step": 4130 + }, + { + "epoch": 3.2130384167636787, + "grad_norm": 0.6658480167388916, + "learning_rate": 0.0002, + "loss": 1.4019, + "step": 4140 + }, + { + "epoch": 3.220799379123011, + "grad_norm": 0.6771699786186218, + "learning_rate": 0.0002, + "loss": 1.4348, + "step": 4150 + }, + { + "epoch": 3.2285603414823436, + "grad_norm": 0.699035108089447, + "learning_rate": 0.0002, + "loss": 1.4736, + "step": 4160 + }, + { + "epoch": 3.2363213038416765, + "grad_norm": 0.7218514680862427, + "learning_rate": 0.0002, + "loss": 1.4096, + "step": 4170 + }, + { + "epoch": 3.244082266201009, + "grad_norm": 0.6270631551742554, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 4180 + }, + { + "epoch": 3.2518432285603414, + "grad_norm": 0.6828921437263489, + "learning_rate": 0.0002, + "loss": 1.4076, + "step": 4190 + }, + { + "epoch": 3.2596041909196742, + "grad_norm": 0.6005498170852661, + "learning_rate": 0.0002, + "loss": 1.4663, + "step": 4200 + }, + { + "epoch": 3.2673651532790067, + "grad_norm": 0.6974790692329407, + "learning_rate": 0.0002, + "loss": 1.4798, + "step": 4210 + }, + { + "epoch": 3.275126115638339, + "grad_norm": 0.7269543409347534, + "learning_rate": 0.0002, + "loss": 1.5012, + "step": 4220 + }, + { + "epoch": 3.2828870779976715, + "grad_norm": 0.6728787422180176, + "learning_rate": 0.0002, + "loss": 1.3848, + "step": 4230 + }, + { + "epoch": 3.2906480403570044, + "grad_norm": 0.676972508430481, + "learning_rate": 0.0002, + "loss": 1.4112, + "step": 4240 + }, + { + "epoch": 3.298409002716337, + "grad_norm": 0.748309314250946, + "learning_rate": 0.0002, + "loss": 1.4206, + "step": 4250 + }, + { + "epoch": 3.3061699650756693, + "grad_norm": 0.6976589560508728, + "learning_rate": 0.0002, + "loss": 1.4973, + "step": 4260 + }, + { + "epoch": 3.3139309274350017, + "grad_norm": 0.649780809879303, + "learning_rate": 0.0002, + "loss": 1.3967, + "step": 4270 + }, + { + "epoch": 3.3216918897943346, + "grad_norm": 0.6529902815818787, + "learning_rate": 0.0002, + "loss": 1.327, + "step": 4280 + }, + { + "epoch": 3.329452852153667, + "grad_norm": 0.9273163676261902, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 4290 + }, + { + "epoch": 3.3372138145129995, + "grad_norm": 0.717024028301239, + "learning_rate": 0.0002, + "loss": 1.4859, + "step": 4300 + }, + { + "epoch": 3.3449747768723324, + "grad_norm": 0.7914950251579285, + "learning_rate": 0.0002, + "loss": 1.4441, + "step": 4310 + }, + { + "epoch": 3.352735739231665, + "grad_norm": 0.7133203148841858, + "learning_rate": 0.0002, + "loss": 1.432, + "step": 4320 + }, + { + "epoch": 3.3604967015909972, + "grad_norm": 0.7409568428993225, + "learning_rate": 0.0002, + "loss": 1.4662, + "step": 4330 + }, + { + "epoch": 3.3682576639503297, + "grad_norm": 0.6993981003761292, + "learning_rate": 0.0002, + "loss": 1.3992, + "step": 4340 + }, + { + "epoch": 3.3760186263096625, + "grad_norm": 0.7114535570144653, + "learning_rate": 0.0002, + "loss": 1.4261, + "step": 4350 + }, + { + "epoch": 3.383779588668995, + "grad_norm": 0.6790860295295715, + "learning_rate": 0.0002, + "loss": 1.4227, + "step": 4360 + }, + { + "epoch": 3.3915405510283274, + "grad_norm": 0.6507849097251892, + "learning_rate": 0.0002, + "loss": 1.4128, + "step": 4370 + }, + { + "epoch": 3.39930151338766, + "grad_norm": 0.5967804193496704, + "learning_rate": 0.0002, + "loss": 1.4559, + "step": 4380 + }, + { + "epoch": 3.4070624757469927, + "grad_norm": 0.6625847816467285, + "learning_rate": 0.0002, + "loss": 1.3687, + "step": 4390 + }, + { + "epoch": 3.414823438106325, + "grad_norm": 0.6736508011817932, + "learning_rate": 0.0002, + "loss": 1.4193, + "step": 4400 + }, + { + "epoch": 3.4225844004656576, + "grad_norm": 0.7870860695838928, + "learning_rate": 0.0002, + "loss": 1.4363, + "step": 4410 + }, + { + "epoch": 3.4303453628249905, + "grad_norm": 0.7205295562744141, + "learning_rate": 0.0002, + "loss": 1.4114, + "step": 4420 + }, + { + "epoch": 3.438106325184323, + "grad_norm": 0.6634634137153625, + "learning_rate": 0.0002, + "loss": 1.4131, + "step": 4430 + }, + { + "epoch": 3.4458672875436553, + "grad_norm": 0.7562733292579651, + "learning_rate": 0.0002, + "loss": 1.4683, + "step": 4440 + }, + { + "epoch": 3.453628249902988, + "grad_norm": 0.6585879921913147, + "learning_rate": 0.0002, + "loss": 1.3486, + "step": 4450 + }, + { + "epoch": 3.4613892122623207, + "grad_norm": 0.6896792054176331, + "learning_rate": 0.0002, + "loss": 1.4283, + "step": 4460 + }, + { + "epoch": 3.469150174621653, + "grad_norm": 0.6520342230796814, + "learning_rate": 0.0002, + "loss": 1.4208, + "step": 4470 + }, + { + "epoch": 3.4769111369809855, + "grad_norm": 0.6760806441307068, + "learning_rate": 0.0002, + "loss": 1.3423, + "step": 4480 + }, + { + "epoch": 3.484672099340318, + "grad_norm": 0.7539774179458618, + "learning_rate": 0.0002, + "loss": 1.4398, + "step": 4490 + }, + { + "epoch": 3.492433061699651, + "grad_norm": 0.7409411668777466, + "learning_rate": 0.0002, + "loss": 1.4534, + "step": 4500 + }, + { + "epoch": 3.5001940240589833, + "grad_norm": 0.6876253485679626, + "learning_rate": 0.0002, + "loss": 1.4069, + "step": 4510 + }, + { + "epoch": 3.5079549864183157, + "grad_norm": 0.7028461694717407, + "learning_rate": 0.0002, + "loss": 1.4228, + "step": 4520 + }, + { + "epoch": 3.5157159487776486, + "grad_norm": 0.8056529760360718, + "learning_rate": 0.0002, + "loss": 1.4723, + "step": 4530 + }, + { + "epoch": 3.523476911136981, + "grad_norm": 0.711338996887207, + "learning_rate": 0.0002, + "loss": 1.4148, + "step": 4540 + }, + { + "epoch": 3.5312378734963135, + "grad_norm": 0.7343552708625793, + "learning_rate": 0.0002, + "loss": 1.5247, + "step": 4550 + }, + { + "epoch": 3.5389988358556463, + "grad_norm": 0.745479941368103, + "learning_rate": 0.0002, + "loss": 1.4308, + "step": 4560 + }, + { + "epoch": 3.5467597982149788, + "grad_norm": 0.7582294940948486, + "learning_rate": 0.0002, + "loss": 1.4229, + "step": 4570 + }, + { + "epoch": 3.554520760574311, + "grad_norm": 0.6717444658279419, + "learning_rate": 0.0002, + "loss": 1.4127, + "step": 4580 + }, + { + "epoch": 3.5622817229336436, + "grad_norm": 0.7417883276939392, + "learning_rate": 0.0002, + "loss": 1.4368, + "step": 4590 + }, + { + "epoch": 3.570042685292976, + "grad_norm": 0.6385737061500549, + "learning_rate": 0.0002, + "loss": 1.4176, + "step": 4600 + }, + { + "epoch": 3.577803647652309, + "grad_norm": 0.716704249382019, + "learning_rate": 0.0002, + "loss": 1.3981, + "step": 4610 + }, + { + "epoch": 3.5855646100116414, + "grad_norm": 0.6948980093002319, + "learning_rate": 0.0002, + "loss": 1.3889, + "step": 4620 + }, + { + "epoch": 3.593325572370974, + "grad_norm": 0.6961140036582947, + "learning_rate": 0.0002, + "loss": 1.5177, + "step": 4630 + }, + { + "epoch": 3.6010865347303067, + "grad_norm": 0.7493122220039368, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 4640 + }, + { + "epoch": 3.608847497089639, + "grad_norm": 0.7431658506393433, + "learning_rate": 0.0002, + "loss": 1.3987, + "step": 4650 + }, + { + "epoch": 3.6166084594489716, + "grad_norm": 0.8353387713432312, + "learning_rate": 0.0002, + "loss": 1.4551, + "step": 4660 + }, + { + "epoch": 3.6243694218083045, + "grad_norm": 0.7095612287521362, + "learning_rate": 0.0002, + "loss": 1.4533, + "step": 4670 + }, + { + "epoch": 3.632130384167637, + "grad_norm": 0.776620090007782, + "learning_rate": 0.0002, + "loss": 1.4003, + "step": 4680 + }, + { + "epoch": 3.6398913465269693, + "grad_norm": 0.7198925018310547, + "learning_rate": 0.0002, + "loss": 1.4361, + "step": 4690 + }, + { + "epoch": 3.6476523088863018, + "grad_norm": 0.8238834738731384, + "learning_rate": 0.0002, + "loss": 1.4543, + "step": 4700 + }, + { + "epoch": 3.655413271245634, + "grad_norm": 0.6804245710372925, + "learning_rate": 0.0002, + "loss": 1.3958, + "step": 4710 + }, + { + "epoch": 3.663174233604967, + "grad_norm": 0.8444845676422119, + "learning_rate": 0.0002, + "loss": 1.4158, + "step": 4720 + }, + { + "epoch": 3.6709351959642995, + "grad_norm": 0.743797779083252, + "learning_rate": 0.0002, + "loss": 1.3825, + "step": 4730 + }, + { + "epoch": 3.678696158323632, + "grad_norm": 0.8994188904762268, + "learning_rate": 0.0002, + "loss": 1.4213, + "step": 4740 + }, + { + "epoch": 3.686457120682965, + "grad_norm": 0.75416100025177, + "learning_rate": 0.0002, + "loss": 1.4281, + "step": 4750 + }, + { + "epoch": 3.6942180830422973, + "grad_norm": 0.6499266028404236, + "learning_rate": 0.0002, + "loss": 1.4154, + "step": 4760 + }, + { + "epoch": 3.7019790454016297, + "grad_norm": 0.7246791124343872, + "learning_rate": 0.0002, + "loss": 1.4005, + "step": 4770 + }, + { + "epoch": 3.7097400077609626, + "grad_norm": 0.7831124067306519, + "learning_rate": 0.0002, + "loss": 1.426, + "step": 4780 + }, + { + "epoch": 3.717500970120295, + "grad_norm": 0.7130028009414673, + "learning_rate": 0.0002, + "loss": 1.3933, + "step": 4790 + }, + { + "epoch": 3.7252619324796274, + "grad_norm": 0.7501602172851562, + "learning_rate": 0.0002, + "loss": 1.4632, + "step": 4800 + }, + { + "epoch": 3.73302289483896, + "grad_norm": 0.6980932950973511, + "learning_rate": 0.0002, + "loss": 1.4985, + "step": 4810 + }, + { + "epoch": 3.7407838571982923, + "grad_norm": 0.8050530552864075, + "learning_rate": 0.0002, + "loss": 1.4517, + "step": 4820 + }, + { + "epoch": 3.748544819557625, + "grad_norm": 0.6385579705238342, + "learning_rate": 0.0002, + "loss": 1.4703, + "step": 4830 + }, + { + "epoch": 3.7563057819169576, + "grad_norm": 0.6664714813232422, + "learning_rate": 0.0002, + "loss": 1.5281, + "step": 4840 + }, + { + "epoch": 3.76406674427629, + "grad_norm": 0.7125676274299622, + "learning_rate": 0.0002, + "loss": 1.4443, + "step": 4850 + }, + { + "epoch": 3.771827706635623, + "grad_norm": 0.7231866717338562, + "learning_rate": 0.0002, + "loss": 1.3958, + "step": 4860 + }, + { + "epoch": 3.7795886689949554, + "grad_norm": 0.6917183995246887, + "learning_rate": 0.0002, + "loss": 1.4446, + "step": 4870 + }, + { + "epoch": 3.787349631354288, + "grad_norm": 0.665037989616394, + "learning_rate": 0.0002, + "loss": 1.4369, + "step": 4880 + }, + { + "epoch": 3.7951105937136207, + "grad_norm": 0.5837726593017578, + "learning_rate": 0.0002, + "loss": 1.4193, + "step": 4890 + }, + { + "epoch": 3.802871556072953, + "grad_norm": 0.6366701722145081, + "learning_rate": 0.0002, + "loss": 1.4176, + "step": 4900 + }, + { + "epoch": 3.8106325184322856, + "grad_norm": 0.7082223892211914, + "learning_rate": 0.0002, + "loss": 1.46, + "step": 4910 + }, + { + "epoch": 3.818393480791618, + "grad_norm": 0.8101672530174255, + "learning_rate": 0.0002, + "loss": 1.5139, + "step": 4920 + }, + { + "epoch": 3.826154443150951, + "grad_norm": 0.7516148090362549, + "learning_rate": 0.0002, + "loss": 1.3659, + "step": 4930 + }, + { + "epoch": 3.8339154055102833, + "grad_norm": 0.7928489446640015, + "learning_rate": 0.0002, + "loss": 1.3909, + "step": 4940 + }, + { + "epoch": 3.8416763678696157, + "grad_norm": 0.6892234683036804, + "learning_rate": 0.0002, + "loss": 1.4255, + "step": 4950 + }, + { + "epoch": 3.849437330228948, + "grad_norm": 0.6381304264068604, + "learning_rate": 0.0002, + "loss": 1.5024, + "step": 4960 + }, + { + "epoch": 3.857198292588281, + "grad_norm": 0.8068831562995911, + "learning_rate": 0.0002, + "loss": 1.4873, + "step": 4970 + }, + { + "epoch": 3.8649592549476135, + "grad_norm": 0.7289869785308838, + "learning_rate": 0.0002, + "loss": 1.45, + "step": 4980 + }, + { + "epoch": 3.872720217306946, + "grad_norm": 0.7278549075126648, + "learning_rate": 0.0002, + "loss": 1.398, + "step": 4990 + }, + { + "epoch": 3.880481179666279, + "grad_norm": 0.7324236631393433, + "learning_rate": 0.0002, + "loss": 1.4442, + "step": 5000 + }, + { + "epoch": 3.8882421420256112, + "grad_norm": 0.6759871244430542, + "learning_rate": 0.0002, + "loss": 1.4511, + "step": 5010 + }, + { + "epoch": 3.8960031043849437, + "grad_norm": 0.8159207701683044, + "learning_rate": 0.0002, + "loss": 1.4705, + "step": 5020 + }, + { + "epoch": 3.9037640667442766, + "grad_norm": 0.6536211967468262, + "learning_rate": 0.0002, + "loss": 1.4685, + "step": 5030 + }, + { + "epoch": 3.911525029103609, + "grad_norm": 0.6827932000160217, + "learning_rate": 0.0002, + "loss": 1.4335, + "step": 5040 + }, + { + "epoch": 3.9192859914629414, + "grad_norm": 0.6688340306282043, + "learning_rate": 0.0002, + "loss": 1.433, + "step": 5050 + }, + { + "epoch": 3.927046953822274, + "grad_norm": 0.6385695934295654, + "learning_rate": 0.0002, + "loss": 1.4099, + "step": 5060 + }, + { + "epoch": 3.9348079161816063, + "grad_norm": 0.6975107192993164, + "learning_rate": 0.0002, + "loss": 1.4767, + "step": 5070 + }, + { + "epoch": 3.942568878540939, + "grad_norm": 0.6684112548828125, + "learning_rate": 0.0002, + "loss": 1.4893, + "step": 5080 + }, + { + "epoch": 3.9503298409002716, + "grad_norm": 0.8349628448486328, + "learning_rate": 0.0002, + "loss": 1.4732, + "step": 5090 + }, + { + "epoch": 3.958090803259604, + "grad_norm": 0.7146425843238831, + "learning_rate": 0.0002, + "loss": 1.5131, + "step": 5100 + }, + { + "epoch": 3.965851765618937, + "grad_norm": 0.6555036902427673, + "learning_rate": 0.0002, + "loss": 1.4149, + "step": 5110 + }, + { + "epoch": 3.9736127279782694, + "grad_norm": 0.7037415504455566, + "learning_rate": 0.0002, + "loss": 1.4274, + "step": 5120 + }, + { + "epoch": 3.981373690337602, + "grad_norm": 0.7235575914382935, + "learning_rate": 0.0002, + "loss": 1.4292, + "step": 5130 + }, + { + "epoch": 3.9891346526969347, + "grad_norm": 0.7092325687408447, + "learning_rate": 0.0002, + "loss": 1.4455, + "step": 5140 + }, + { + "epoch": 3.996895615056267, + "grad_norm": 0.7490319609642029, + "learning_rate": 0.0002, + "loss": 1.4512, + "step": 5150 + }, + { + "epoch": 4.0, + "eval_loss": 1.9131355285644531, + "eval_runtime": 105.5778, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 0.606, + "step": 5154 + }, + { + "epoch": 4.0046565774155995, + "grad_norm": 0.7075854539871216, + "learning_rate": 0.0002, + "loss": 1.2643, + "step": 5160 + }, + { + "epoch": 4.012417539774932, + "grad_norm": 0.9466007351875305, + "learning_rate": 0.0002, + "loss": 1.209, + "step": 5170 + }, + { + "epoch": 4.020178502134264, + "grad_norm": 1.0297044515609741, + "learning_rate": 0.0002, + "loss": 1.2567, + "step": 5180 + }, + { + "epoch": 4.027939464493597, + "grad_norm": 0.7765059471130371, + "learning_rate": 0.0002, + "loss": 1.1796, + "step": 5190 + }, + { + "epoch": 4.03570042685293, + "grad_norm": 0.995760977268219, + "learning_rate": 0.0002, + "loss": 1.2356, + "step": 5200 + }, + { + "epoch": 4.043461389212262, + "grad_norm": 0.8663829565048218, + "learning_rate": 0.0002, + "loss": 1.1792, + "step": 5210 + }, + { + "epoch": 4.051222351571595, + "grad_norm": 1.0660825967788696, + "learning_rate": 0.0002, + "loss": 1.2471, + "step": 5220 + }, + { + "epoch": 4.058983313930927, + "grad_norm": 0.9858174920082092, + "learning_rate": 0.0002, + "loss": 1.1676, + "step": 5230 + }, + { + "epoch": 4.06674427629026, + "grad_norm": 0.8911338448524475, + "learning_rate": 0.0002, + "loss": 1.2448, + "step": 5240 + }, + { + "epoch": 4.074505238649593, + "grad_norm": 1.0848394632339478, + "learning_rate": 0.0002, + "loss": 1.1858, + "step": 5250 + }, + { + "epoch": 4.082266201008925, + "grad_norm": 1.0849905014038086, + "learning_rate": 0.0002, + "loss": 1.1684, + "step": 5260 + }, + { + "epoch": 4.090027163368258, + "grad_norm": 1.0497841835021973, + "learning_rate": 0.0002, + "loss": 1.2007, + "step": 5270 + }, + { + "epoch": 4.0977881257275905, + "grad_norm": 0.8943053483963013, + "learning_rate": 0.0002, + "loss": 1.2552, + "step": 5280 + }, + { + "epoch": 4.1055490880869225, + "grad_norm": 0.8432527184486389, + "learning_rate": 0.0002, + "loss": 1.1923, + "step": 5290 + }, + { + "epoch": 4.113310050446255, + "grad_norm": 0.9690414667129517, + "learning_rate": 0.0002, + "loss": 1.1634, + "step": 5300 + }, + { + "epoch": 4.121071012805588, + "grad_norm": 0.7790773510932922, + "learning_rate": 0.0002, + "loss": 1.3019, + "step": 5310 + }, + { + "epoch": 4.12883197516492, + "grad_norm": 0.9289211630821228, + "learning_rate": 0.0002, + "loss": 1.1806, + "step": 5320 + }, + { + "epoch": 4.136592937524253, + "grad_norm": 1.0785125494003296, + "learning_rate": 0.0002, + "loss": 1.1458, + "step": 5330 + }, + { + "epoch": 4.144353899883585, + "grad_norm": 0.8559591770172119, + "learning_rate": 0.0002, + "loss": 1.2086, + "step": 5340 + }, + { + "epoch": 4.152114862242918, + "grad_norm": 0.9405956268310547, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 5350 + }, + { + "epoch": 4.159875824602251, + "grad_norm": 0.9942827820777893, + "learning_rate": 0.0002, + "loss": 1.1793, + "step": 5360 + }, + { + "epoch": 4.167636786961583, + "grad_norm": 0.9141933917999268, + "learning_rate": 0.0002, + "loss": 1.1659, + "step": 5370 + }, + { + "epoch": 4.175397749320916, + "grad_norm": 0.8206015229225159, + "learning_rate": 0.0002, + "loss": 1.1647, + "step": 5380 + }, + { + "epoch": 4.183158711680249, + "grad_norm": 0.9340888857841492, + "learning_rate": 0.0002, + "loss": 1.2778, + "step": 5390 + }, + { + "epoch": 4.190919674039581, + "grad_norm": 1.2122114896774292, + "learning_rate": 0.0002, + "loss": 1.2459, + "step": 5400 + }, + { + "epoch": 4.1986806363989135, + "grad_norm": 1.0661298036575317, + "learning_rate": 0.0002, + "loss": 1.2371, + "step": 5410 + }, + { + "epoch": 4.206441598758246, + "grad_norm": 0.9372861385345459, + "learning_rate": 0.0002, + "loss": 1.1978, + "step": 5420 + }, + { + "epoch": 4.214202561117578, + "grad_norm": 0.894012987613678, + "learning_rate": 0.0002, + "loss": 1.2653, + "step": 5430 + }, + { + "epoch": 4.221963523476911, + "grad_norm": 1.0647753477096558, + "learning_rate": 0.0002, + "loss": 1.387, + "step": 5440 + }, + { + "epoch": 4.229724485836243, + "grad_norm": 0.989179790019989, + "learning_rate": 0.0002, + "loss": 1.2231, + "step": 5450 + }, + { + "epoch": 4.237485448195576, + "grad_norm": 1.1601181030273438, + "learning_rate": 0.0002, + "loss": 1.2715, + "step": 5460 + }, + { + "epoch": 4.245246410554909, + "grad_norm": 0.9395585656166077, + "learning_rate": 0.0002, + "loss": 1.2406, + "step": 5470 + }, + { + "epoch": 4.253007372914241, + "grad_norm": 0.9527766108512878, + "learning_rate": 0.0002, + "loss": 1.2779, + "step": 5480 + }, + { + "epoch": 4.260768335273574, + "grad_norm": 1.0319520235061646, + "learning_rate": 0.0002, + "loss": 1.267, + "step": 5490 + }, + { + "epoch": 4.268529297632907, + "grad_norm": 0.8659824728965759, + "learning_rate": 0.0002, + "loss": 1.2633, + "step": 5500 + }, + { + "epoch": 4.276290259992239, + "grad_norm": 1.099211573600769, + "learning_rate": 0.0002, + "loss": 1.1475, + "step": 5510 + }, + { + "epoch": 4.284051222351572, + "grad_norm": 0.9363361597061157, + "learning_rate": 0.0002, + "loss": 1.2508, + "step": 5520 + }, + { + "epoch": 4.2918121847109045, + "grad_norm": 0.8437647223472595, + "learning_rate": 0.0002, + "loss": 1.189, + "step": 5530 + }, + { + "epoch": 4.2995731470702365, + "grad_norm": 0.9181258678436279, + "learning_rate": 0.0002, + "loss": 1.2212, + "step": 5540 + }, + { + "epoch": 4.307334109429569, + "grad_norm": 0.9059357643127441, + "learning_rate": 0.0002, + "loss": 1.2092, + "step": 5550 + }, + { + "epoch": 4.315095071788901, + "grad_norm": 0.9337241649627686, + "learning_rate": 0.0002, + "loss": 1.2189, + "step": 5560 + }, + { + "epoch": 4.322856034148234, + "grad_norm": 0.9428889155387878, + "learning_rate": 0.0002, + "loss": 1.2462, + "step": 5570 + }, + { + "epoch": 4.330616996507567, + "grad_norm": 1.003589153289795, + "learning_rate": 0.0002, + "loss": 1.2675, + "step": 5580 + }, + { + "epoch": 4.338377958866899, + "grad_norm": 1.1249268054962158, + "learning_rate": 0.0002, + "loss": 1.2703, + "step": 5590 + }, + { + "epoch": 4.346138921226232, + "grad_norm": 0.8623469471931458, + "learning_rate": 0.0002, + "loss": 1.2501, + "step": 5600 + }, + { + "epoch": 4.353899883585565, + "grad_norm": 1.1389174461364746, + "learning_rate": 0.0002, + "loss": 1.2404, + "step": 5610 + }, + { + "epoch": 4.361660845944897, + "grad_norm": 1.0136264562606812, + "learning_rate": 0.0002, + "loss": 1.2245, + "step": 5620 + }, + { + "epoch": 4.36942180830423, + "grad_norm": 0.9567070603370667, + "learning_rate": 0.0002, + "loss": 1.3473, + "step": 5630 + }, + { + "epoch": 4.377182770663563, + "grad_norm": 1.0592148303985596, + "learning_rate": 0.0002, + "loss": 1.2988, + "step": 5640 + }, + { + "epoch": 4.384943733022895, + "grad_norm": 1.0110485553741455, + "learning_rate": 0.0002, + "loss": 1.212, + "step": 5650 + }, + { + "epoch": 4.3927046953822275, + "grad_norm": 0.9914907217025757, + "learning_rate": 0.0002, + "loss": 1.2086, + "step": 5660 + }, + { + "epoch": 4.4004656577415595, + "grad_norm": 0.9447247982025146, + "learning_rate": 0.0002, + "loss": 1.2363, + "step": 5670 + }, + { + "epoch": 4.408226620100892, + "grad_norm": 0.9644378423690796, + "learning_rate": 0.0002, + "loss": 1.2617, + "step": 5680 + }, + { + "epoch": 4.415987582460225, + "grad_norm": 0.920676589012146, + "learning_rate": 0.0002, + "loss": 1.2773, + "step": 5690 + }, + { + "epoch": 4.423748544819557, + "grad_norm": 1.060570478439331, + "learning_rate": 0.0002, + "loss": 1.2792, + "step": 5700 + }, + { + "epoch": 4.43150950717889, + "grad_norm": 0.8857738971710205, + "learning_rate": 0.0002, + "loss": 1.2374, + "step": 5710 + }, + { + "epoch": 4.439270469538223, + "grad_norm": 1.0536398887634277, + "learning_rate": 0.0002, + "loss": 1.2588, + "step": 5720 + }, + { + "epoch": 4.447031431897555, + "grad_norm": 0.990847110748291, + "learning_rate": 0.0002, + "loss": 1.2051, + "step": 5730 + }, + { + "epoch": 4.454792394256888, + "grad_norm": 0.9692499041557312, + "learning_rate": 0.0002, + "loss": 1.2469, + "step": 5740 + }, + { + "epoch": 4.462553356616221, + "grad_norm": 1.0376402139663696, + "learning_rate": 0.0002, + "loss": 1.2269, + "step": 5750 + }, + { + "epoch": 4.470314318975553, + "grad_norm": 1.3863259553909302, + "learning_rate": 0.0002, + "loss": 1.1701, + "step": 5760 + }, + { + "epoch": 4.478075281334886, + "grad_norm": 0.978379487991333, + "learning_rate": 0.0002, + "loss": 1.2591, + "step": 5770 + }, + { + "epoch": 4.485836243694218, + "grad_norm": 1.0973085165023804, + "learning_rate": 0.0002, + "loss": 1.2729, + "step": 5780 + }, + { + "epoch": 4.4935972060535505, + "grad_norm": 1.057006597518921, + "learning_rate": 0.0002, + "loss": 1.2404, + "step": 5790 + }, + { + "epoch": 4.501358168412883, + "grad_norm": 0.9247729182243347, + "learning_rate": 0.0002, + "loss": 1.2476, + "step": 5800 + }, + { + "epoch": 4.509119130772215, + "grad_norm": 1.0447787046432495, + "learning_rate": 0.0002, + "loss": 1.2369, + "step": 5810 + }, + { + "epoch": 4.516880093131548, + "grad_norm": 1.1930429935455322, + "learning_rate": 0.0002, + "loss": 1.211, + "step": 5820 + }, + { + "epoch": 4.524641055490881, + "grad_norm": 0.9867590069770813, + "learning_rate": 0.0002, + "loss": 1.2596, + "step": 5830 + }, + { + "epoch": 4.532402017850213, + "grad_norm": 0.9591100215911865, + "learning_rate": 0.0002, + "loss": 1.2766, + "step": 5840 + }, + { + "epoch": 4.540162980209546, + "grad_norm": 0.9950753450393677, + "learning_rate": 0.0002, + "loss": 1.2154, + "step": 5850 + }, + { + "epoch": 4.547923942568879, + "grad_norm": 1.0087506771087646, + "learning_rate": 0.0002, + "loss": 1.2149, + "step": 5860 + }, + { + "epoch": 4.555684904928211, + "grad_norm": 1.0934417247772217, + "learning_rate": 0.0002, + "loss": 1.3165, + "step": 5870 + }, + { + "epoch": 4.563445867287544, + "grad_norm": 1.107987403869629, + "learning_rate": 0.0002, + "loss": 1.3059, + "step": 5880 + }, + { + "epoch": 4.571206829646876, + "grad_norm": 0.9147276878356934, + "learning_rate": 0.0002, + "loss": 1.2184, + "step": 5890 + }, + { + "epoch": 4.578967792006209, + "grad_norm": 1.036780595779419, + "learning_rate": 0.0002, + "loss": 1.24, + "step": 5900 + }, + { + "epoch": 4.5867287543655415, + "grad_norm": 0.9284719824790955, + "learning_rate": 0.0002, + "loss": 1.2209, + "step": 5910 + }, + { + "epoch": 4.5944897167248735, + "grad_norm": 0.9141898155212402, + "learning_rate": 0.0002, + "loss": 1.3693, + "step": 5920 + }, + { + "epoch": 4.602250679084206, + "grad_norm": 1.0447357892990112, + "learning_rate": 0.0002, + "loss": 1.2319, + "step": 5930 + }, + { + "epoch": 4.610011641443539, + "grad_norm": 0.9309114217758179, + "learning_rate": 0.0002, + "loss": 1.2667, + "step": 5940 + }, + { + "epoch": 4.617772603802871, + "grad_norm": 1.2986129522323608, + "learning_rate": 0.0002, + "loss": 1.2827, + "step": 5950 + }, + { + "epoch": 4.625533566162204, + "grad_norm": 0.9221704602241516, + "learning_rate": 0.0002, + "loss": 1.312, + "step": 5960 + }, + { + "epoch": 4.633294528521537, + "grad_norm": 0.9228187799453735, + "learning_rate": 0.0002, + "loss": 1.2769, + "step": 5970 + }, + { + "epoch": 4.641055490880869, + "grad_norm": 0.9483116269111633, + "learning_rate": 0.0002, + "loss": 1.2953, + "step": 5980 + }, + { + "epoch": 4.648816453240202, + "grad_norm": 1.0218974351882935, + "learning_rate": 0.0002, + "loss": 1.3437, + "step": 5990 + }, + { + "epoch": 4.656577415599534, + "grad_norm": 0.9764600396156311, + "learning_rate": 0.0002, + "loss": 1.3085, + "step": 6000 + }, + { + "epoch": 4.664338377958867, + "grad_norm": 0.9115710258483887, + "learning_rate": 0.0002, + "loss": 1.197, + "step": 6010 + }, + { + "epoch": 4.6720993403182, + "grad_norm": 0.9245651364326477, + "learning_rate": 0.0002, + "loss": 1.1917, + "step": 6020 + }, + { + "epoch": 4.6798603026775325, + "grad_norm": 0.9686311483383179, + "learning_rate": 0.0002, + "loss": 1.2969, + "step": 6030 + }, + { + "epoch": 4.6876212650368645, + "grad_norm": 1.1807392835617065, + "learning_rate": 0.0002, + "loss": 1.2702, + "step": 6040 + }, + { + "epoch": 4.695382227396197, + "grad_norm": 1.0358641147613525, + "learning_rate": 0.0002, + "loss": 1.328, + "step": 6050 + }, + { + "epoch": 4.703143189755529, + "grad_norm": 0.987332284450531, + "learning_rate": 0.0002, + "loss": 1.3281, + "step": 6060 + }, + { + "epoch": 4.710904152114862, + "grad_norm": 1.0526494979858398, + "learning_rate": 0.0002, + "loss": 1.2514, + "step": 6070 + }, + { + "epoch": 4.718665114474195, + "grad_norm": 1.0276758670806885, + "learning_rate": 0.0002, + "loss": 1.2246, + "step": 6080 + }, + { + "epoch": 4.726426076833527, + "grad_norm": 0.9904406666755676, + "learning_rate": 0.0002, + "loss": 1.3367, + "step": 6090 + }, + { + "epoch": 4.73418703919286, + "grad_norm": 1.0084882974624634, + "learning_rate": 0.0002, + "loss": 1.2797, + "step": 6100 + }, + { + "epoch": 4.741948001552192, + "grad_norm": 0.8646450638771057, + "learning_rate": 0.0002, + "loss": 1.2656, + "step": 6110 + }, + { + "epoch": 4.749708963911525, + "grad_norm": 0.9233377575874329, + "learning_rate": 0.0002, + "loss": 1.3063, + "step": 6120 + }, + { + "epoch": 4.757469926270858, + "grad_norm": 0.9675140976905823, + "learning_rate": 0.0002, + "loss": 1.2642, + "step": 6130 + }, + { + "epoch": 4.765230888630191, + "grad_norm": 0.9639796018600464, + "learning_rate": 0.0002, + "loss": 1.3367, + "step": 6140 + }, + { + "epoch": 4.772991850989523, + "grad_norm": 0.925199568271637, + "learning_rate": 0.0002, + "loss": 1.276, + "step": 6150 + }, + { + "epoch": 4.7807528133488555, + "grad_norm": 1.050901174545288, + "learning_rate": 0.0002, + "loss": 1.2441, + "step": 6160 + }, + { + "epoch": 4.7885137757081875, + "grad_norm": 0.8920623660087585, + "learning_rate": 0.0002, + "loss": 1.301, + "step": 6170 + }, + { + "epoch": 4.79627473806752, + "grad_norm": 0.8964757919311523, + "learning_rate": 0.0002, + "loss": 1.263, + "step": 6180 + }, + { + "epoch": 4.804035700426853, + "grad_norm": 1.0839070081710815, + "learning_rate": 0.0002, + "loss": 1.2787, + "step": 6190 + }, + { + "epoch": 4.811796662786185, + "grad_norm": 0.8809942007064819, + "learning_rate": 0.0002, + "loss": 1.2664, + "step": 6200 + }, + { + "epoch": 4.819557625145518, + "grad_norm": 1.0216195583343506, + "learning_rate": 0.0002, + "loss": 1.321, + "step": 6210 + }, + { + "epoch": 4.827318587504851, + "grad_norm": 0.892005980014801, + "learning_rate": 0.0002, + "loss": 1.3033, + "step": 6220 + }, + { + "epoch": 4.835079549864183, + "grad_norm": 0.9957166910171509, + "learning_rate": 0.0002, + "loss": 1.2602, + "step": 6230 + }, + { + "epoch": 4.842840512223516, + "grad_norm": 0.9720533490180969, + "learning_rate": 0.0002, + "loss": 1.3562, + "step": 6240 + }, + { + "epoch": 4.850601474582849, + "grad_norm": 0.9336182475090027, + "learning_rate": 0.0002, + "loss": 1.2651, + "step": 6250 + }, + { + "epoch": 4.858362436942181, + "grad_norm": 1.2611457109451294, + "learning_rate": 0.0002, + "loss": 1.3136, + "step": 6260 + }, + { + "epoch": 4.866123399301514, + "grad_norm": 0.8927203416824341, + "learning_rate": 0.0002, + "loss": 1.2234, + "step": 6270 + }, + { + "epoch": 4.873884361660846, + "grad_norm": 0.9706710577011108, + "learning_rate": 0.0002, + "loss": 1.3463, + "step": 6280 + }, + { + "epoch": 4.8816453240201785, + "grad_norm": 1.1461690664291382, + "learning_rate": 0.0002, + "loss": 1.3209, + "step": 6290 + }, + { + "epoch": 4.889406286379511, + "grad_norm": 0.9930381178855896, + "learning_rate": 0.0002, + "loss": 1.2566, + "step": 6300 + }, + { + "epoch": 4.897167248738843, + "grad_norm": 0.91451096534729, + "learning_rate": 0.0002, + "loss": 1.2568, + "step": 6310 + }, + { + "epoch": 4.904928211098176, + "grad_norm": 1.0319571495056152, + "learning_rate": 0.0002, + "loss": 1.2836, + "step": 6320 + }, + { + "epoch": 4.912689173457509, + "grad_norm": 0.990140438079834, + "learning_rate": 0.0002, + "loss": 1.2908, + "step": 6330 + }, + { + "epoch": 4.920450135816841, + "grad_norm": 1.2466117143630981, + "learning_rate": 0.0002, + "loss": 1.3299, + "step": 6340 + }, + { + "epoch": 4.928211098176174, + "grad_norm": 1.0316979885101318, + "learning_rate": 0.0002, + "loss": 1.2659, + "step": 6350 + }, + { + "epoch": 4.935972060535507, + "grad_norm": 1.0643759965896606, + "learning_rate": 0.0002, + "loss": 1.3292, + "step": 6360 + }, + { + "epoch": 4.943733022894839, + "grad_norm": 0.9703279733657837, + "learning_rate": 0.0002, + "loss": 1.2559, + "step": 6370 + }, + { + "epoch": 4.951493985254172, + "grad_norm": 0.9767927527427673, + "learning_rate": 0.0002, + "loss": 1.2155, + "step": 6380 + }, + { + "epoch": 4.959254947613504, + "grad_norm": 0.960854172706604, + "learning_rate": 0.0002, + "loss": 1.2437, + "step": 6390 + }, + { + "epoch": 4.967015909972837, + "grad_norm": 0.9922910332679749, + "learning_rate": 0.0002, + "loss": 1.3314, + "step": 6400 + }, + { + "epoch": 4.9747768723321695, + "grad_norm": 0.956470787525177, + "learning_rate": 0.0002, + "loss": 1.3018, + "step": 6410 + }, + { + "epoch": 4.9825378346915015, + "grad_norm": 0.9637242555618286, + "learning_rate": 0.0002, + "loss": 1.2794, + "step": 6420 + }, + { + "epoch": 4.990298797050834, + "grad_norm": 1.0855202674865723, + "learning_rate": 0.0002, + "loss": 1.3236, + "step": 6430 + }, + { + "epoch": 4.998059759410167, + "grad_norm": 0.9655316472053528, + "learning_rate": 0.0002, + "loss": 1.3015, + "step": 6440 + }, + { + "epoch": 4.9996119518820334, + "eval_loss": 2.0410802364349365, + "eval_runtime": 113.04, + "eval_samples_per_second": 4.485, + "eval_steps_per_second": 0.566, + "step": 6442 + }, + { + "epoch": 5.005820721769499, + "grad_norm": 1.1676199436187744, + "learning_rate": 0.0002, + "loss": 1.0846, + "step": 6450 + }, + { + "epoch": 5.013581684128832, + "grad_norm": 1.4317965507507324, + "learning_rate": 0.0002, + "loss": 1.041, + "step": 6460 + }, + { + "epoch": 5.021342646488165, + "grad_norm": 1.460443377494812, + "learning_rate": 0.0002, + "loss": 0.9546, + "step": 6470 + }, + { + "epoch": 5.029103608847497, + "grad_norm": 1.2299214601516724, + "learning_rate": 0.0002, + "loss": 1.0014, + "step": 6480 + }, + { + "epoch": 5.03686457120683, + "grad_norm": 1.3125724792480469, + "learning_rate": 0.0002, + "loss": 1.0397, + "step": 6490 + }, + { + "epoch": 5.044625533566162, + "grad_norm": 1.1252319812774658, + "learning_rate": 0.0002, + "loss": 1.0134, + "step": 6500 + }, + { + "epoch": 5.052386495925495, + "grad_norm": 0.9970866441726685, + "learning_rate": 0.0002, + "loss": 0.976, + "step": 6510 + }, + { + "epoch": 5.060147458284828, + "grad_norm": 1.229069709777832, + "learning_rate": 0.0002, + "loss": 0.9731, + "step": 6520 + }, + { + "epoch": 5.06790842064416, + "grad_norm": 1.2430938482284546, + "learning_rate": 0.0002, + "loss": 1.0498, + "step": 6530 + }, + { + "epoch": 5.0756693830034925, + "grad_norm": 1.0522737503051758, + "learning_rate": 0.0002, + "loss": 1.0236, + "step": 6540 + }, + { + "epoch": 5.083430345362825, + "grad_norm": 1.108890175819397, + "learning_rate": 0.0002, + "loss": 1.0221, + "step": 6550 + }, + { + "epoch": 5.091191307722157, + "grad_norm": 1.156912922859192, + "learning_rate": 0.0002, + "loss": 1.0177, + "step": 6560 + }, + { + "epoch": 5.09895227008149, + "grad_norm": 1.405895709991455, + "learning_rate": 0.0002, + "loss": 1.0415, + "step": 6570 + }, + { + "epoch": 5.106713232440823, + "grad_norm": 1.2005155086517334, + "learning_rate": 0.0002, + "loss": 0.9811, + "step": 6580 + }, + { + "epoch": 5.114474194800155, + "grad_norm": 1.181443452835083, + "learning_rate": 0.0002, + "loss": 0.9862, + "step": 6590 + }, + { + "epoch": 5.122235157159488, + "grad_norm": 2.3444771766662598, + "learning_rate": 0.0002, + "loss": 1.0291, + "step": 6600 + }, + { + "epoch": 5.12999611951882, + "grad_norm": 1.216988444328308, + "learning_rate": 0.0002, + "loss": 1.0455, + "step": 6610 + }, + { + "epoch": 5.137757081878153, + "grad_norm": 1.369553565979004, + "learning_rate": 0.0002, + "loss": 1.0549, + "step": 6620 + }, + { + "epoch": 5.145518044237486, + "grad_norm": 1.177964687347412, + "learning_rate": 0.0002, + "loss": 1.0056, + "step": 6630 + }, + { + "epoch": 5.153279006596818, + "grad_norm": 1.1397041082382202, + "learning_rate": 0.0002, + "loss": 1.1025, + "step": 6640 + }, + { + "epoch": 5.161039968956151, + "grad_norm": 1.3976861238479614, + "learning_rate": 0.0002, + "loss": 1.0437, + "step": 6650 + }, + { + "epoch": 5.1688009313154835, + "grad_norm": 1.4824495315551758, + "learning_rate": 0.0002, + "loss": 1.0454, + "step": 6660 + }, + { + "epoch": 5.1765618936748155, + "grad_norm": 1.2653018236160278, + "learning_rate": 0.0002, + "loss": 1.0356, + "step": 6670 + }, + { + "epoch": 5.184322856034148, + "grad_norm": 1.3106069564819336, + "learning_rate": 0.0002, + "loss": 0.9971, + "step": 6680 + }, + { + "epoch": 5.192083818393481, + "grad_norm": 1.3140279054641724, + "learning_rate": 0.0002, + "loss": 1.0561, + "step": 6690 + }, + { + "epoch": 5.199844780752813, + "grad_norm": 1.3900256156921387, + "learning_rate": 0.0002, + "loss": 1.0618, + "step": 6700 + }, + { + "epoch": 5.207605743112146, + "grad_norm": 1.3191124200820923, + "learning_rate": 0.0002, + "loss": 1.0285, + "step": 6710 + }, + { + "epoch": 5.215366705471478, + "grad_norm": 1.176107406616211, + "learning_rate": 0.0002, + "loss": 0.9921, + "step": 6720 + }, + { + "epoch": 5.223127667830811, + "grad_norm": 1.2364883422851562, + "learning_rate": 0.0002, + "loss": 1.064, + "step": 6730 + }, + { + "epoch": 5.230888630190144, + "grad_norm": 1.343022108078003, + "learning_rate": 0.0002, + "loss": 0.9599, + "step": 6740 + }, + { + "epoch": 5.238649592549476, + "grad_norm": 1.2826898097991943, + "learning_rate": 0.0002, + "loss": 1.0342, + "step": 6750 + }, + { + "epoch": 5.246410554908809, + "grad_norm": 1.500257134437561, + "learning_rate": 0.0002, + "loss": 1.0703, + "step": 6760 + }, + { + "epoch": 5.254171517268142, + "grad_norm": 1.2605743408203125, + "learning_rate": 0.0002, + "loss": 1.0114, + "step": 6770 + }, + { + "epoch": 5.261932479627474, + "grad_norm": 1.2355525493621826, + "learning_rate": 0.0002, + "loss": 1.0825, + "step": 6780 + }, + { + "epoch": 5.2696934419868064, + "grad_norm": 1.2845789194107056, + "learning_rate": 0.0002, + "loss": 1.0436, + "step": 6790 + }, + { + "epoch": 5.277454404346139, + "grad_norm": 1.3696625232696533, + "learning_rate": 0.0002, + "loss": 0.989, + "step": 6800 + }, + { + "epoch": 5.285215366705471, + "grad_norm": 1.4051260948181152, + "learning_rate": 0.0002, + "loss": 1.0991, + "step": 6810 + }, + { + "epoch": 5.292976329064804, + "grad_norm": 1.266725778579712, + "learning_rate": 0.0002, + "loss": 1.0987, + "step": 6820 + }, + { + "epoch": 5.300737291424136, + "grad_norm": 1.3475236892700195, + "learning_rate": 0.0002, + "loss": 1.0489, + "step": 6830 + }, + { + "epoch": 5.308498253783469, + "grad_norm": 1.54409921169281, + "learning_rate": 0.0002, + "loss": 1.0264, + "step": 6840 + }, + { + "epoch": 5.316259216142802, + "grad_norm": 1.2391985654830933, + "learning_rate": 0.0002, + "loss": 1.033, + "step": 6850 + }, + { + "epoch": 5.324020178502134, + "grad_norm": 1.2435699701309204, + "learning_rate": 0.0002, + "loss": 1.1058, + "step": 6860 + }, + { + "epoch": 5.331781140861467, + "grad_norm": 1.8803037405014038, + "learning_rate": 0.0002, + "loss": 1.0179, + "step": 6870 + }, + { + "epoch": 5.3395421032208, + "grad_norm": 1.4195542335510254, + "learning_rate": 0.0002, + "loss": 0.997, + "step": 6880 + }, + { + "epoch": 5.347303065580132, + "grad_norm": 1.1853394508361816, + "learning_rate": 0.0002, + "loss": 1.0273, + "step": 6890 + }, + { + "epoch": 5.355064027939465, + "grad_norm": 1.4016530513763428, + "learning_rate": 0.0002, + "loss": 1.0668, + "step": 6900 + }, + { + "epoch": 5.3628249902987974, + "grad_norm": 1.294339895248413, + "learning_rate": 0.0002, + "loss": 1.1099, + "step": 6910 + }, + { + "epoch": 5.370585952658129, + "grad_norm": 1.2952708005905151, + "learning_rate": 0.0002, + "loss": 1.0724, + "step": 6920 + }, + { + "epoch": 5.378346915017462, + "grad_norm": 1.1361510753631592, + "learning_rate": 0.0002, + "loss": 1.0098, + "step": 6930 + }, + { + "epoch": 5.386107877376794, + "grad_norm": 1.125805377960205, + "learning_rate": 0.0002, + "loss": 1.0796, + "step": 6940 + }, + { + "epoch": 5.393868839736127, + "grad_norm": 1.1453300714492798, + "learning_rate": 0.0002, + "loss": 1.122, + "step": 6950 + }, + { + "epoch": 5.40162980209546, + "grad_norm": 1.4542768001556396, + "learning_rate": 0.0002, + "loss": 1.0977, + "step": 6960 + }, + { + "epoch": 5.409390764454792, + "grad_norm": 1.2360988855361938, + "learning_rate": 0.0002, + "loss": 1.0825, + "step": 6970 + }, + { + "epoch": 5.417151726814125, + "grad_norm": 1.2182754278182983, + "learning_rate": 0.0002, + "loss": 1.0631, + "step": 6980 + }, + { + "epoch": 5.424912689173458, + "grad_norm": 1.2018693685531616, + "learning_rate": 0.0002, + "loss": 1.0471, + "step": 6990 + }, + { + "epoch": 5.43267365153279, + "grad_norm": 1.346124291419983, + "learning_rate": 0.0002, + "loss": 1.108, + "step": 7000 + }, + { + "epoch": 5.440434613892123, + "grad_norm": 1.2534189224243164, + "learning_rate": 0.0002, + "loss": 1.0534, + "step": 7010 + }, + { + "epoch": 5.448195576251456, + "grad_norm": 1.2033339738845825, + "learning_rate": 0.0002, + "loss": 1.0696, + "step": 7020 + }, + { + "epoch": 5.4559565386107876, + "grad_norm": 1.2788134813308716, + "learning_rate": 0.0002, + "loss": 1.0714, + "step": 7030 + }, + { + "epoch": 5.46371750097012, + "grad_norm": 1.2751542329788208, + "learning_rate": 0.0002, + "loss": 1.1274, + "step": 7040 + }, + { + "epoch": 5.471478463329452, + "grad_norm": 1.3237019777297974, + "learning_rate": 0.0002, + "loss": 1.0767, + "step": 7050 + }, + { + "epoch": 5.479239425688785, + "grad_norm": 1.4932852983474731, + "learning_rate": 0.0002, + "loss": 1.1081, + "step": 7060 + }, + { + "epoch": 5.487000388048118, + "grad_norm": 1.4003876447677612, + "learning_rate": 0.0002, + "loss": 1.0197, + "step": 7070 + }, + { + "epoch": 5.49476135040745, + "grad_norm": 1.404799461364746, + "learning_rate": 0.0002, + "loss": 1.0662, + "step": 7080 + }, + { + "epoch": 5.502522312766783, + "grad_norm": 1.4486982822418213, + "learning_rate": 0.0002, + "loss": 1.0354, + "step": 7090 + }, + { + "epoch": 5.510283275126116, + "grad_norm": 1.1713480949401855, + "learning_rate": 0.0002, + "loss": 1.0645, + "step": 7100 + }, + { + "epoch": 5.518044237485448, + "grad_norm": 1.4062601327896118, + "learning_rate": 0.0002, + "loss": 1.006, + "step": 7110 + }, + { + "epoch": 5.525805199844781, + "grad_norm": 1.211629867553711, + "learning_rate": 0.0002, + "loss": 1.0459, + "step": 7120 + }, + { + "epoch": 5.533566162204114, + "grad_norm": 1.2523176670074463, + "learning_rate": 0.0002, + "loss": 1.102, + "step": 7130 + }, + { + "epoch": 5.541327124563446, + "grad_norm": 1.4467198848724365, + "learning_rate": 0.0002, + "loss": 1.1132, + "step": 7140 + }, + { + "epoch": 5.5490880869227786, + "grad_norm": 1.5961614847183228, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 7150 + }, + { + "epoch": 5.5568490492821105, + "grad_norm": 1.320656418800354, + "learning_rate": 0.0002, + "loss": 1.0859, + "step": 7160 + }, + { + "epoch": 5.564610011641443, + "grad_norm": 1.2423332929611206, + "learning_rate": 0.0002, + "loss": 1.109, + "step": 7170 + }, + { + "epoch": 5.572370974000776, + "grad_norm": 1.2919669151306152, + "learning_rate": 0.0002, + "loss": 1.0046, + "step": 7180 + }, + { + "epoch": 5.580131936360108, + "grad_norm": 1.1678385734558105, + "learning_rate": 0.0002, + "loss": 1.046, + "step": 7190 + }, + { + "epoch": 5.587892898719441, + "grad_norm": 1.4250764846801758, + "learning_rate": 0.0002, + "loss": 1.1011, + "step": 7200 + }, + { + "epoch": 5.595653861078774, + "grad_norm": 1.5308716297149658, + "learning_rate": 0.0002, + "loss": 1.1254, + "step": 7210 + }, + { + "epoch": 5.603414823438106, + "grad_norm": 1.2678815126419067, + "learning_rate": 0.0002, + "loss": 1.121, + "step": 7220 + }, + { + "epoch": 5.611175785797439, + "grad_norm": 1.127856969833374, + "learning_rate": 0.0002, + "loss": 1.0846, + "step": 7230 + }, + { + "epoch": 5.618936748156772, + "grad_norm": 1.3832560777664185, + "learning_rate": 0.0002, + "loss": 1.0647, + "step": 7240 + }, + { + "epoch": 5.626697710516104, + "grad_norm": 1.3226919174194336, + "learning_rate": 0.0002, + "loss": 1.0658, + "step": 7250 + }, + { + "epoch": 5.634458672875437, + "grad_norm": 1.3418006896972656, + "learning_rate": 0.0002, + "loss": 1.1175, + "step": 7260 + }, + { + "epoch": 5.642219635234769, + "grad_norm": 1.2625300884246826, + "learning_rate": 0.0002, + "loss": 1.0956, + "step": 7270 + }, + { + "epoch": 5.6499805975941015, + "grad_norm": 1.1579464673995972, + "learning_rate": 0.0002, + "loss": 1.067, + "step": 7280 + }, + { + "epoch": 5.657741559953434, + "grad_norm": 1.4998650550842285, + "learning_rate": 0.0002, + "loss": 1.0447, + "step": 7290 + }, + { + "epoch": 5.665502522312766, + "grad_norm": 1.2670758962631226, + "learning_rate": 0.0002, + "loss": 1.1256, + "step": 7300 + }, + { + "epoch": 5.673263484672099, + "grad_norm": 1.2959760427474976, + "learning_rate": 0.0002, + "loss": 1.1267, + "step": 7310 + }, + { + "epoch": 5.681024447031432, + "grad_norm": 1.2460671663284302, + "learning_rate": 0.0002, + "loss": 1.1387, + "step": 7320 + }, + { + "epoch": 5.688785409390764, + "grad_norm": 1.1313989162445068, + "learning_rate": 0.0002, + "loss": 1.0756, + "step": 7330 + }, + { + "epoch": 5.696546371750097, + "grad_norm": 1.282527208328247, + "learning_rate": 0.0002, + "loss": 1.0618, + "step": 7340 + }, + { + "epoch": 5.70430733410943, + "grad_norm": 1.3380206823349, + "learning_rate": 0.0002, + "loss": 1.1315, + "step": 7350 + }, + { + "epoch": 5.712068296468762, + "grad_norm": 1.1648279428482056, + "learning_rate": 0.0002, + "loss": 1.0949, + "step": 7360 + }, + { + "epoch": 5.719829258828095, + "grad_norm": 1.3059816360473633, + "learning_rate": 0.0002, + "loss": 1.1705, + "step": 7370 + }, + { + "epoch": 5.727590221187427, + "grad_norm": 1.1905046701431274, + "learning_rate": 0.0002, + "loss": 1.1496, + "step": 7380 + }, + { + "epoch": 5.73535118354676, + "grad_norm": 1.4089630842208862, + "learning_rate": 0.0002, + "loss": 1.1356, + "step": 7390 + }, + { + "epoch": 5.7431121459060925, + "grad_norm": 1.256721019744873, + "learning_rate": 0.0002, + "loss": 1.1349, + "step": 7400 + }, + { + "epoch": 5.7508731082654245, + "grad_norm": 1.1915162801742554, + "learning_rate": 0.0002, + "loss": 1.0682, + "step": 7410 + }, + { + "epoch": 5.758634070624757, + "grad_norm": 1.1935480833053589, + "learning_rate": 0.0002, + "loss": 1.1257, + "step": 7420 + }, + { + "epoch": 5.76639503298409, + "grad_norm": 1.1761008501052856, + "learning_rate": 0.0002, + "loss": 1.1348, + "step": 7430 + }, + { + "epoch": 5.774155995343422, + "grad_norm": 1.2540549039840698, + "learning_rate": 0.0002, + "loss": 1.0837, + "step": 7440 + }, + { + "epoch": 5.781916957702755, + "grad_norm": 1.5295120477676392, + "learning_rate": 0.0002, + "loss": 1.1527, + "step": 7450 + }, + { + "epoch": 5.789677920062088, + "grad_norm": 1.1081160306930542, + "learning_rate": 0.0002, + "loss": 1.1146, + "step": 7460 + }, + { + "epoch": 5.79743888242142, + "grad_norm": 1.4381253719329834, + "learning_rate": 0.0002, + "loss": 1.1304, + "step": 7470 + }, + { + "epoch": 5.805199844780753, + "grad_norm": 1.3079341650009155, + "learning_rate": 0.0002, + "loss": 1.0684, + "step": 7480 + }, + { + "epoch": 5.812960807140085, + "grad_norm": 1.1372792720794678, + "learning_rate": 0.0002, + "loss": 1.0544, + "step": 7490 + }, + { + "epoch": 5.820721769499418, + "grad_norm": 1.3221744298934937, + "learning_rate": 0.0002, + "loss": 1.1622, + "step": 7500 + }, + { + "epoch": 5.828482731858751, + "grad_norm": 1.3436939716339111, + "learning_rate": 0.0002, + "loss": 1.1515, + "step": 7510 + }, + { + "epoch": 5.8362436942180835, + "grad_norm": 1.3916879892349243, + "learning_rate": 0.0002, + "loss": 1.1154, + "step": 7520 + }, + { + "epoch": 5.8440046565774155, + "grad_norm": 1.2463704347610474, + "learning_rate": 0.0002, + "loss": 1.0816, + "step": 7530 + }, + { + "epoch": 5.851765618936748, + "grad_norm": 1.097051739692688, + "learning_rate": 0.0002, + "loss": 1.0745, + "step": 7540 + }, + { + "epoch": 5.85952658129608, + "grad_norm": 1.1554739475250244, + "learning_rate": 0.0002, + "loss": 1.1454, + "step": 7550 + }, + { + "epoch": 5.867287543655413, + "grad_norm": 1.2384694814682007, + "learning_rate": 0.0002, + "loss": 1.0953, + "step": 7560 + }, + { + "epoch": 5.875048506014746, + "grad_norm": 1.142815351486206, + "learning_rate": 0.0002, + "loss": 1.1734, + "step": 7570 + }, + { + "epoch": 5.882809468374078, + "grad_norm": 1.3637062311172485, + "learning_rate": 0.0002, + "loss": 1.162, + "step": 7580 + }, + { + "epoch": 5.890570430733411, + "grad_norm": 1.2449073791503906, + "learning_rate": 0.0002, + "loss": 1.0781, + "step": 7590 + }, + { + "epoch": 5.898331393092743, + "grad_norm": 1.358058214187622, + "learning_rate": 0.0002, + "loss": 1.1191, + "step": 7600 + }, + { + "epoch": 5.906092355452076, + "grad_norm": 1.264655351638794, + "learning_rate": 0.0002, + "loss": 1.0779, + "step": 7610 + }, + { + "epoch": 5.913853317811409, + "grad_norm": 1.3186019659042358, + "learning_rate": 0.0002, + "loss": 1.1538, + "step": 7620 + }, + { + "epoch": 5.921614280170742, + "grad_norm": 1.4111460447311401, + "learning_rate": 0.0002, + "loss": 1.1076, + "step": 7630 + }, + { + "epoch": 5.929375242530074, + "grad_norm": 1.1078972816467285, + "learning_rate": 0.0002, + "loss": 1.1765, + "step": 7640 + }, + { + "epoch": 5.9371362048894065, + "grad_norm": 1.2742213010787964, + "learning_rate": 0.0002, + "loss": 1.1305, + "step": 7650 + }, + { + "epoch": 5.9448971672487385, + "grad_norm": 1.3412781953811646, + "learning_rate": 0.0002, + "loss": 1.144, + "step": 7660 + }, + { + "epoch": 5.952658129608071, + "grad_norm": 1.123005986213684, + "learning_rate": 0.0002, + "loss": 1.1642, + "step": 7670 + }, + { + "epoch": 5.960419091967404, + "grad_norm": 1.2203444242477417, + "learning_rate": 0.0002, + "loss": 1.0732, + "step": 7680 + }, + { + "epoch": 5.968180054326736, + "grad_norm": 1.341011643409729, + "learning_rate": 0.0002, + "loss": 1.158, + "step": 7690 + }, + { + "epoch": 5.975941016686069, + "grad_norm": 1.2689454555511475, + "learning_rate": 0.0002, + "loss": 1.1144, + "step": 7700 + }, + { + "epoch": 5.983701979045401, + "grad_norm": 1.1518112421035767, + "learning_rate": 0.0002, + "loss": 1.2051, + "step": 7710 + }, + { + "epoch": 5.991462941404734, + "grad_norm": 1.3698320388793945, + "learning_rate": 0.0002, + "loss": 1.1868, + "step": 7720 + }, + { + "epoch": 5.999223903764067, + "grad_norm": 1.2812788486480713, + "learning_rate": 0.0002, + "loss": 1.0651, + "step": 7730 + }, + { + "epoch": 6.0, + "eval_loss": 2.252762794494629, + "eval_runtime": 114.8471, + "eval_samples_per_second": 4.415, + "eval_steps_per_second": 0.557, + "step": 7731 + } + ], + "logging_steps": 10, + "max_steps": 10304, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.9709311745130496e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..8ca6e2c3ac58fa2af9f99747566f932f41a5a4d5 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-7731/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f7feb06ff53d5bf79374054a25b662309e705a2ca08dfa3b0bce7b8b4632fae +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e98db163734cc03f7a8f8b3f720d3a2befdf7453 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..332e89924059813e5123e5202e68195f613696ea --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc1031f2b40f599415c82b304b1942ecafef76a9222d3be72dcb17285197c566 +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e68c07c24b855908a4caf1ec5ed18fcd10eff2e9 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0e4e2adac8c101bbb66ef23229310b1ce252a5375c7c85126c484fc3582cb29 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..fe38c419ee36480fe8ade37f82697fddcec3abda --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70c3bb1cda34731925ef8ba2681db4519973fb310959491b41b809191c7954c5 +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..903fde5a3bc95e50951322f409e90b32a9316276 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2486a6be3f3aae5c8d316423c16cad160bed976bd968bd54f8c859cc457b20c +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..a4a305d1de4d8f47c0252b4d7fe65a10dd8e2c22 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7eee611703c5ce5d1eee32d9cdcfe465647b8aff0c1dfb3bed7ad7dbb05060 +size 34362873 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..af662405c44612d20d5861526d5f22c5e1feb8d9 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/trainer_state.json @@ -0,0 +1,6396 @@ +{ + "best_metric": 1.8068748712539673, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577", + "epoch": 6.9996119518820334, + "eval_steps": 10, + "global_step": 9019, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007760962359332557, + "grad_norm": 1.0751162767410278, + "learning_rate": 0.0002, + "loss": 3.0855, + "step": 10 + }, + { + "epoch": 0.015521924718665115, + "grad_norm": 0.4697345793247223, + "learning_rate": 0.0002, + "loss": 2.4744, + "step": 20 + }, + { + "epoch": 0.023282887077997673, + "grad_norm": 0.5370839238166809, + "learning_rate": 0.0002, + "loss": 2.193, + "step": 30 + }, + { + "epoch": 0.03104384943733023, + "grad_norm": 0.46794816851615906, + "learning_rate": 0.0002, + "loss": 2.0599, + "step": 40 + }, + { + "epoch": 0.038804811796662786, + "grad_norm": 0.44624820351600647, + "learning_rate": 0.0002, + "loss": 1.9354, + "step": 50 + }, + { + "epoch": 0.046565774155995346, + "grad_norm": 0.3953201472759247, + "learning_rate": 0.0002, + "loss": 1.9319, + "step": 60 + }, + { + "epoch": 0.0543267365153279, + "grad_norm": 0.3935912549495697, + "learning_rate": 0.0002, + "loss": 1.9099, + "step": 70 + }, + { + "epoch": 0.06208769887466046, + "grad_norm": 0.4520699381828308, + "learning_rate": 0.0002, + "loss": 1.8795, + "step": 80 + }, + { + "epoch": 0.06984866123399301, + "grad_norm": 0.3801847994327545, + "learning_rate": 0.0002, + "loss": 1.8354, + "step": 90 + }, + { + "epoch": 0.07760962359332557, + "grad_norm": 0.4020165205001831, + "learning_rate": 0.0002, + "loss": 1.9053, + "step": 100 + }, + { + "epoch": 0.08537058595265813, + "grad_norm": 0.3860672116279602, + "learning_rate": 0.0002, + "loss": 1.8779, + "step": 110 + }, + { + "epoch": 0.09313154831199069, + "grad_norm": 0.3681113123893738, + "learning_rate": 0.0002, + "loss": 1.8731, + "step": 120 + }, + { + "epoch": 0.10089251067132324, + "grad_norm": 0.3594866991043091, + "learning_rate": 0.0002, + "loss": 1.8157, + "step": 130 + }, + { + "epoch": 0.1086534730306558, + "grad_norm": 0.3879193663597107, + "learning_rate": 0.0002, + "loss": 1.8266, + "step": 140 + }, + { + "epoch": 0.11641443538998836, + "grad_norm": 0.3270505666732788, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 150 + }, + { + "epoch": 0.12417539774932092, + "grad_norm": 0.36824458837509155, + "learning_rate": 0.0002, + "loss": 1.87, + "step": 160 + }, + { + "epoch": 0.13193636010865348, + "grad_norm": 0.383882075548172, + "learning_rate": 0.0002, + "loss": 1.8305, + "step": 170 + }, + { + "epoch": 0.13969732246798602, + "grad_norm": 0.3368665874004364, + "learning_rate": 0.0002, + "loss": 1.8584, + "step": 180 + }, + { + "epoch": 0.1474582848273186, + "grad_norm": 0.35961097478866577, + "learning_rate": 0.0002, + "loss": 1.7882, + "step": 190 + }, + { + "epoch": 0.15521924718665114, + "grad_norm": 0.3415963351726532, + "learning_rate": 0.0002, + "loss": 1.8467, + "step": 200 + }, + { + "epoch": 0.1629802095459837, + "grad_norm": 0.4100632071495056, + "learning_rate": 0.0002, + "loss": 1.8543, + "step": 210 + }, + { + "epoch": 0.17074117190531626, + "grad_norm": 0.3516307473182678, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 220 + }, + { + "epoch": 0.1785021342646488, + "grad_norm": 0.37919050455093384, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 230 + }, + { + "epoch": 0.18626309662398138, + "grad_norm": 0.33270683884620667, + "learning_rate": 0.0002, + "loss": 1.7937, + "step": 240 + }, + { + "epoch": 0.19402405898331393, + "grad_norm": 0.3348783254623413, + "learning_rate": 0.0002, + "loss": 1.7925, + "step": 250 + }, + { + "epoch": 0.20178502134264648, + "grad_norm": 0.3888475298881531, + "learning_rate": 0.0002, + "loss": 1.7774, + "step": 260 + }, + { + "epoch": 0.20954598370197905, + "grad_norm": 0.3554602861404419, + "learning_rate": 0.0002, + "loss": 1.8381, + "step": 270 + }, + { + "epoch": 0.2173069460613116, + "grad_norm": 0.33277708292007446, + "learning_rate": 0.0002, + "loss": 1.8359, + "step": 280 + }, + { + "epoch": 0.22506790842064417, + "grad_norm": 0.3281584680080414, + "learning_rate": 0.0002, + "loss": 1.7713, + "step": 290 + }, + { + "epoch": 0.23282887077997672, + "grad_norm": 0.3185969591140747, + "learning_rate": 0.0002, + "loss": 1.8181, + "step": 300 + }, + { + "epoch": 0.24058983313930926, + "grad_norm": 0.35335442423820496, + "learning_rate": 0.0002, + "loss": 1.8595, + "step": 310 + }, + { + "epoch": 0.24835079549864184, + "grad_norm": 0.3119595944881439, + "learning_rate": 0.0002, + "loss": 1.87, + "step": 320 + }, + { + "epoch": 0.2561117578579744, + "grad_norm": 0.36424458026885986, + "learning_rate": 0.0002, + "loss": 1.8357, + "step": 330 + }, + { + "epoch": 0.26387272021730696, + "grad_norm": 0.3618951141834259, + "learning_rate": 0.0002, + "loss": 1.8003, + "step": 340 + }, + { + "epoch": 0.2716336825766395, + "grad_norm": 0.312757670879364, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 350 + }, + { + "epoch": 0.27939464493597205, + "grad_norm": 0.326016366481781, + "learning_rate": 0.0002, + "loss": 1.9031, + "step": 360 + }, + { + "epoch": 0.2871556072953046, + "grad_norm": 0.34093883633613586, + "learning_rate": 0.0002, + "loss": 1.8214, + "step": 370 + }, + { + "epoch": 0.2949165696546372, + "grad_norm": 0.32325029373168945, + "learning_rate": 0.0002, + "loss": 1.7733, + "step": 380 + }, + { + "epoch": 0.30267753201396974, + "grad_norm": 0.34105437994003296, + "learning_rate": 0.0002, + "loss": 1.842, + "step": 390 + }, + { + "epoch": 0.3104384943733023, + "grad_norm": 0.32565295696258545, + "learning_rate": 0.0002, + "loss": 1.7926, + "step": 400 + }, + { + "epoch": 0.31819945673263483, + "grad_norm": 0.32742050290107727, + "learning_rate": 0.0002, + "loss": 1.8031, + "step": 410 + }, + { + "epoch": 0.3259604190919674, + "grad_norm": 0.30233046412467957, + "learning_rate": 0.0002, + "loss": 1.907, + "step": 420 + }, + { + "epoch": 0.3337213814513, + "grad_norm": 0.32419222593307495, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 430 + }, + { + "epoch": 0.3414823438106325, + "grad_norm": 0.3653007745742798, + "learning_rate": 0.0002, + "loss": 1.865, + "step": 440 + }, + { + "epoch": 0.3492433061699651, + "grad_norm": 0.31617099046707153, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 450 + }, + { + "epoch": 0.3570042685292976, + "grad_norm": 0.3305962085723877, + "learning_rate": 0.0002, + "loss": 1.7677, + "step": 460 + }, + { + "epoch": 0.36476523088863017, + "grad_norm": 0.3178933262825012, + "learning_rate": 0.0002, + "loss": 1.8155, + "step": 470 + }, + { + "epoch": 0.37252619324796277, + "grad_norm": 0.37163782119750977, + "learning_rate": 0.0002, + "loss": 1.7485, + "step": 480 + }, + { + "epoch": 0.3802871556072953, + "grad_norm": 0.469844788312912, + "learning_rate": 0.0002, + "loss": 1.8804, + "step": 490 + }, + { + "epoch": 0.38804811796662786, + "grad_norm": 0.3409338593482971, + "learning_rate": 0.0002, + "loss": 1.8343, + "step": 500 + }, + { + "epoch": 0.3958090803259604, + "grad_norm": 0.31943467259407043, + "learning_rate": 0.0002, + "loss": 1.8433, + "step": 510 + }, + { + "epoch": 0.40357004268529295, + "grad_norm": 0.32293614745140076, + "learning_rate": 0.0002, + "loss": 1.7873, + "step": 520 + }, + { + "epoch": 0.41133100504462555, + "grad_norm": 0.2994382977485657, + "learning_rate": 0.0002, + "loss": 1.8584, + "step": 530 + }, + { + "epoch": 0.4190919674039581, + "grad_norm": 0.3273141384124756, + "learning_rate": 0.0002, + "loss": 1.8153, + "step": 540 + }, + { + "epoch": 0.42685292976329064, + "grad_norm": 0.3020550012588501, + "learning_rate": 0.0002, + "loss": 1.8097, + "step": 550 + }, + { + "epoch": 0.4346138921226232, + "grad_norm": 0.30113112926483154, + "learning_rate": 0.0002, + "loss": 1.8551, + "step": 560 + }, + { + "epoch": 0.44237485448195574, + "grad_norm": 0.30274903774261475, + "learning_rate": 0.0002, + "loss": 1.8084, + "step": 570 + }, + { + "epoch": 0.45013581684128834, + "grad_norm": 0.3231128454208374, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 580 + }, + { + "epoch": 0.4578967792006209, + "grad_norm": 0.3255121409893036, + "learning_rate": 0.0002, + "loss": 1.7848, + "step": 590 + }, + { + "epoch": 0.46565774155995343, + "grad_norm": 0.30147507786750793, + "learning_rate": 0.0002, + "loss": 1.8227, + "step": 600 + }, + { + "epoch": 0.473418703919286, + "grad_norm": 0.29781386256217957, + "learning_rate": 0.0002, + "loss": 1.7572, + "step": 610 + }, + { + "epoch": 0.4811796662786185, + "grad_norm": 0.30914685130119324, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 620 + }, + { + "epoch": 0.4889406286379511, + "grad_norm": 0.3110593855381012, + "learning_rate": 0.0002, + "loss": 1.805, + "step": 630 + }, + { + "epoch": 0.49670159099728367, + "grad_norm": 0.3298132121562958, + "learning_rate": 0.0002, + "loss": 1.8228, + "step": 640 + }, + { + "epoch": 0.5044625533566163, + "grad_norm": 0.322122186422348, + "learning_rate": 0.0002, + "loss": 1.7816, + "step": 650 + }, + { + "epoch": 0.5122235157159488, + "grad_norm": 0.3504371643066406, + "learning_rate": 0.0002, + "loss": 1.8001, + "step": 660 + }, + { + "epoch": 0.5199844780752814, + "grad_norm": 0.3102182149887085, + "learning_rate": 0.0002, + "loss": 1.8682, + "step": 670 + }, + { + "epoch": 0.5277454404346139, + "grad_norm": 0.6113658547401428, + "learning_rate": 0.0002, + "loss": 1.7494, + "step": 680 + }, + { + "epoch": 0.5355064027939465, + "grad_norm": 0.31841862201690674, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 690 + }, + { + "epoch": 0.543267365153279, + "grad_norm": 0.2830526530742645, + "learning_rate": 0.0002, + "loss": 1.7587, + "step": 700 + }, + { + "epoch": 0.5510283275126115, + "grad_norm": 0.3048769533634186, + "learning_rate": 0.0002, + "loss": 1.7887, + "step": 710 + }, + { + "epoch": 0.5587892898719441, + "grad_norm": 0.2719033658504486, + "learning_rate": 0.0002, + "loss": 1.8416, + "step": 720 + }, + { + "epoch": 0.5665502522312766, + "grad_norm": 0.3176722526550293, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 730 + }, + { + "epoch": 0.5743112145906092, + "grad_norm": 0.32491734623908997, + "learning_rate": 0.0002, + "loss": 1.7127, + "step": 740 + }, + { + "epoch": 0.5820721769499418, + "grad_norm": 0.32746851444244385, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 750 + }, + { + "epoch": 0.5898331393092744, + "grad_norm": 0.3055773973464966, + "learning_rate": 0.0002, + "loss": 1.7811, + "step": 760 + }, + { + "epoch": 0.5975941016686069, + "grad_norm": 0.30671584606170654, + "learning_rate": 0.0002, + "loss": 1.8597, + "step": 770 + }, + { + "epoch": 0.6053550640279395, + "grad_norm": 0.28770264983177185, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 780 + }, + { + "epoch": 0.613116026387272, + "grad_norm": 0.2814285457134247, + "learning_rate": 0.0002, + "loss": 1.7025, + "step": 790 + }, + { + "epoch": 0.6208769887466046, + "grad_norm": 0.31554412841796875, + "learning_rate": 0.0002, + "loss": 1.819, + "step": 800 + }, + { + "epoch": 0.6286379511059371, + "grad_norm": 0.2984226942062378, + "learning_rate": 0.0002, + "loss": 1.8335, + "step": 810 + }, + { + "epoch": 0.6363989134652697, + "grad_norm": 0.2859906554222107, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 820 + }, + { + "epoch": 0.6441598758246022, + "grad_norm": 0.2887928783893585, + "learning_rate": 0.0002, + "loss": 1.7408, + "step": 830 + }, + { + "epoch": 0.6519208381839348, + "grad_norm": 0.31287339329719543, + "learning_rate": 0.0002, + "loss": 1.8071, + "step": 840 + }, + { + "epoch": 0.6596818005432674, + "grad_norm": 0.32064181566238403, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 850 + }, + { + "epoch": 0.6674427629026, + "grad_norm": 0.290981650352478, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 860 + }, + { + "epoch": 0.6752037252619325, + "grad_norm": 0.33060121536254883, + "learning_rate": 0.0002, + "loss": 1.8089, + "step": 870 + }, + { + "epoch": 0.682964687621265, + "grad_norm": 0.27032899856567383, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 880 + }, + { + "epoch": 0.6907256499805976, + "grad_norm": 0.29031234979629517, + "learning_rate": 0.0002, + "loss": 1.8423, + "step": 890 + }, + { + "epoch": 0.6984866123399301, + "grad_norm": 0.2845142185688019, + "learning_rate": 0.0002, + "loss": 1.835, + "step": 900 + }, + { + "epoch": 0.7062475746992627, + "grad_norm": 0.8638312816619873, + "learning_rate": 0.0002, + "loss": 1.77, + "step": 910 + }, + { + "epoch": 0.7140085370585952, + "grad_norm": 0.3086668848991394, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 920 + }, + { + "epoch": 0.7217694994179278, + "grad_norm": 0.2724177837371826, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 930 + }, + { + "epoch": 0.7295304617772603, + "grad_norm": 0.289559006690979, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 940 + }, + { + "epoch": 0.737291424136593, + "grad_norm": 0.3000658452510834, + "learning_rate": 0.0002, + "loss": 1.7654, + "step": 950 + }, + { + "epoch": 0.7450523864959255, + "grad_norm": 0.33544042706489563, + "learning_rate": 0.0002, + "loss": 1.7736, + "step": 960 + }, + { + "epoch": 0.7528133488552581, + "grad_norm": 0.28593236207962036, + "learning_rate": 0.0002, + "loss": 1.6979, + "step": 970 + }, + { + "epoch": 0.7605743112145906, + "grad_norm": 0.313634991645813, + "learning_rate": 0.0002, + "loss": 1.8583, + "step": 980 + }, + { + "epoch": 0.7683352735739232, + "grad_norm": 0.2949385941028595, + "learning_rate": 0.0002, + "loss": 1.7473, + "step": 990 + }, + { + "epoch": 0.7760962359332557, + "grad_norm": 0.2920108437538147, + "learning_rate": 0.0002, + "loss": 1.8689, + "step": 1000 + }, + { + "epoch": 0.7838571982925883, + "grad_norm": 0.3245100677013397, + "learning_rate": 0.0002, + "loss": 1.8401, + "step": 1010 + }, + { + "epoch": 0.7916181606519208, + "grad_norm": 0.3007619380950928, + "learning_rate": 0.0002, + "loss": 1.7109, + "step": 1020 + }, + { + "epoch": 0.7993791230112534, + "grad_norm": 0.3630852997303009, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 1030 + }, + { + "epoch": 0.8071400853705859, + "grad_norm": 0.2856379747390747, + "learning_rate": 0.0002, + "loss": 1.7655, + "step": 1040 + }, + { + "epoch": 0.8149010477299186, + "grad_norm": 0.32476478815078735, + "learning_rate": 0.0002, + "loss": 1.8371, + "step": 1050 + }, + { + "epoch": 0.8226620100892511, + "grad_norm": 0.5162565112113953, + "learning_rate": 0.0002, + "loss": 1.8039, + "step": 1060 + }, + { + "epoch": 0.8304229724485837, + "grad_norm": 0.316496342420578, + "learning_rate": 0.0002, + "loss": 1.8862, + "step": 1070 + }, + { + "epoch": 0.8381839348079162, + "grad_norm": 0.31977516412734985, + "learning_rate": 0.0002, + "loss": 1.8023, + "step": 1080 + }, + { + "epoch": 0.8459448971672487, + "grad_norm": 0.269509494304657, + "learning_rate": 0.0002, + "loss": 1.8547, + "step": 1090 + }, + { + "epoch": 0.8537058595265813, + "grad_norm": 0.31621453166007996, + "learning_rate": 0.0002, + "loss": 1.7811, + "step": 1100 + }, + { + "epoch": 0.8614668218859138, + "grad_norm": 0.2946535050868988, + "learning_rate": 0.0002, + "loss": 1.739, + "step": 1110 + }, + { + "epoch": 0.8692277842452464, + "grad_norm": 0.3088909983634949, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 1120 + }, + { + "epoch": 0.8769887466045789, + "grad_norm": 0.33033716678619385, + "learning_rate": 0.0002, + "loss": 1.8228, + "step": 1130 + }, + { + "epoch": 0.8847497089639115, + "grad_norm": 0.2954833507537842, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 1140 + }, + { + "epoch": 0.8925106713232441, + "grad_norm": 0.2950248122215271, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 1150 + }, + { + "epoch": 0.9002716336825767, + "grad_norm": 0.296661913394928, + "learning_rate": 0.0002, + "loss": 1.7068, + "step": 1160 + }, + { + "epoch": 0.9080325960419092, + "grad_norm": 0.35451310873031616, + "learning_rate": 0.0002, + "loss": 1.7967, + "step": 1170 + }, + { + "epoch": 0.9157935584012418, + "grad_norm": 0.32705947756767273, + "learning_rate": 0.0002, + "loss": 1.8202, + "step": 1180 + }, + { + "epoch": 0.9235545207605743, + "grad_norm": 0.3333960771560669, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 1190 + }, + { + "epoch": 0.9313154831199069, + "grad_norm": 0.3042232096195221, + "learning_rate": 0.0002, + "loss": 1.7801, + "step": 1200 + }, + { + "epoch": 0.9390764454792394, + "grad_norm": 0.281553715467453, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 1210 + }, + { + "epoch": 0.946837407838572, + "grad_norm": 0.3096391558647156, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 1220 + }, + { + "epoch": 0.9545983701979045, + "grad_norm": 0.2866271734237671, + "learning_rate": 0.0002, + "loss": 1.7401, + "step": 1230 + }, + { + "epoch": 0.962359332557237, + "grad_norm": 0.28394097089767456, + "learning_rate": 0.0002, + "loss": 1.7211, + "step": 1240 + }, + { + "epoch": 0.9701202949165697, + "grad_norm": 0.3249266743659973, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 1250 + }, + { + "epoch": 0.9778812572759022, + "grad_norm": 0.2896869480609894, + "learning_rate": 0.0002, + "loss": 1.7563, + "step": 1260 + }, + { + "epoch": 0.9856422196352348, + "grad_norm": 0.29224586486816406, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 1270 + }, + { + "epoch": 0.9934031819945673, + "grad_norm": 0.2820223569869995, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 1280 + }, + { + "epoch": 0.9996119518820333, + "eval_loss": 1.8081045150756836, + "eval_runtime": 102.3056, + "eval_samples_per_second": 4.956, + "eval_steps_per_second": 0.626, + "step": 1288 + }, + { + "epoch": 1.0011641443538999, + "grad_norm": 0.3282551169395447, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 1290 + }, + { + "epoch": 1.0089251067132325, + "grad_norm": 0.30217495560646057, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 1300 + }, + { + "epoch": 1.016686069072565, + "grad_norm": 0.30801767110824585, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 1310 + }, + { + "epoch": 1.0244470314318976, + "grad_norm": 0.31816792488098145, + "learning_rate": 0.0002, + "loss": 1.7756, + "step": 1320 + }, + { + "epoch": 1.03220799379123, + "grad_norm": 0.27794334292411804, + "learning_rate": 0.0002, + "loss": 1.6986, + "step": 1330 + }, + { + "epoch": 1.0399689561505627, + "grad_norm": 0.3018926680088043, + "learning_rate": 0.0002, + "loss": 1.6931, + "step": 1340 + }, + { + "epoch": 1.0477299185098952, + "grad_norm": 0.3552975356578827, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1350 + }, + { + "epoch": 1.0554908808692278, + "grad_norm": 0.32590144872665405, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 1360 + }, + { + "epoch": 1.0632518432285603, + "grad_norm": 0.3435460925102234, + "learning_rate": 0.0002, + "loss": 1.6479, + "step": 1370 + }, + { + "epoch": 1.071012805587893, + "grad_norm": 0.35037797689437866, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 1380 + }, + { + "epoch": 1.0787737679472253, + "grad_norm": 0.31398263573646545, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 1390 + }, + { + "epoch": 1.086534730306558, + "grad_norm": 0.3134010434150696, + "learning_rate": 0.0002, + "loss": 1.6729, + "step": 1400 + }, + { + "epoch": 1.0942956926658907, + "grad_norm": 0.4599704444408417, + "learning_rate": 0.0002, + "loss": 1.751, + "step": 1410 + }, + { + "epoch": 1.102056655025223, + "grad_norm": 0.35852891206741333, + "learning_rate": 0.0002, + "loss": 1.6871, + "step": 1420 + }, + { + "epoch": 1.1098176173845558, + "grad_norm": 0.35628634691238403, + "learning_rate": 0.0002, + "loss": 1.7083, + "step": 1430 + }, + { + "epoch": 1.1175785797438882, + "grad_norm": 0.3769161105155945, + "learning_rate": 0.0002, + "loss": 1.6166, + "step": 1440 + }, + { + "epoch": 1.1253395421032208, + "grad_norm": 1.3712416887283325, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1450 + }, + { + "epoch": 1.1331005044625533, + "grad_norm": 0.38406670093536377, + "learning_rate": 0.0002, + "loss": 1.6542, + "step": 1460 + }, + { + "epoch": 1.140861466821886, + "grad_norm": 0.3402116000652313, + "learning_rate": 0.0002, + "loss": 1.7104, + "step": 1470 + }, + { + "epoch": 1.1486224291812184, + "grad_norm": 0.341189444065094, + "learning_rate": 0.0002, + "loss": 1.7074, + "step": 1480 + }, + { + "epoch": 1.156383391540551, + "grad_norm": 0.36629995703697205, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 1490 + }, + { + "epoch": 1.1641443538998835, + "grad_norm": 0.3499569296836853, + "learning_rate": 0.0002, + "loss": 1.6952, + "step": 1500 + }, + { + "epoch": 1.1719053162592161, + "grad_norm": 0.3663063943386078, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 1510 + }, + { + "epoch": 1.1796662786185488, + "grad_norm": 0.34851500391960144, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 1520 + }, + { + "epoch": 1.1874272409778812, + "grad_norm": 0.35071656107902527, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 1530 + }, + { + "epoch": 1.1951882033372139, + "grad_norm": 0.42783796787261963, + "learning_rate": 0.0002, + "loss": 1.7206, + "step": 1540 + }, + { + "epoch": 1.2029491656965463, + "grad_norm": 0.31830692291259766, + "learning_rate": 0.0002, + "loss": 1.7499, + "step": 1550 + }, + { + "epoch": 1.210710128055879, + "grad_norm": 0.3597424626350403, + "learning_rate": 0.0002, + "loss": 1.7372, + "step": 1560 + }, + { + "epoch": 1.2184710904152114, + "grad_norm": 0.35233765840530396, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 1570 + }, + { + "epoch": 1.226232052774544, + "grad_norm": 0.35942912101745605, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 1580 + }, + { + "epoch": 1.2339930151338767, + "grad_norm": 0.36159393191337585, + "learning_rate": 0.0002, + "loss": 1.6598, + "step": 1590 + }, + { + "epoch": 1.2417539774932091, + "grad_norm": 0.3328469693660736, + "learning_rate": 0.0002, + "loss": 1.6697, + "step": 1600 + }, + { + "epoch": 1.2495149398525418, + "grad_norm": 0.3089476525783539, + "learning_rate": 0.0002, + "loss": 1.7594, + "step": 1610 + }, + { + "epoch": 1.2572759022118742, + "grad_norm": 0.30947765707969666, + "learning_rate": 0.0002, + "loss": 1.6805, + "step": 1620 + }, + { + "epoch": 1.265036864571207, + "grad_norm": 0.32154011726379395, + "learning_rate": 0.0002, + "loss": 1.6899, + "step": 1630 + }, + { + "epoch": 1.2727978269305393, + "grad_norm": 0.3480297923088074, + "learning_rate": 0.0002, + "loss": 1.6621, + "step": 1640 + }, + { + "epoch": 1.280558789289872, + "grad_norm": 0.39471694827079773, + "learning_rate": 0.0002, + "loss": 1.7087, + "step": 1650 + }, + { + "epoch": 1.2883197516492044, + "grad_norm": 0.35728853940963745, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 1660 + }, + { + "epoch": 1.296080714008537, + "grad_norm": 0.35223081707954407, + "learning_rate": 0.0002, + "loss": 1.7008, + "step": 1670 + }, + { + "epoch": 1.3038416763678695, + "grad_norm": 0.3588867485523224, + "learning_rate": 0.0002, + "loss": 1.7253, + "step": 1680 + }, + { + "epoch": 1.3116026387272022, + "grad_norm": 0.3528042733669281, + "learning_rate": 0.0002, + "loss": 1.6505, + "step": 1690 + }, + { + "epoch": 1.3193636010865348, + "grad_norm": 0.35975801944732666, + "learning_rate": 0.0002, + "loss": 1.6945, + "step": 1700 + }, + { + "epoch": 1.3271245634458673, + "grad_norm": 0.36691880226135254, + "learning_rate": 0.0002, + "loss": 1.6631, + "step": 1710 + }, + { + "epoch": 1.3348855258052, + "grad_norm": 0.3787977695465088, + "learning_rate": 0.0002, + "loss": 1.7593, + "step": 1720 + }, + { + "epoch": 1.3426464881645324, + "grad_norm": 0.36614933609962463, + "learning_rate": 0.0002, + "loss": 1.7697, + "step": 1730 + }, + { + "epoch": 1.350407450523865, + "grad_norm": 0.3484745919704437, + "learning_rate": 0.0002, + "loss": 1.6487, + "step": 1740 + }, + { + "epoch": 1.3581684128831975, + "grad_norm": 0.36905673146247864, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 1750 + }, + { + "epoch": 1.36592937524253, + "grad_norm": 0.41564738750457764, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 1760 + }, + { + "epoch": 1.3736903376018628, + "grad_norm": 0.3345205783843994, + "learning_rate": 0.0002, + "loss": 1.6634, + "step": 1770 + }, + { + "epoch": 1.3814512999611952, + "grad_norm": 0.34926071763038635, + "learning_rate": 0.0002, + "loss": 1.7275, + "step": 1780 + }, + { + "epoch": 1.3892122623205276, + "grad_norm": 0.42004233598709106, + "learning_rate": 0.0002, + "loss": 1.685, + "step": 1790 + }, + { + "epoch": 1.3969732246798603, + "grad_norm": 0.3576236963272095, + "learning_rate": 0.0002, + "loss": 1.666, + "step": 1800 + }, + { + "epoch": 1.404734187039193, + "grad_norm": 0.3586704432964325, + "learning_rate": 0.0002, + "loss": 1.8516, + "step": 1810 + }, + { + "epoch": 1.4124951493985254, + "grad_norm": 0.3943439722061157, + "learning_rate": 0.0002, + "loss": 1.6171, + "step": 1820 + }, + { + "epoch": 1.420256111757858, + "grad_norm": 0.3484877049922943, + "learning_rate": 0.0002, + "loss": 1.6865, + "step": 1830 + }, + { + "epoch": 1.4280170741171905, + "grad_norm": 0.3344518840312958, + "learning_rate": 0.0002, + "loss": 1.7205, + "step": 1840 + }, + { + "epoch": 1.4357780364765231, + "grad_norm": 0.4345698356628418, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1850 + }, + { + "epoch": 1.4435389988358556, + "grad_norm": 0.5525162220001221, + "learning_rate": 0.0002, + "loss": 1.6855, + "step": 1860 + }, + { + "epoch": 1.4512999611951882, + "grad_norm": 0.37194496393203735, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1870 + }, + { + "epoch": 1.4590609235545209, + "grad_norm": 0.34570157527923584, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 1880 + }, + { + "epoch": 1.4668218859138533, + "grad_norm": 0.3512282073497772, + "learning_rate": 0.0002, + "loss": 1.7, + "step": 1890 + }, + { + "epoch": 1.4745828482731858, + "grad_norm": 0.3443922996520996, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1900 + }, + { + "epoch": 1.4823438106325184, + "grad_norm": 0.3812018036842346, + "learning_rate": 0.0002, + "loss": 1.7393, + "step": 1910 + }, + { + "epoch": 1.490104772991851, + "grad_norm": 0.39263492822647095, + "learning_rate": 0.0002, + "loss": 1.7277, + "step": 1920 + }, + { + "epoch": 1.4978657353511835, + "grad_norm": 0.3146156072616577, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 1930 + }, + { + "epoch": 1.505626697710516, + "grad_norm": 0.3653988540172577, + "learning_rate": 0.0002, + "loss": 1.6881, + "step": 1940 + }, + { + "epoch": 1.5133876600698488, + "grad_norm": 0.3966596722602844, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 1950 + }, + { + "epoch": 1.5211486224291813, + "grad_norm": 0.3441697359085083, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 1960 + }, + { + "epoch": 1.5289095847885137, + "grad_norm": 0.3328564465045929, + "learning_rate": 0.0002, + "loss": 1.7175, + "step": 1970 + }, + { + "epoch": 1.5366705471478463, + "grad_norm": 0.34068772196769714, + "learning_rate": 0.0002, + "loss": 1.7394, + "step": 1980 + }, + { + "epoch": 1.544431509507179, + "grad_norm": 0.3559795916080475, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 1990 + }, + { + "epoch": 1.5521924718665114, + "grad_norm": 0.37888768315315247, + "learning_rate": 0.0002, + "loss": 1.7102, + "step": 2000 + }, + { + "epoch": 1.5599534342258439, + "grad_norm": 0.36128363013267517, + "learning_rate": 0.0002, + "loss": 1.7094, + "step": 2010 + }, + { + "epoch": 1.5677143965851765, + "grad_norm": 0.3643714487552643, + "learning_rate": 0.0002, + "loss": 1.6407, + "step": 2020 + }, + { + "epoch": 1.5754753589445092, + "grad_norm": 0.3863612115383148, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 2030 + }, + { + "epoch": 1.5832363213038416, + "grad_norm": 0.32831457257270813, + "learning_rate": 0.0002, + "loss": 1.6575, + "step": 2040 + }, + { + "epoch": 1.5909972836631743, + "grad_norm": 0.36098113656044006, + "learning_rate": 0.0002, + "loss": 1.7404, + "step": 2050 + }, + { + "epoch": 1.598758246022507, + "grad_norm": 1.1079334020614624, + "learning_rate": 0.0002, + "loss": 1.7065, + "step": 2060 + }, + { + "epoch": 1.6065192083818394, + "grad_norm": 0.35615381598472595, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 2070 + }, + { + "epoch": 1.6142801707411718, + "grad_norm": 0.369711309671402, + "learning_rate": 0.0002, + "loss": 1.7262, + "step": 2080 + }, + { + "epoch": 1.6220411331005045, + "grad_norm": 0.390658438205719, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 2090 + }, + { + "epoch": 1.6298020954598371, + "grad_norm": 0.3422999382019043, + "learning_rate": 0.0002, + "loss": 1.6996, + "step": 2100 + }, + { + "epoch": 1.6375630578191696, + "grad_norm": 0.372475266456604, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 2110 + }, + { + "epoch": 1.645324020178502, + "grad_norm": 0.35660576820373535, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 2120 + }, + { + "epoch": 1.6530849825378346, + "grad_norm": 0.35754942893981934, + "learning_rate": 0.0002, + "loss": 1.6991, + "step": 2130 + }, + { + "epoch": 1.6608459448971673, + "grad_norm": 0.34572410583496094, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 2140 + }, + { + "epoch": 1.6686069072564997, + "grad_norm": 0.42059701681137085, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 2150 + }, + { + "epoch": 1.6763678696158324, + "grad_norm": 0.35200759768486023, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 2160 + }, + { + "epoch": 1.684128831975165, + "grad_norm": 0.3704029321670532, + "learning_rate": 0.0002, + "loss": 1.6869, + "step": 2170 + }, + { + "epoch": 1.6918897943344975, + "grad_norm": 0.40450501441955566, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 2180 + }, + { + "epoch": 1.69965075669383, + "grad_norm": 0.362966924905777, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 2190 + }, + { + "epoch": 1.7074117190531626, + "grad_norm": 0.36586204171180725, + "learning_rate": 0.0002, + "loss": 1.6935, + "step": 2200 + }, + { + "epoch": 1.7151726814124952, + "grad_norm": 0.3295372426509857, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 2210 + }, + { + "epoch": 1.7229336437718277, + "grad_norm": 0.3892575800418854, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 2220 + }, + { + "epoch": 1.73069460613116, + "grad_norm": 0.34712135791778564, + "learning_rate": 0.0002, + "loss": 1.7805, + "step": 2230 + }, + { + "epoch": 1.738455568490493, + "grad_norm": 0.34801796078681946, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 2240 + }, + { + "epoch": 1.7462165308498254, + "grad_norm": 0.3822397291660309, + "learning_rate": 0.0002, + "loss": 1.7009, + "step": 2250 + }, + { + "epoch": 1.7539774932091579, + "grad_norm": 0.38933250308036804, + "learning_rate": 0.0002, + "loss": 1.6546, + "step": 2260 + }, + { + "epoch": 1.7617384555684905, + "grad_norm": 0.3798373341560364, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 2270 + }, + { + "epoch": 1.7694994179278232, + "grad_norm": 0.35151317715644836, + "learning_rate": 0.0002, + "loss": 1.6508, + "step": 2280 + }, + { + "epoch": 1.7772603802871556, + "grad_norm": 0.44981494545936584, + "learning_rate": 0.0002, + "loss": 1.6894, + "step": 2290 + }, + { + "epoch": 1.785021342646488, + "grad_norm": 0.3992624580860138, + "learning_rate": 0.0002, + "loss": 1.7271, + "step": 2300 + }, + { + "epoch": 1.7927823050058207, + "grad_norm": 0.3772512376308441, + "learning_rate": 0.0002, + "loss": 1.7252, + "step": 2310 + }, + { + "epoch": 1.8005432673651534, + "grad_norm": 0.3511589467525482, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 2320 + }, + { + "epoch": 1.8083042297244858, + "grad_norm": 0.3805285394191742, + "learning_rate": 0.0002, + "loss": 1.764, + "step": 2330 + }, + { + "epoch": 1.8160651920838184, + "grad_norm": 0.3792071044445038, + "learning_rate": 0.0002, + "loss": 1.6986, + "step": 2340 + }, + { + "epoch": 1.823826154443151, + "grad_norm": 0.36430829763412476, + "learning_rate": 0.0002, + "loss": 1.7759, + "step": 2350 + }, + { + "epoch": 1.8315871168024835, + "grad_norm": 0.36502477526664734, + "learning_rate": 0.0002, + "loss": 1.6773, + "step": 2360 + }, + { + "epoch": 1.839348079161816, + "grad_norm": 0.35015153884887695, + "learning_rate": 0.0002, + "loss": 1.8072, + "step": 2370 + }, + { + "epoch": 1.8471090415211486, + "grad_norm": 0.3710903823375702, + "learning_rate": 0.0002, + "loss": 1.7734, + "step": 2380 + }, + { + "epoch": 1.8548700038804813, + "grad_norm": 0.3542828857898712, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 2390 + }, + { + "epoch": 1.8626309662398137, + "grad_norm": 0.35467568039894104, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 2400 + }, + { + "epoch": 1.8703919285991462, + "grad_norm": 0.3638560473918915, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 2410 + }, + { + "epoch": 1.8781528909584788, + "grad_norm": 0.3823298215866089, + "learning_rate": 0.0002, + "loss": 1.7019, + "step": 2420 + }, + { + "epoch": 1.8859138533178115, + "grad_norm": 0.3926416337490082, + "learning_rate": 0.0002, + "loss": 1.6935, + "step": 2430 + }, + { + "epoch": 1.893674815677144, + "grad_norm": 0.3608079254627228, + "learning_rate": 0.0002, + "loss": 1.71, + "step": 2440 + }, + { + "epoch": 1.9014357780364766, + "grad_norm": 0.3426613509654999, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 2450 + }, + { + "epoch": 1.9091967403958092, + "grad_norm": 0.3522338569164276, + "learning_rate": 0.0002, + "loss": 1.6892, + "step": 2460 + }, + { + "epoch": 1.9169577027551417, + "grad_norm": 0.3608049154281616, + "learning_rate": 0.0002, + "loss": 1.7307, + "step": 2470 + }, + { + "epoch": 1.924718665114474, + "grad_norm": 0.3849755525588989, + "learning_rate": 0.0002, + "loss": 1.6823, + "step": 2480 + }, + { + "epoch": 1.9324796274738067, + "grad_norm": 0.4154011011123657, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 2490 + }, + { + "epoch": 1.9402405898331394, + "grad_norm": 0.3602796792984009, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 2500 + }, + { + "epoch": 1.9480015521924718, + "grad_norm": 0.3702992796897888, + "learning_rate": 0.0002, + "loss": 1.7843, + "step": 2510 + }, + { + "epoch": 1.9557625145518043, + "grad_norm": 0.3657735288143158, + "learning_rate": 0.0002, + "loss": 1.6669, + "step": 2520 + }, + { + "epoch": 1.963523476911137, + "grad_norm": 0.41031739115715027, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 2530 + }, + { + "epoch": 1.9712844392704696, + "grad_norm": 0.34578680992126465, + "learning_rate": 0.0002, + "loss": 1.6745, + "step": 2540 + }, + { + "epoch": 1.979045401629802, + "grad_norm": 0.3361521065235138, + "learning_rate": 0.0002, + "loss": 1.723, + "step": 2550 + }, + { + "epoch": 1.9868063639891347, + "grad_norm": 0.34342363476753235, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 2560 + }, + { + "epoch": 1.9945673263484673, + "grad_norm": 0.32954007387161255, + "learning_rate": 0.0002, + "loss": 1.6577, + "step": 2570 + }, + { + "epoch": 2.0, + "eval_loss": 1.8068748712539673, + "eval_runtime": 105.5885, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 0.606, + "step": 2577 + }, + { + "epoch": 2.0023282887077998, + "grad_norm": 0.336302250623703, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 2580 + }, + { + "epoch": 2.010089251067132, + "grad_norm": 0.3627048432826996, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 2590 + }, + { + "epoch": 2.017850213426465, + "grad_norm": 0.38406702876091003, + "learning_rate": 0.0002, + "loss": 1.4908, + "step": 2600 + }, + { + "epoch": 2.0256111757857975, + "grad_norm": 0.5326781272888184, + "learning_rate": 0.0002, + "loss": 1.5368, + "step": 2610 + }, + { + "epoch": 2.03337213814513, + "grad_norm": 0.4774554967880249, + "learning_rate": 0.0002, + "loss": 1.5727, + "step": 2620 + }, + { + "epoch": 2.0411331005044624, + "grad_norm": 0.4251810312271118, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 2630 + }, + { + "epoch": 2.0488940628637953, + "grad_norm": 0.4693007171154022, + "learning_rate": 0.0002, + "loss": 1.5152, + "step": 2640 + }, + { + "epoch": 2.0566550252231277, + "grad_norm": 0.46371519565582275, + "learning_rate": 0.0002, + "loss": 1.6137, + "step": 2650 + }, + { + "epoch": 2.06441598758246, + "grad_norm": 0.46652570366859436, + "learning_rate": 0.0002, + "loss": 1.6304, + "step": 2660 + }, + { + "epoch": 2.0721769499417926, + "grad_norm": 0.45200315117836, + "learning_rate": 0.0002, + "loss": 1.6022, + "step": 2670 + }, + { + "epoch": 2.0799379123011255, + "grad_norm": 0.42905205488204956, + "learning_rate": 0.0002, + "loss": 1.5358, + "step": 2680 + }, + { + "epoch": 2.087698874660458, + "grad_norm": 0.44509148597717285, + "learning_rate": 0.0002, + "loss": 1.5401, + "step": 2690 + }, + { + "epoch": 2.0954598370197903, + "grad_norm": 0.4445319175720215, + "learning_rate": 0.0002, + "loss": 1.5303, + "step": 2700 + }, + { + "epoch": 2.103220799379123, + "grad_norm": 0.46825504302978516, + "learning_rate": 0.0002, + "loss": 1.5701, + "step": 2710 + }, + { + "epoch": 2.1109817617384556, + "grad_norm": 0.4623856842517853, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 2720 + }, + { + "epoch": 2.118742724097788, + "grad_norm": 0.4833452105522156, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 2730 + }, + { + "epoch": 2.1265036864571205, + "grad_norm": 0.4582686722278595, + "learning_rate": 0.0002, + "loss": 1.5997, + "step": 2740 + }, + { + "epoch": 2.1342646488164534, + "grad_norm": 0.47587934136390686, + "learning_rate": 0.0002, + "loss": 1.5801, + "step": 2750 + }, + { + "epoch": 2.142025611175786, + "grad_norm": 0.4602217972278595, + "learning_rate": 0.0002, + "loss": 1.594, + "step": 2760 + }, + { + "epoch": 2.1497865735351183, + "grad_norm": 0.47501352429389954, + "learning_rate": 0.0002, + "loss": 1.5271, + "step": 2770 + }, + { + "epoch": 2.1575475358944507, + "grad_norm": 0.5078499913215637, + "learning_rate": 0.0002, + "loss": 1.4862, + "step": 2780 + }, + { + "epoch": 2.1653084982537836, + "grad_norm": 0.497704416513443, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 2790 + }, + { + "epoch": 2.173069460613116, + "grad_norm": 0.5435971617698669, + "learning_rate": 0.0002, + "loss": 1.5597, + "step": 2800 + }, + { + "epoch": 2.1808304229724484, + "grad_norm": 0.5172356367111206, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 2810 + }, + { + "epoch": 2.1885913853317813, + "grad_norm": 0.44063422083854675, + "learning_rate": 0.0002, + "loss": 1.5202, + "step": 2820 + }, + { + "epoch": 2.1963523476911138, + "grad_norm": 0.5079569220542908, + "learning_rate": 0.0002, + "loss": 1.6041, + "step": 2830 + }, + { + "epoch": 2.204113310050446, + "grad_norm": 0.45658132433891296, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 2840 + }, + { + "epoch": 2.2118742724097786, + "grad_norm": 0.5103023648262024, + "learning_rate": 0.0002, + "loss": 1.5546, + "step": 2850 + }, + { + "epoch": 2.2196352347691115, + "grad_norm": 0.4882226288318634, + "learning_rate": 0.0002, + "loss": 1.6197, + "step": 2860 + }, + { + "epoch": 2.227396197128444, + "grad_norm": 0.5087296962738037, + "learning_rate": 0.0002, + "loss": 1.5996, + "step": 2870 + }, + { + "epoch": 2.2351571594877764, + "grad_norm": 0.45293712615966797, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 2880 + }, + { + "epoch": 2.242918121847109, + "grad_norm": 0.5120379328727722, + "learning_rate": 0.0002, + "loss": 1.6214, + "step": 2890 + }, + { + "epoch": 2.2506790842064417, + "grad_norm": 0.47126415371894836, + "learning_rate": 0.0002, + "loss": 1.5273, + "step": 2900 + }, + { + "epoch": 2.258440046565774, + "grad_norm": 0.44005846977233887, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 2910 + }, + { + "epoch": 2.2662010089251066, + "grad_norm": 0.46476176381111145, + "learning_rate": 0.0002, + "loss": 1.6023, + "step": 2920 + }, + { + "epoch": 2.2739619712844394, + "grad_norm": 0.48051515221595764, + "learning_rate": 0.0002, + "loss": 1.6417, + "step": 2930 + }, + { + "epoch": 2.281722933643772, + "grad_norm": 0.480069637298584, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 2940 + }, + { + "epoch": 2.2894838960031043, + "grad_norm": 0.5122102499008179, + "learning_rate": 0.0002, + "loss": 1.5747, + "step": 2950 + }, + { + "epoch": 2.2972448583624367, + "grad_norm": 0.48879891633987427, + "learning_rate": 0.0002, + "loss": 1.5183, + "step": 2960 + }, + { + "epoch": 2.3050058207217696, + "grad_norm": 0.4973136782646179, + "learning_rate": 0.0002, + "loss": 1.5483, + "step": 2970 + }, + { + "epoch": 2.312766783081102, + "grad_norm": 0.5522695183753967, + "learning_rate": 0.0002, + "loss": 1.677, + "step": 2980 + }, + { + "epoch": 2.3205277454404345, + "grad_norm": 0.5220217704772949, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 2990 + }, + { + "epoch": 2.328288707799767, + "grad_norm": 0.4978662431240082, + "learning_rate": 0.0002, + "loss": 1.6299, + "step": 3000 + }, + { + "epoch": 2.3360496701591, + "grad_norm": 0.554053544998169, + "learning_rate": 0.0002, + "loss": 1.5498, + "step": 3010 + }, + { + "epoch": 2.3438106325184322, + "grad_norm": 0.4703886806964874, + "learning_rate": 0.0002, + "loss": 1.5356, + "step": 3020 + }, + { + "epoch": 2.3515715948777647, + "grad_norm": 0.5074123740196228, + "learning_rate": 0.0002, + "loss": 1.5418, + "step": 3030 + }, + { + "epoch": 2.3593325572370976, + "grad_norm": 0.5088278651237488, + "learning_rate": 0.0002, + "loss": 1.6873, + "step": 3040 + }, + { + "epoch": 2.36709351959643, + "grad_norm": 0.4752114415168762, + "learning_rate": 0.0002, + "loss": 1.5249, + "step": 3050 + }, + { + "epoch": 2.3748544819557624, + "grad_norm": 0.5121659636497498, + "learning_rate": 0.0002, + "loss": 1.5353, + "step": 3060 + }, + { + "epoch": 2.3826154443150953, + "grad_norm": 0.48649218678474426, + "learning_rate": 0.0002, + "loss": 1.6426, + "step": 3070 + }, + { + "epoch": 2.3903764066744277, + "grad_norm": 0.5209488868713379, + "learning_rate": 0.0002, + "loss": 1.6136, + "step": 3080 + }, + { + "epoch": 2.39813736903376, + "grad_norm": 0.5110517740249634, + "learning_rate": 0.0002, + "loss": 1.597, + "step": 3090 + }, + { + "epoch": 2.4058983313930926, + "grad_norm": 0.5609337091445923, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 3100 + }, + { + "epoch": 2.4136592937524255, + "grad_norm": 0.5191826224327087, + "learning_rate": 0.0002, + "loss": 1.5438, + "step": 3110 + }, + { + "epoch": 2.421420256111758, + "grad_norm": 0.4876069724559784, + "learning_rate": 0.0002, + "loss": 1.6347, + "step": 3120 + }, + { + "epoch": 2.4291812184710904, + "grad_norm": 0.4713933765888214, + "learning_rate": 0.0002, + "loss": 1.5565, + "step": 3130 + }, + { + "epoch": 2.436942180830423, + "grad_norm": 0.5102227330207825, + "learning_rate": 0.0002, + "loss": 1.6388, + "step": 3140 + }, + { + "epoch": 2.4447031431897557, + "grad_norm": 0.44546666741371155, + "learning_rate": 0.0002, + "loss": 1.5667, + "step": 3150 + }, + { + "epoch": 2.452464105549088, + "grad_norm": 0.5167558193206787, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 3160 + }, + { + "epoch": 2.4602250679084205, + "grad_norm": 0.5226958990097046, + "learning_rate": 0.0002, + "loss": 1.5673, + "step": 3170 + }, + { + "epoch": 2.4679860302677534, + "grad_norm": 0.4751799702644348, + "learning_rate": 0.0002, + "loss": 1.5758, + "step": 3180 + }, + { + "epoch": 2.475746992627086, + "grad_norm": 0.4744729697704315, + "learning_rate": 0.0002, + "loss": 1.6234, + "step": 3190 + }, + { + "epoch": 2.4835079549864183, + "grad_norm": 0.5203230381011963, + "learning_rate": 0.0002, + "loss": 1.5661, + "step": 3200 + }, + { + "epoch": 2.4912689173457507, + "grad_norm": 0.47209781408309937, + "learning_rate": 0.0002, + "loss": 1.493, + "step": 3210 + }, + { + "epoch": 2.4990298797050836, + "grad_norm": 0.5241674780845642, + "learning_rate": 0.0002, + "loss": 1.6415, + "step": 3220 + }, + { + "epoch": 2.506790842064416, + "grad_norm": 0.5152244567871094, + "learning_rate": 0.0002, + "loss": 1.6324, + "step": 3230 + }, + { + "epoch": 2.5145518044237485, + "grad_norm": 0.5216741561889648, + "learning_rate": 0.0002, + "loss": 1.6248, + "step": 3240 + }, + { + "epoch": 2.522312766783081, + "grad_norm": 0.4953259527683258, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 3250 + }, + { + "epoch": 2.530073729142414, + "grad_norm": 0.5973829030990601, + "learning_rate": 0.0002, + "loss": 1.666, + "step": 3260 + }, + { + "epoch": 2.5378346915017462, + "grad_norm": 0.48804202675819397, + "learning_rate": 0.0002, + "loss": 1.5295, + "step": 3270 + }, + { + "epoch": 2.5455956538610787, + "grad_norm": 0.5334644317626953, + "learning_rate": 0.0002, + "loss": 1.4954, + "step": 3280 + }, + { + "epoch": 2.5533566162204115, + "grad_norm": 0.46873313188552856, + "learning_rate": 0.0002, + "loss": 1.5814, + "step": 3290 + }, + { + "epoch": 2.561117578579744, + "grad_norm": 0.4282589554786682, + "learning_rate": 0.0002, + "loss": 1.5362, + "step": 3300 + }, + { + "epoch": 2.5688785409390764, + "grad_norm": 0.4848293960094452, + "learning_rate": 0.0002, + "loss": 1.6278, + "step": 3310 + }, + { + "epoch": 2.576639503298409, + "grad_norm": 0.5093745589256287, + "learning_rate": 0.0002, + "loss": 1.6308, + "step": 3320 + }, + { + "epoch": 2.5844004656577413, + "grad_norm": 0.5084842443466187, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 3330 + }, + { + "epoch": 2.592161428017074, + "grad_norm": 0.4696281850337982, + "learning_rate": 0.0002, + "loss": 1.6168, + "step": 3340 + }, + { + "epoch": 2.5999223903764066, + "grad_norm": 0.5767765641212463, + "learning_rate": 0.0002, + "loss": 1.5359, + "step": 3350 + }, + { + "epoch": 2.607683352735739, + "grad_norm": 0.47300875186920166, + "learning_rate": 0.0002, + "loss": 1.6097, + "step": 3360 + }, + { + "epoch": 2.615444315095072, + "grad_norm": 0.4809158146381378, + "learning_rate": 0.0002, + "loss": 1.6138, + "step": 3370 + }, + { + "epoch": 2.6232052774544043, + "grad_norm": 0.5141063928604126, + "learning_rate": 0.0002, + "loss": 1.4952, + "step": 3380 + }, + { + "epoch": 2.630966239813737, + "grad_norm": 0.4832935035228729, + "learning_rate": 0.0002, + "loss": 1.5784, + "step": 3390 + }, + { + "epoch": 2.6387272021730697, + "grad_norm": 0.5044625401496887, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 3400 + }, + { + "epoch": 2.646488164532402, + "grad_norm": 0.5287680625915527, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 3410 + }, + { + "epoch": 2.6542491268917345, + "grad_norm": 0.5306379795074463, + "learning_rate": 0.0002, + "loss": 1.5423, + "step": 3420 + }, + { + "epoch": 2.662010089251067, + "grad_norm": 0.5849291682243347, + "learning_rate": 0.0002, + "loss": 1.5264, + "step": 3430 + }, + { + "epoch": 2.6697710516104, + "grad_norm": 0.7951080799102783, + "learning_rate": 0.0002, + "loss": 1.5937, + "step": 3440 + }, + { + "epoch": 2.6775320139697323, + "grad_norm": 0.48087653517723083, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 3450 + }, + { + "epoch": 2.6852929763290647, + "grad_norm": 0.5396431684494019, + "learning_rate": 0.0002, + "loss": 1.6769, + "step": 3460 + }, + { + "epoch": 2.693053938688397, + "grad_norm": 0.5481634736061096, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 3470 + }, + { + "epoch": 2.70081490104773, + "grad_norm": 0.5068731307983398, + "learning_rate": 0.0002, + "loss": 1.6436, + "step": 3480 + }, + { + "epoch": 2.7085758634070625, + "grad_norm": 0.5759826898574829, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 3490 + }, + { + "epoch": 2.716336825766395, + "grad_norm": 0.7253932952880859, + "learning_rate": 0.0002, + "loss": 1.596, + "step": 3500 + }, + { + "epoch": 2.724097788125728, + "grad_norm": 0.527745246887207, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 3510 + }, + { + "epoch": 2.73185875048506, + "grad_norm": 0.5279242396354675, + "learning_rate": 0.0002, + "loss": 1.5874, + "step": 3520 + }, + { + "epoch": 2.7396197128443927, + "grad_norm": 0.5047839283943176, + "learning_rate": 0.0002, + "loss": 1.6768, + "step": 3530 + }, + { + "epoch": 2.7473806752037255, + "grad_norm": 0.5430883169174194, + "learning_rate": 0.0002, + "loss": 1.5517, + "step": 3540 + }, + { + "epoch": 2.755141637563058, + "grad_norm": 0.4496723711490631, + "learning_rate": 0.0002, + "loss": 1.5624, + "step": 3550 + }, + { + "epoch": 2.7629025999223904, + "grad_norm": 0.5063338875770569, + "learning_rate": 0.0002, + "loss": 1.5789, + "step": 3560 + }, + { + "epoch": 2.770663562281723, + "grad_norm": 0.4619026780128479, + "learning_rate": 0.0002, + "loss": 1.52, + "step": 3570 + }, + { + "epoch": 2.7784245246410553, + "grad_norm": 0.4753304123878479, + "learning_rate": 0.0002, + "loss": 1.5793, + "step": 3580 + }, + { + "epoch": 2.786185487000388, + "grad_norm": 0.5422708988189697, + "learning_rate": 0.0002, + "loss": 1.5715, + "step": 3590 + }, + { + "epoch": 2.7939464493597206, + "grad_norm": 0.4756578803062439, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 3600 + }, + { + "epoch": 2.801707411719053, + "grad_norm": 0.5057567358016968, + "learning_rate": 0.0002, + "loss": 1.5358, + "step": 3610 + }, + { + "epoch": 2.809468374078386, + "grad_norm": 0.5410919785499573, + "learning_rate": 0.0002, + "loss": 1.6131, + "step": 3620 + }, + { + "epoch": 2.8172293364377183, + "grad_norm": 0.4958136975765228, + "learning_rate": 0.0002, + "loss": 1.5573, + "step": 3630 + }, + { + "epoch": 2.8249902987970508, + "grad_norm": 0.454527348279953, + "learning_rate": 0.0002, + "loss": 1.6324, + "step": 3640 + }, + { + "epoch": 2.8327512611563836, + "grad_norm": 0.5092706084251404, + "learning_rate": 0.0002, + "loss": 1.5582, + "step": 3650 + }, + { + "epoch": 2.840512223515716, + "grad_norm": 0.5314022302627563, + "learning_rate": 0.0002, + "loss": 1.5893, + "step": 3660 + }, + { + "epoch": 2.8482731858750485, + "grad_norm": 0.5028239488601685, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 3670 + }, + { + "epoch": 2.856034148234381, + "grad_norm": 0.5127444863319397, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 3680 + }, + { + "epoch": 2.8637951105937134, + "grad_norm": 0.5045645236968994, + "learning_rate": 0.0002, + "loss": 1.6018, + "step": 3690 + }, + { + "epoch": 2.8715560729530463, + "grad_norm": 0.5560781955718994, + "learning_rate": 0.0002, + "loss": 1.5788, + "step": 3700 + }, + { + "epoch": 2.8793170353123787, + "grad_norm": 0.5177600383758545, + "learning_rate": 0.0002, + "loss": 1.5988, + "step": 3710 + }, + { + "epoch": 2.887077997671711, + "grad_norm": 0.45830899477005005, + "learning_rate": 0.0002, + "loss": 1.6009, + "step": 3720 + }, + { + "epoch": 2.894838960031044, + "grad_norm": 0.4828629195690155, + "learning_rate": 0.0002, + "loss": 1.6344, + "step": 3730 + }, + { + "epoch": 2.9025999223903765, + "grad_norm": 0.48241183161735535, + "learning_rate": 0.0002, + "loss": 1.6758, + "step": 3740 + }, + { + "epoch": 2.910360884749709, + "grad_norm": 0.4909592568874359, + "learning_rate": 0.0002, + "loss": 1.5649, + "step": 3750 + }, + { + "epoch": 2.9181218471090418, + "grad_norm": 0.44677025079727173, + "learning_rate": 0.0002, + "loss": 1.4927, + "step": 3760 + }, + { + "epoch": 2.925882809468374, + "grad_norm": 0.4928834140300751, + "learning_rate": 0.0002, + "loss": 1.5067, + "step": 3770 + }, + { + "epoch": 2.9336437718277066, + "grad_norm": 0.5673553347587585, + "learning_rate": 0.0002, + "loss": 1.5843, + "step": 3780 + }, + { + "epoch": 2.941404734187039, + "grad_norm": 0.548190712928772, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 3790 + }, + { + "epoch": 2.9491656965463715, + "grad_norm": 0.48979803919792175, + "learning_rate": 0.0002, + "loss": 1.5892, + "step": 3800 + }, + { + "epoch": 2.9569266589057044, + "grad_norm": 0.533191978931427, + "learning_rate": 0.0002, + "loss": 1.5589, + "step": 3810 + }, + { + "epoch": 2.964687621265037, + "grad_norm": 0.5362946391105652, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 3820 + }, + { + "epoch": 2.9724485836243693, + "grad_norm": 0.4724906384944916, + "learning_rate": 0.0002, + "loss": 1.6602, + "step": 3830 + }, + { + "epoch": 2.980209545983702, + "grad_norm": 0.5468461513519287, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 3840 + }, + { + "epoch": 2.9879705083430346, + "grad_norm": 0.4697108864784241, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 3850 + }, + { + "epoch": 2.995731470702367, + "grad_norm": 0.4780906140804291, + "learning_rate": 0.0002, + "loss": 1.6312, + "step": 3860 + }, + { + "epoch": 2.9996119518820334, + "eval_loss": 1.8472607135772705, + "eval_runtime": 106.5541, + "eval_samples_per_second": 4.758, + "eval_steps_per_second": 0.601, + "step": 3865 + }, + { + "epoch": 3.0034924330616994, + "grad_norm": 0.5645653605461121, + "learning_rate": 0.0002, + "loss": 1.4983, + "step": 3870 + }, + { + "epoch": 3.0112533954210323, + "grad_norm": 0.6457151174545288, + "learning_rate": 0.0002, + "loss": 1.4334, + "step": 3880 + }, + { + "epoch": 3.0190143577803648, + "grad_norm": 0.583838164806366, + "learning_rate": 0.0002, + "loss": 1.3899, + "step": 3890 + }, + { + "epoch": 3.026775320139697, + "grad_norm": 0.6819260120391846, + "learning_rate": 0.0002, + "loss": 1.3258, + "step": 3900 + }, + { + "epoch": 3.03453628249903, + "grad_norm": 0.6692903637886047, + "learning_rate": 0.0002, + "loss": 1.3458, + "step": 3910 + }, + { + "epoch": 3.0422972448583625, + "grad_norm": 0.6101024746894836, + "learning_rate": 0.0002, + "loss": 1.4356, + "step": 3920 + }, + { + "epoch": 3.050058207217695, + "grad_norm": 0.7014093399047852, + "learning_rate": 0.0002, + "loss": 1.394, + "step": 3930 + }, + { + "epoch": 3.0578191695770274, + "grad_norm": 0.7380381226539612, + "learning_rate": 0.0002, + "loss": 1.3885, + "step": 3940 + }, + { + "epoch": 3.0655801319363603, + "grad_norm": 0.6607900857925415, + "learning_rate": 0.0002, + "loss": 1.4206, + "step": 3950 + }, + { + "epoch": 3.0733410942956927, + "grad_norm": 0.735263466835022, + "learning_rate": 0.0002, + "loss": 1.4293, + "step": 3960 + }, + { + "epoch": 3.081102056655025, + "grad_norm": 0.6788513660430908, + "learning_rate": 0.0002, + "loss": 1.3966, + "step": 3970 + }, + { + "epoch": 3.088863019014358, + "grad_norm": 0.6347652673721313, + "learning_rate": 0.0002, + "loss": 1.3435, + "step": 3980 + }, + { + "epoch": 3.0966239813736904, + "grad_norm": 0.7056642770767212, + "learning_rate": 0.0002, + "loss": 1.4518, + "step": 3990 + }, + { + "epoch": 3.104384943733023, + "grad_norm": 0.6387075185775757, + "learning_rate": 0.0002, + "loss": 1.4474, + "step": 4000 + }, + { + "epoch": 3.1121459060923553, + "grad_norm": 0.6701116561889648, + "learning_rate": 0.0002, + "loss": 1.3833, + "step": 4010 + }, + { + "epoch": 3.119906868451688, + "grad_norm": 0.7558449506759644, + "learning_rate": 0.0002, + "loss": 1.404, + "step": 4020 + }, + { + "epoch": 3.1276678308110206, + "grad_norm": 0.6612881422042847, + "learning_rate": 0.0002, + "loss": 1.3294, + "step": 4030 + }, + { + "epoch": 3.135428793170353, + "grad_norm": 0.7474587559700012, + "learning_rate": 0.0002, + "loss": 1.439, + "step": 4040 + }, + { + "epoch": 3.1431897555296855, + "grad_norm": 0.7292373776435852, + "learning_rate": 0.0002, + "loss": 1.4616, + "step": 4050 + }, + { + "epoch": 3.1509507178890184, + "grad_norm": 0.7432886958122253, + "learning_rate": 0.0002, + "loss": 1.3908, + "step": 4060 + }, + { + "epoch": 3.158711680248351, + "grad_norm": 0.6366098523139954, + "learning_rate": 0.0002, + "loss": 1.4214, + "step": 4070 + }, + { + "epoch": 3.1664726426076832, + "grad_norm": 0.6837611794471741, + "learning_rate": 0.0002, + "loss": 1.5044, + "step": 4080 + }, + { + "epoch": 3.174233604967016, + "grad_norm": 0.7194393277168274, + "learning_rate": 0.0002, + "loss": 1.4332, + "step": 4090 + }, + { + "epoch": 3.1819945673263486, + "grad_norm": 0.6963607668876648, + "learning_rate": 0.0002, + "loss": 1.3628, + "step": 4100 + }, + { + "epoch": 3.189755529685681, + "grad_norm": 0.6404902935028076, + "learning_rate": 0.0002, + "loss": 1.4127, + "step": 4110 + }, + { + "epoch": 3.1975164920450134, + "grad_norm": 0.7172070741653442, + "learning_rate": 0.0002, + "loss": 1.4394, + "step": 4120 + }, + { + "epoch": 3.2052774544043463, + "grad_norm": 0.6577759385108948, + "learning_rate": 0.0002, + "loss": 1.4658, + "step": 4130 + }, + { + "epoch": 3.2130384167636787, + "grad_norm": 0.6658480167388916, + "learning_rate": 0.0002, + "loss": 1.4019, + "step": 4140 + }, + { + "epoch": 3.220799379123011, + "grad_norm": 0.6771699786186218, + "learning_rate": 0.0002, + "loss": 1.4348, + "step": 4150 + }, + { + "epoch": 3.2285603414823436, + "grad_norm": 0.699035108089447, + "learning_rate": 0.0002, + "loss": 1.4736, + "step": 4160 + }, + { + "epoch": 3.2363213038416765, + "grad_norm": 0.7218514680862427, + "learning_rate": 0.0002, + "loss": 1.4096, + "step": 4170 + }, + { + "epoch": 3.244082266201009, + "grad_norm": 0.6270631551742554, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 4180 + }, + { + "epoch": 3.2518432285603414, + "grad_norm": 0.6828921437263489, + "learning_rate": 0.0002, + "loss": 1.4076, + "step": 4190 + }, + { + "epoch": 3.2596041909196742, + "grad_norm": 0.6005498170852661, + "learning_rate": 0.0002, + "loss": 1.4663, + "step": 4200 + }, + { + "epoch": 3.2673651532790067, + "grad_norm": 0.6974790692329407, + "learning_rate": 0.0002, + "loss": 1.4798, + "step": 4210 + }, + { + "epoch": 3.275126115638339, + "grad_norm": 0.7269543409347534, + "learning_rate": 0.0002, + "loss": 1.5012, + "step": 4220 + }, + { + "epoch": 3.2828870779976715, + "grad_norm": 0.6728787422180176, + "learning_rate": 0.0002, + "loss": 1.3848, + "step": 4230 + }, + { + "epoch": 3.2906480403570044, + "grad_norm": 0.676972508430481, + "learning_rate": 0.0002, + "loss": 1.4112, + "step": 4240 + }, + { + "epoch": 3.298409002716337, + "grad_norm": 0.748309314250946, + "learning_rate": 0.0002, + "loss": 1.4206, + "step": 4250 + }, + { + "epoch": 3.3061699650756693, + "grad_norm": 0.6976589560508728, + "learning_rate": 0.0002, + "loss": 1.4973, + "step": 4260 + }, + { + "epoch": 3.3139309274350017, + "grad_norm": 0.649780809879303, + "learning_rate": 0.0002, + "loss": 1.3967, + "step": 4270 + }, + { + "epoch": 3.3216918897943346, + "grad_norm": 0.6529902815818787, + "learning_rate": 0.0002, + "loss": 1.327, + "step": 4280 + }, + { + "epoch": 3.329452852153667, + "grad_norm": 0.9273163676261902, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 4290 + }, + { + "epoch": 3.3372138145129995, + "grad_norm": 0.717024028301239, + "learning_rate": 0.0002, + "loss": 1.4859, + "step": 4300 + }, + { + "epoch": 3.3449747768723324, + "grad_norm": 0.7914950251579285, + "learning_rate": 0.0002, + "loss": 1.4441, + "step": 4310 + }, + { + "epoch": 3.352735739231665, + "grad_norm": 0.7133203148841858, + "learning_rate": 0.0002, + "loss": 1.432, + "step": 4320 + }, + { + "epoch": 3.3604967015909972, + "grad_norm": 0.7409568428993225, + "learning_rate": 0.0002, + "loss": 1.4662, + "step": 4330 + }, + { + "epoch": 3.3682576639503297, + "grad_norm": 0.6993981003761292, + "learning_rate": 0.0002, + "loss": 1.3992, + "step": 4340 + }, + { + "epoch": 3.3760186263096625, + "grad_norm": 0.7114535570144653, + "learning_rate": 0.0002, + "loss": 1.4261, + "step": 4350 + }, + { + "epoch": 3.383779588668995, + "grad_norm": 0.6790860295295715, + "learning_rate": 0.0002, + "loss": 1.4227, + "step": 4360 + }, + { + "epoch": 3.3915405510283274, + "grad_norm": 0.6507849097251892, + "learning_rate": 0.0002, + "loss": 1.4128, + "step": 4370 + }, + { + "epoch": 3.39930151338766, + "grad_norm": 0.5967804193496704, + "learning_rate": 0.0002, + "loss": 1.4559, + "step": 4380 + }, + { + "epoch": 3.4070624757469927, + "grad_norm": 0.6625847816467285, + "learning_rate": 0.0002, + "loss": 1.3687, + "step": 4390 + }, + { + "epoch": 3.414823438106325, + "grad_norm": 0.6736508011817932, + "learning_rate": 0.0002, + "loss": 1.4193, + "step": 4400 + }, + { + "epoch": 3.4225844004656576, + "grad_norm": 0.7870860695838928, + "learning_rate": 0.0002, + "loss": 1.4363, + "step": 4410 + }, + { + "epoch": 3.4303453628249905, + "grad_norm": 0.7205295562744141, + "learning_rate": 0.0002, + "loss": 1.4114, + "step": 4420 + }, + { + "epoch": 3.438106325184323, + "grad_norm": 0.6634634137153625, + "learning_rate": 0.0002, + "loss": 1.4131, + "step": 4430 + }, + { + "epoch": 3.4458672875436553, + "grad_norm": 0.7562733292579651, + "learning_rate": 0.0002, + "loss": 1.4683, + "step": 4440 + }, + { + "epoch": 3.453628249902988, + "grad_norm": 0.6585879921913147, + "learning_rate": 0.0002, + "loss": 1.3486, + "step": 4450 + }, + { + "epoch": 3.4613892122623207, + "grad_norm": 0.6896792054176331, + "learning_rate": 0.0002, + "loss": 1.4283, + "step": 4460 + }, + { + "epoch": 3.469150174621653, + "grad_norm": 0.6520342230796814, + "learning_rate": 0.0002, + "loss": 1.4208, + "step": 4470 + }, + { + "epoch": 3.4769111369809855, + "grad_norm": 0.6760806441307068, + "learning_rate": 0.0002, + "loss": 1.3423, + "step": 4480 + }, + { + "epoch": 3.484672099340318, + "grad_norm": 0.7539774179458618, + "learning_rate": 0.0002, + "loss": 1.4398, + "step": 4490 + }, + { + "epoch": 3.492433061699651, + "grad_norm": 0.7409411668777466, + "learning_rate": 0.0002, + "loss": 1.4534, + "step": 4500 + }, + { + "epoch": 3.5001940240589833, + "grad_norm": 0.6876253485679626, + "learning_rate": 0.0002, + "loss": 1.4069, + "step": 4510 + }, + { + "epoch": 3.5079549864183157, + "grad_norm": 0.7028461694717407, + "learning_rate": 0.0002, + "loss": 1.4228, + "step": 4520 + }, + { + "epoch": 3.5157159487776486, + "grad_norm": 0.8056529760360718, + "learning_rate": 0.0002, + "loss": 1.4723, + "step": 4530 + }, + { + "epoch": 3.523476911136981, + "grad_norm": 0.711338996887207, + "learning_rate": 0.0002, + "loss": 1.4148, + "step": 4540 + }, + { + "epoch": 3.5312378734963135, + "grad_norm": 0.7343552708625793, + "learning_rate": 0.0002, + "loss": 1.5247, + "step": 4550 + }, + { + "epoch": 3.5389988358556463, + "grad_norm": 0.745479941368103, + "learning_rate": 0.0002, + "loss": 1.4308, + "step": 4560 + }, + { + "epoch": 3.5467597982149788, + "grad_norm": 0.7582294940948486, + "learning_rate": 0.0002, + "loss": 1.4229, + "step": 4570 + }, + { + "epoch": 3.554520760574311, + "grad_norm": 0.6717444658279419, + "learning_rate": 0.0002, + "loss": 1.4127, + "step": 4580 + }, + { + "epoch": 3.5622817229336436, + "grad_norm": 0.7417883276939392, + "learning_rate": 0.0002, + "loss": 1.4368, + "step": 4590 + }, + { + "epoch": 3.570042685292976, + "grad_norm": 0.6385737061500549, + "learning_rate": 0.0002, + "loss": 1.4176, + "step": 4600 + }, + { + "epoch": 3.577803647652309, + "grad_norm": 0.716704249382019, + "learning_rate": 0.0002, + "loss": 1.3981, + "step": 4610 + }, + { + "epoch": 3.5855646100116414, + "grad_norm": 0.6948980093002319, + "learning_rate": 0.0002, + "loss": 1.3889, + "step": 4620 + }, + { + "epoch": 3.593325572370974, + "grad_norm": 0.6961140036582947, + "learning_rate": 0.0002, + "loss": 1.5177, + "step": 4630 + }, + { + "epoch": 3.6010865347303067, + "grad_norm": 0.7493122220039368, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 4640 + }, + { + "epoch": 3.608847497089639, + "grad_norm": 0.7431658506393433, + "learning_rate": 0.0002, + "loss": 1.3987, + "step": 4650 + }, + { + "epoch": 3.6166084594489716, + "grad_norm": 0.8353387713432312, + "learning_rate": 0.0002, + "loss": 1.4551, + "step": 4660 + }, + { + "epoch": 3.6243694218083045, + "grad_norm": 0.7095612287521362, + "learning_rate": 0.0002, + "loss": 1.4533, + "step": 4670 + }, + { + "epoch": 3.632130384167637, + "grad_norm": 0.776620090007782, + "learning_rate": 0.0002, + "loss": 1.4003, + "step": 4680 + }, + { + "epoch": 3.6398913465269693, + "grad_norm": 0.7198925018310547, + "learning_rate": 0.0002, + "loss": 1.4361, + "step": 4690 + }, + { + "epoch": 3.6476523088863018, + "grad_norm": 0.8238834738731384, + "learning_rate": 0.0002, + "loss": 1.4543, + "step": 4700 + }, + { + "epoch": 3.655413271245634, + "grad_norm": 0.6804245710372925, + "learning_rate": 0.0002, + "loss": 1.3958, + "step": 4710 + }, + { + "epoch": 3.663174233604967, + "grad_norm": 0.8444845676422119, + "learning_rate": 0.0002, + "loss": 1.4158, + "step": 4720 + }, + { + "epoch": 3.6709351959642995, + "grad_norm": 0.743797779083252, + "learning_rate": 0.0002, + "loss": 1.3825, + "step": 4730 + }, + { + "epoch": 3.678696158323632, + "grad_norm": 0.8994188904762268, + "learning_rate": 0.0002, + "loss": 1.4213, + "step": 4740 + }, + { + "epoch": 3.686457120682965, + "grad_norm": 0.75416100025177, + "learning_rate": 0.0002, + "loss": 1.4281, + "step": 4750 + }, + { + "epoch": 3.6942180830422973, + "grad_norm": 0.6499266028404236, + "learning_rate": 0.0002, + "loss": 1.4154, + "step": 4760 + }, + { + "epoch": 3.7019790454016297, + "grad_norm": 0.7246791124343872, + "learning_rate": 0.0002, + "loss": 1.4005, + "step": 4770 + }, + { + "epoch": 3.7097400077609626, + "grad_norm": 0.7831124067306519, + "learning_rate": 0.0002, + "loss": 1.426, + "step": 4780 + }, + { + "epoch": 3.717500970120295, + "grad_norm": 0.7130028009414673, + "learning_rate": 0.0002, + "loss": 1.3933, + "step": 4790 + }, + { + "epoch": 3.7252619324796274, + "grad_norm": 0.7501602172851562, + "learning_rate": 0.0002, + "loss": 1.4632, + "step": 4800 + }, + { + "epoch": 3.73302289483896, + "grad_norm": 0.6980932950973511, + "learning_rate": 0.0002, + "loss": 1.4985, + "step": 4810 + }, + { + "epoch": 3.7407838571982923, + "grad_norm": 0.8050530552864075, + "learning_rate": 0.0002, + "loss": 1.4517, + "step": 4820 + }, + { + "epoch": 3.748544819557625, + "grad_norm": 0.6385579705238342, + "learning_rate": 0.0002, + "loss": 1.4703, + "step": 4830 + }, + { + "epoch": 3.7563057819169576, + "grad_norm": 0.6664714813232422, + "learning_rate": 0.0002, + "loss": 1.5281, + "step": 4840 + }, + { + "epoch": 3.76406674427629, + "grad_norm": 0.7125676274299622, + "learning_rate": 0.0002, + "loss": 1.4443, + "step": 4850 + }, + { + "epoch": 3.771827706635623, + "grad_norm": 0.7231866717338562, + "learning_rate": 0.0002, + "loss": 1.3958, + "step": 4860 + }, + { + "epoch": 3.7795886689949554, + "grad_norm": 0.6917183995246887, + "learning_rate": 0.0002, + "loss": 1.4446, + "step": 4870 + }, + { + "epoch": 3.787349631354288, + "grad_norm": 0.665037989616394, + "learning_rate": 0.0002, + "loss": 1.4369, + "step": 4880 + }, + { + "epoch": 3.7951105937136207, + "grad_norm": 0.5837726593017578, + "learning_rate": 0.0002, + "loss": 1.4193, + "step": 4890 + }, + { + "epoch": 3.802871556072953, + "grad_norm": 0.6366701722145081, + "learning_rate": 0.0002, + "loss": 1.4176, + "step": 4900 + }, + { + "epoch": 3.8106325184322856, + "grad_norm": 0.7082223892211914, + "learning_rate": 0.0002, + "loss": 1.46, + "step": 4910 + }, + { + "epoch": 3.818393480791618, + "grad_norm": 0.8101672530174255, + "learning_rate": 0.0002, + "loss": 1.5139, + "step": 4920 + }, + { + "epoch": 3.826154443150951, + "grad_norm": 0.7516148090362549, + "learning_rate": 0.0002, + "loss": 1.3659, + "step": 4930 + }, + { + "epoch": 3.8339154055102833, + "grad_norm": 0.7928489446640015, + "learning_rate": 0.0002, + "loss": 1.3909, + "step": 4940 + }, + { + "epoch": 3.8416763678696157, + "grad_norm": 0.6892234683036804, + "learning_rate": 0.0002, + "loss": 1.4255, + "step": 4950 + }, + { + "epoch": 3.849437330228948, + "grad_norm": 0.6381304264068604, + "learning_rate": 0.0002, + "loss": 1.5024, + "step": 4960 + }, + { + "epoch": 3.857198292588281, + "grad_norm": 0.8068831562995911, + "learning_rate": 0.0002, + "loss": 1.4873, + "step": 4970 + }, + { + "epoch": 3.8649592549476135, + "grad_norm": 0.7289869785308838, + "learning_rate": 0.0002, + "loss": 1.45, + "step": 4980 + }, + { + "epoch": 3.872720217306946, + "grad_norm": 0.7278549075126648, + "learning_rate": 0.0002, + "loss": 1.398, + "step": 4990 + }, + { + "epoch": 3.880481179666279, + "grad_norm": 0.7324236631393433, + "learning_rate": 0.0002, + "loss": 1.4442, + "step": 5000 + }, + { + "epoch": 3.8882421420256112, + "grad_norm": 0.6759871244430542, + "learning_rate": 0.0002, + "loss": 1.4511, + "step": 5010 + }, + { + "epoch": 3.8960031043849437, + "grad_norm": 0.8159207701683044, + "learning_rate": 0.0002, + "loss": 1.4705, + "step": 5020 + }, + { + "epoch": 3.9037640667442766, + "grad_norm": 0.6536211967468262, + "learning_rate": 0.0002, + "loss": 1.4685, + "step": 5030 + }, + { + "epoch": 3.911525029103609, + "grad_norm": 0.6827932000160217, + "learning_rate": 0.0002, + "loss": 1.4335, + "step": 5040 + }, + { + "epoch": 3.9192859914629414, + "grad_norm": 0.6688340306282043, + "learning_rate": 0.0002, + "loss": 1.433, + "step": 5050 + }, + { + "epoch": 3.927046953822274, + "grad_norm": 0.6385695934295654, + "learning_rate": 0.0002, + "loss": 1.4099, + "step": 5060 + }, + { + "epoch": 3.9348079161816063, + "grad_norm": 0.6975107192993164, + "learning_rate": 0.0002, + "loss": 1.4767, + "step": 5070 + }, + { + "epoch": 3.942568878540939, + "grad_norm": 0.6684112548828125, + "learning_rate": 0.0002, + "loss": 1.4893, + "step": 5080 + }, + { + "epoch": 3.9503298409002716, + "grad_norm": 0.8349628448486328, + "learning_rate": 0.0002, + "loss": 1.4732, + "step": 5090 + }, + { + "epoch": 3.958090803259604, + "grad_norm": 0.7146425843238831, + "learning_rate": 0.0002, + "loss": 1.5131, + "step": 5100 + }, + { + "epoch": 3.965851765618937, + "grad_norm": 0.6555036902427673, + "learning_rate": 0.0002, + "loss": 1.4149, + "step": 5110 + }, + { + "epoch": 3.9736127279782694, + "grad_norm": 0.7037415504455566, + "learning_rate": 0.0002, + "loss": 1.4274, + "step": 5120 + }, + { + "epoch": 3.981373690337602, + "grad_norm": 0.7235575914382935, + "learning_rate": 0.0002, + "loss": 1.4292, + "step": 5130 + }, + { + "epoch": 3.9891346526969347, + "grad_norm": 0.7092325687408447, + "learning_rate": 0.0002, + "loss": 1.4455, + "step": 5140 + }, + { + "epoch": 3.996895615056267, + "grad_norm": 0.7490319609642029, + "learning_rate": 0.0002, + "loss": 1.4512, + "step": 5150 + }, + { + "epoch": 4.0, + "eval_loss": 1.9131355285644531, + "eval_runtime": 105.5778, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 0.606, + "step": 5154 + }, + { + "epoch": 4.0046565774155995, + "grad_norm": 0.7075854539871216, + "learning_rate": 0.0002, + "loss": 1.2643, + "step": 5160 + }, + { + "epoch": 4.012417539774932, + "grad_norm": 0.9466007351875305, + "learning_rate": 0.0002, + "loss": 1.209, + "step": 5170 + }, + { + "epoch": 4.020178502134264, + "grad_norm": 1.0297044515609741, + "learning_rate": 0.0002, + "loss": 1.2567, + "step": 5180 + }, + { + "epoch": 4.027939464493597, + "grad_norm": 0.7765059471130371, + "learning_rate": 0.0002, + "loss": 1.1796, + "step": 5190 + }, + { + "epoch": 4.03570042685293, + "grad_norm": 0.995760977268219, + "learning_rate": 0.0002, + "loss": 1.2356, + "step": 5200 + }, + { + "epoch": 4.043461389212262, + "grad_norm": 0.8663829565048218, + "learning_rate": 0.0002, + "loss": 1.1792, + "step": 5210 + }, + { + "epoch": 4.051222351571595, + "grad_norm": 1.0660825967788696, + "learning_rate": 0.0002, + "loss": 1.2471, + "step": 5220 + }, + { + "epoch": 4.058983313930927, + "grad_norm": 0.9858174920082092, + "learning_rate": 0.0002, + "loss": 1.1676, + "step": 5230 + }, + { + "epoch": 4.06674427629026, + "grad_norm": 0.8911338448524475, + "learning_rate": 0.0002, + "loss": 1.2448, + "step": 5240 + }, + { + "epoch": 4.074505238649593, + "grad_norm": 1.0848394632339478, + "learning_rate": 0.0002, + "loss": 1.1858, + "step": 5250 + }, + { + "epoch": 4.082266201008925, + "grad_norm": 1.0849905014038086, + "learning_rate": 0.0002, + "loss": 1.1684, + "step": 5260 + }, + { + "epoch": 4.090027163368258, + "grad_norm": 1.0497841835021973, + "learning_rate": 0.0002, + "loss": 1.2007, + "step": 5270 + }, + { + "epoch": 4.0977881257275905, + "grad_norm": 0.8943053483963013, + "learning_rate": 0.0002, + "loss": 1.2552, + "step": 5280 + }, + { + "epoch": 4.1055490880869225, + "grad_norm": 0.8432527184486389, + "learning_rate": 0.0002, + "loss": 1.1923, + "step": 5290 + }, + { + "epoch": 4.113310050446255, + "grad_norm": 0.9690414667129517, + "learning_rate": 0.0002, + "loss": 1.1634, + "step": 5300 + }, + { + "epoch": 4.121071012805588, + "grad_norm": 0.7790773510932922, + "learning_rate": 0.0002, + "loss": 1.3019, + "step": 5310 + }, + { + "epoch": 4.12883197516492, + "grad_norm": 0.9289211630821228, + "learning_rate": 0.0002, + "loss": 1.1806, + "step": 5320 + }, + { + "epoch": 4.136592937524253, + "grad_norm": 1.0785125494003296, + "learning_rate": 0.0002, + "loss": 1.1458, + "step": 5330 + }, + { + "epoch": 4.144353899883585, + "grad_norm": 0.8559591770172119, + "learning_rate": 0.0002, + "loss": 1.2086, + "step": 5340 + }, + { + "epoch": 4.152114862242918, + "grad_norm": 0.9405956268310547, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 5350 + }, + { + "epoch": 4.159875824602251, + "grad_norm": 0.9942827820777893, + "learning_rate": 0.0002, + "loss": 1.1793, + "step": 5360 + }, + { + "epoch": 4.167636786961583, + "grad_norm": 0.9141933917999268, + "learning_rate": 0.0002, + "loss": 1.1659, + "step": 5370 + }, + { + "epoch": 4.175397749320916, + "grad_norm": 0.8206015229225159, + "learning_rate": 0.0002, + "loss": 1.1647, + "step": 5380 + }, + { + "epoch": 4.183158711680249, + "grad_norm": 0.9340888857841492, + "learning_rate": 0.0002, + "loss": 1.2778, + "step": 5390 + }, + { + "epoch": 4.190919674039581, + "grad_norm": 1.2122114896774292, + "learning_rate": 0.0002, + "loss": 1.2459, + "step": 5400 + }, + { + "epoch": 4.1986806363989135, + "grad_norm": 1.0661298036575317, + "learning_rate": 0.0002, + "loss": 1.2371, + "step": 5410 + }, + { + "epoch": 4.206441598758246, + "grad_norm": 0.9372861385345459, + "learning_rate": 0.0002, + "loss": 1.1978, + "step": 5420 + }, + { + "epoch": 4.214202561117578, + "grad_norm": 0.894012987613678, + "learning_rate": 0.0002, + "loss": 1.2653, + "step": 5430 + }, + { + "epoch": 4.221963523476911, + "grad_norm": 1.0647753477096558, + "learning_rate": 0.0002, + "loss": 1.387, + "step": 5440 + }, + { + "epoch": 4.229724485836243, + "grad_norm": 0.989179790019989, + "learning_rate": 0.0002, + "loss": 1.2231, + "step": 5450 + }, + { + "epoch": 4.237485448195576, + "grad_norm": 1.1601181030273438, + "learning_rate": 0.0002, + "loss": 1.2715, + "step": 5460 + }, + { + "epoch": 4.245246410554909, + "grad_norm": 0.9395585656166077, + "learning_rate": 0.0002, + "loss": 1.2406, + "step": 5470 + }, + { + "epoch": 4.253007372914241, + "grad_norm": 0.9527766108512878, + "learning_rate": 0.0002, + "loss": 1.2779, + "step": 5480 + }, + { + "epoch": 4.260768335273574, + "grad_norm": 1.0319520235061646, + "learning_rate": 0.0002, + "loss": 1.267, + "step": 5490 + }, + { + "epoch": 4.268529297632907, + "grad_norm": 0.8659824728965759, + "learning_rate": 0.0002, + "loss": 1.2633, + "step": 5500 + }, + { + "epoch": 4.276290259992239, + "grad_norm": 1.099211573600769, + "learning_rate": 0.0002, + "loss": 1.1475, + "step": 5510 + }, + { + "epoch": 4.284051222351572, + "grad_norm": 0.9363361597061157, + "learning_rate": 0.0002, + "loss": 1.2508, + "step": 5520 + }, + { + "epoch": 4.2918121847109045, + "grad_norm": 0.8437647223472595, + "learning_rate": 0.0002, + "loss": 1.189, + "step": 5530 + }, + { + "epoch": 4.2995731470702365, + "grad_norm": 0.9181258678436279, + "learning_rate": 0.0002, + "loss": 1.2212, + "step": 5540 + }, + { + "epoch": 4.307334109429569, + "grad_norm": 0.9059357643127441, + "learning_rate": 0.0002, + "loss": 1.2092, + "step": 5550 + }, + { + "epoch": 4.315095071788901, + "grad_norm": 0.9337241649627686, + "learning_rate": 0.0002, + "loss": 1.2189, + "step": 5560 + }, + { + "epoch": 4.322856034148234, + "grad_norm": 0.9428889155387878, + "learning_rate": 0.0002, + "loss": 1.2462, + "step": 5570 + }, + { + "epoch": 4.330616996507567, + "grad_norm": 1.003589153289795, + "learning_rate": 0.0002, + "loss": 1.2675, + "step": 5580 + }, + { + "epoch": 4.338377958866899, + "grad_norm": 1.1249268054962158, + "learning_rate": 0.0002, + "loss": 1.2703, + "step": 5590 + }, + { + "epoch": 4.346138921226232, + "grad_norm": 0.8623469471931458, + "learning_rate": 0.0002, + "loss": 1.2501, + "step": 5600 + }, + { + "epoch": 4.353899883585565, + "grad_norm": 1.1389174461364746, + "learning_rate": 0.0002, + "loss": 1.2404, + "step": 5610 + }, + { + "epoch": 4.361660845944897, + "grad_norm": 1.0136264562606812, + "learning_rate": 0.0002, + "loss": 1.2245, + "step": 5620 + }, + { + "epoch": 4.36942180830423, + "grad_norm": 0.9567070603370667, + "learning_rate": 0.0002, + "loss": 1.3473, + "step": 5630 + }, + { + "epoch": 4.377182770663563, + "grad_norm": 1.0592148303985596, + "learning_rate": 0.0002, + "loss": 1.2988, + "step": 5640 + }, + { + "epoch": 4.384943733022895, + "grad_norm": 1.0110485553741455, + "learning_rate": 0.0002, + "loss": 1.212, + "step": 5650 + }, + { + "epoch": 4.3927046953822275, + "grad_norm": 0.9914907217025757, + "learning_rate": 0.0002, + "loss": 1.2086, + "step": 5660 + }, + { + "epoch": 4.4004656577415595, + "grad_norm": 0.9447247982025146, + "learning_rate": 0.0002, + "loss": 1.2363, + "step": 5670 + }, + { + "epoch": 4.408226620100892, + "grad_norm": 0.9644378423690796, + "learning_rate": 0.0002, + "loss": 1.2617, + "step": 5680 + }, + { + "epoch": 4.415987582460225, + "grad_norm": 0.920676589012146, + "learning_rate": 0.0002, + "loss": 1.2773, + "step": 5690 + }, + { + "epoch": 4.423748544819557, + "grad_norm": 1.060570478439331, + "learning_rate": 0.0002, + "loss": 1.2792, + "step": 5700 + }, + { + "epoch": 4.43150950717889, + "grad_norm": 0.8857738971710205, + "learning_rate": 0.0002, + "loss": 1.2374, + "step": 5710 + }, + { + "epoch": 4.439270469538223, + "grad_norm": 1.0536398887634277, + "learning_rate": 0.0002, + "loss": 1.2588, + "step": 5720 + }, + { + "epoch": 4.447031431897555, + "grad_norm": 0.990847110748291, + "learning_rate": 0.0002, + "loss": 1.2051, + "step": 5730 + }, + { + "epoch": 4.454792394256888, + "grad_norm": 0.9692499041557312, + "learning_rate": 0.0002, + "loss": 1.2469, + "step": 5740 + }, + { + "epoch": 4.462553356616221, + "grad_norm": 1.0376402139663696, + "learning_rate": 0.0002, + "loss": 1.2269, + "step": 5750 + }, + { + "epoch": 4.470314318975553, + "grad_norm": 1.3863259553909302, + "learning_rate": 0.0002, + "loss": 1.1701, + "step": 5760 + }, + { + "epoch": 4.478075281334886, + "grad_norm": 0.978379487991333, + "learning_rate": 0.0002, + "loss": 1.2591, + "step": 5770 + }, + { + "epoch": 4.485836243694218, + "grad_norm": 1.0973085165023804, + "learning_rate": 0.0002, + "loss": 1.2729, + "step": 5780 + }, + { + "epoch": 4.4935972060535505, + "grad_norm": 1.057006597518921, + "learning_rate": 0.0002, + "loss": 1.2404, + "step": 5790 + }, + { + "epoch": 4.501358168412883, + "grad_norm": 0.9247729182243347, + "learning_rate": 0.0002, + "loss": 1.2476, + "step": 5800 + }, + { + "epoch": 4.509119130772215, + "grad_norm": 1.0447787046432495, + "learning_rate": 0.0002, + "loss": 1.2369, + "step": 5810 + }, + { + "epoch": 4.516880093131548, + "grad_norm": 1.1930429935455322, + "learning_rate": 0.0002, + "loss": 1.211, + "step": 5820 + }, + { + "epoch": 4.524641055490881, + "grad_norm": 0.9867590069770813, + "learning_rate": 0.0002, + "loss": 1.2596, + "step": 5830 + }, + { + "epoch": 4.532402017850213, + "grad_norm": 0.9591100215911865, + "learning_rate": 0.0002, + "loss": 1.2766, + "step": 5840 + }, + { + "epoch": 4.540162980209546, + "grad_norm": 0.9950753450393677, + "learning_rate": 0.0002, + "loss": 1.2154, + "step": 5850 + }, + { + "epoch": 4.547923942568879, + "grad_norm": 1.0087506771087646, + "learning_rate": 0.0002, + "loss": 1.2149, + "step": 5860 + }, + { + "epoch": 4.555684904928211, + "grad_norm": 1.0934417247772217, + "learning_rate": 0.0002, + "loss": 1.3165, + "step": 5870 + }, + { + "epoch": 4.563445867287544, + "grad_norm": 1.107987403869629, + "learning_rate": 0.0002, + "loss": 1.3059, + "step": 5880 + }, + { + "epoch": 4.571206829646876, + "grad_norm": 0.9147276878356934, + "learning_rate": 0.0002, + "loss": 1.2184, + "step": 5890 + }, + { + "epoch": 4.578967792006209, + "grad_norm": 1.036780595779419, + "learning_rate": 0.0002, + "loss": 1.24, + "step": 5900 + }, + { + "epoch": 4.5867287543655415, + "grad_norm": 0.9284719824790955, + "learning_rate": 0.0002, + "loss": 1.2209, + "step": 5910 + }, + { + "epoch": 4.5944897167248735, + "grad_norm": 0.9141898155212402, + "learning_rate": 0.0002, + "loss": 1.3693, + "step": 5920 + }, + { + "epoch": 4.602250679084206, + "grad_norm": 1.0447357892990112, + "learning_rate": 0.0002, + "loss": 1.2319, + "step": 5930 + }, + { + "epoch": 4.610011641443539, + "grad_norm": 0.9309114217758179, + "learning_rate": 0.0002, + "loss": 1.2667, + "step": 5940 + }, + { + "epoch": 4.617772603802871, + "grad_norm": 1.2986129522323608, + "learning_rate": 0.0002, + "loss": 1.2827, + "step": 5950 + }, + { + "epoch": 4.625533566162204, + "grad_norm": 0.9221704602241516, + "learning_rate": 0.0002, + "loss": 1.312, + "step": 5960 + }, + { + "epoch": 4.633294528521537, + "grad_norm": 0.9228187799453735, + "learning_rate": 0.0002, + "loss": 1.2769, + "step": 5970 + }, + { + "epoch": 4.641055490880869, + "grad_norm": 0.9483116269111633, + "learning_rate": 0.0002, + "loss": 1.2953, + "step": 5980 + }, + { + "epoch": 4.648816453240202, + "grad_norm": 1.0218974351882935, + "learning_rate": 0.0002, + "loss": 1.3437, + "step": 5990 + }, + { + "epoch": 4.656577415599534, + "grad_norm": 0.9764600396156311, + "learning_rate": 0.0002, + "loss": 1.3085, + "step": 6000 + }, + { + "epoch": 4.664338377958867, + "grad_norm": 0.9115710258483887, + "learning_rate": 0.0002, + "loss": 1.197, + "step": 6010 + }, + { + "epoch": 4.6720993403182, + "grad_norm": 0.9245651364326477, + "learning_rate": 0.0002, + "loss": 1.1917, + "step": 6020 + }, + { + "epoch": 4.6798603026775325, + "grad_norm": 0.9686311483383179, + "learning_rate": 0.0002, + "loss": 1.2969, + "step": 6030 + }, + { + "epoch": 4.6876212650368645, + "grad_norm": 1.1807392835617065, + "learning_rate": 0.0002, + "loss": 1.2702, + "step": 6040 + }, + { + "epoch": 4.695382227396197, + "grad_norm": 1.0358641147613525, + "learning_rate": 0.0002, + "loss": 1.328, + "step": 6050 + }, + { + "epoch": 4.703143189755529, + "grad_norm": 0.987332284450531, + "learning_rate": 0.0002, + "loss": 1.3281, + "step": 6060 + }, + { + "epoch": 4.710904152114862, + "grad_norm": 1.0526494979858398, + "learning_rate": 0.0002, + "loss": 1.2514, + "step": 6070 + }, + { + "epoch": 4.718665114474195, + "grad_norm": 1.0276758670806885, + "learning_rate": 0.0002, + "loss": 1.2246, + "step": 6080 + }, + { + "epoch": 4.726426076833527, + "grad_norm": 0.9904406666755676, + "learning_rate": 0.0002, + "loss": 1.3367, + "step": 6090 + }, + { + "epoch": 4.73418703919286, + "grad_norm": 1.0084882974624634, + "learning_rate": 0.0002, + "loss": 1.2797, + "step": 6100 + }, + { + "epoch": 4.741948001552192, + "grad_norm": 0.8646450638771057, + "learning_rate": 0.0002, + "loss": 1.2656, + "step": 6110 + }, + { + "epoch": 4.749708963911525, + "grad_norm": 0.9233377575874329, + "learning_rate": 0.0002, + "loss": 1.3063, + "step": 6120 + }, + { + "epoch": 4.757469926270858, + "grad_norm": 0.9675140976905823, + "learning_rate": 0.0002, + "loss": 1.2642, + "step": 6130 + }, + { + "epoch": 4.765230888630191, + "grad_norm": 0.9639796018600464, + "learning_rate": 0.0002, + "loss": 1.3367, + "step": 6140 + }, + { + "epoch": 4.772991850989523, + "grad_norm": 0.925199568271637, + "learning_rate": 0.0002, + "loss": 1.276, + "step": 6150 + }, + { + "epoch": 4.7807528133488555, + "grad_norm": 1.050901174545288, + "learning_rate": 0.0002, + "loss": 1.2441, + "step": 6160 + }, + { + "epoch": 4.7885137757081875, + "grad_norm": 0.8920623660087585, + "learning_rate": 0.0002, + "loss": 1.301, + "step": 6170 + }, + { + "epoch": 4.79627473806752, + "grad_norm": 0.8964757919311523, + "learning_rate": 0.0002, + "loss": 1.263, + "step": 6180 + }, + { + "epoch": 4.804035700426853, + "grad_norm": 1.0839070081710815, + "learning_rate": 0.0002, + "loss": 1.2787, + "step": 6190 + }, + { + "epoch": 4.811796662786185, + "grad_norm": 0.8809942007064819, + "learning_rate": 0.0002, + "loss": 1.2664, + "step": 6200 + }, + { + "epoch": 4.819557625145518, + "grad_norm": 1.0216195583343506, + "learning_rate": 0.0002, + "loss": 1.321, + "step": 6210 + }, + { + "epoch": 4.827318587504851, + "grad_norm": 0.892005980014801, + "learning_rate": 0.0002, + "loss": 1.3033, + "step": 6220 + }, + { + "epoch": 4.835079549864183, + "grad_norm": 0.9957166910171509, + "learning_rate": 0.0002, + "loss": 1.2602, + "step": 6230 + }, + { + "epoch": 4.842840512223516, + "grad_norm": 0.9720533490180969, + "learning_rate": 0.0002, + "loss": 1.3562, + "step": 6240 + }, + { + "epoch": 4.850601474582849, + "grad_norm": 0.9336182475090027, + "learning_rate": 0.0002, + "loss": 1.2651, + "step": 6250 + }, + { + "epoch": 4.858362436942181, + "grad_norm": 1.2611457109451294, + "learning_rate": 0.0002, + "loss": 1.3136, + "step": 6260 + }, + { + "epoch": 4.866123399301514, + "grad_norm": 0.8927203416824341, + "learning_rate": 0.0002, + "loss": 1.2234, + "step": 6270 + }, + { + "epoch": 4.873884361660846, + "grad_norm": 0.9706710577011108, + "learning_rate": 0.0002, + "loss": 1.3463, + "step": 6280 + }, + { + "epoch": 4.8816453240201785, + "grad_norm": 1.1461690664291382, + "learning_rate": 0.0002, + "loss": 1.3209, + "step": 6290 + }, + { + "epoch": 4.889406286379511, + "grad_norm": 0.9930381178855896, + "learning_rate": 0.0002, + "loss": 1.2566, + "step": 6300 + }, + { + "epoch": 4.897167248738843, + "grad_norm": 0.91451096534729, + "learning_rate": 0.0002, + "loss": 1.2568, + "step": 6310 + }, + { + "epoch": 4.904928211098176, + "grad_norm": 1.0319571495056152, + "learning_rate": 0.0002, + "loss": 1.2836, + "step": 6320 + }, + { + "epoch": 4.912689173457509, + "grad_norm": 0.990140438079834, + "learning_rate": 0.0002, + "loss": 1.2908, + "step": 6330 + }, + { + "epoch": 4.920450135816841, + "grad_norm": 1.2466117143630981, + "learning_rate": 0.0002, + "loss": 1.3299, + "step": 6340 + }, + { + "epoch": 4.928211098176174, + "grad_norm": 1.0316979885101318, + "learning_rate": 0.0002, + "loss": 1.2659, + "step": 6350 + }, + { + "epoch": 4.935972060535507, + "grad_norm": 1.0643759965896606, + "learning_rate": 0.0002, + "loss": 1.3292, + "step": 6360 + }, + { + "epoch": 4.943733022894839, + "grad_norm": 0.9703279733657837, + "learning_rate": 0.0002, + "loss": 1.2559, + "step": 6370 + }, + { + "epoch": 4.951493985254172, + "grad_norm": 0.9767927527427673, + "learning_rate": 0.0002, + "loss": 1.2155, + "step": 6380 + }, + { + "epoch": 4.959254947613504, + "grad_norm": 0.960854172706604, + "learning_rate": 0.0002, + "loss": 1.2437, + "step": 6390 + }, + { + "epoch": 4.967015909972837, + "grad_norm": 0.9922910332679749, + "learning_rate": 0.0002, + "loss": 1.3314, + "step": 6400 + }, + { + "epoch": 4.9747768723321695, + "grad_norm": 0.956470787525177, + "learning_rate": 0.0002, + "loss": 1.3018, + "step": 6410 + }, + { + "epoch": 4.9825378346915015, + "grad_norm": 0.9637242555618286, + "learning_rate": 0.0002, + "loss": 1.2794, + "step": 6420 + }, + { + "epoch": 4.990298797050834, + "grad_norm": 1.0855202674865723, + "learning_rate": 0.0002, + "loss": 1.3236, + "step": 6430 + }, + { + "epoch": 4.998059759410167, + "grad_norm": 0.9655316472053528, + "learning_rate": 0.0002, + "loss": 1.3015, + "step": 6440 + }, + { + "epoch": 4.9996119518820334, + "eval_loss": 2.0410802364349365, + "eval_runtime": 113.04, + "eval_samples_per_second": 4.485, + "eval_steps_per_second": 0.566, + "step": 6442 + }, + { + "epoch": 5.005820721769499, + "grad_norm": 1.1676199436187744, + "learning_rate": 0.0002, + "loss": 1.0846, + "step": 6450 + }, + { + "epoch": 5.013581684128832, + "grad_norm": 1.4317965507507324, + "learning_rate": 0.0002, + "loss": 1.041, + "step": 6460 + }, + { + "epoch": 5.021342646488165, + "grad_norm": 1.460443377494812, + "learning_rate": 0.0002, + "loss": 0.9546, + "step": 6470 + }, + { + "epoch": 5.029103608847497, + "grad_norm": 1.2299214601516724, + "learning_rate": 0.0002, + "loss": 1.0014, + "step": 6480 + }, + { + "epoch": 5.03686457120683, + "grad_norm": 1.3125724792480469, + "learning_rate": 0.0002, + "loss": 1.0397, + "step": 6490 + }, + { + "epoch": 5.044625533566162, + "grad_norm": 1.1252319812774658, + "learning_rate": 0.0002, + "loss": 1.0134, + "step": 6500 + }, + { + "epoch": 5.052386495925495, + "grad_norm": 0.9970866441726685, + "learning_rate": 0.0002, + "loss": 0.976, + "step": 6510 + }, + { + "epoch": 5.060147458284828, + "grad_norm": 1.229069709777832, + "learning_rate": 0.0002, + "loss": 0.9731, + "step": 6520 + }, + { + "epoch": 5.06790842064416, + "grad_norm": 1.2430938482284546, + "learning_rate": 0.0002, + "loss": 1.0498, + "step": 6530 + }, + { + "epoch": 5.0756693830034925, + "grad_norm": 1.0522737503051758, + "learning_rate": 0.0002, + "loss": 1.0236, + "step": 6540 + }, + { + "epoch": 5.083430345362825, + "grad_norm": 1.108890175819397, + "learning_rate": 0.0002, + "loss": 1.0221, + "step": 6550 + }, + { + "epoch": 5.091191307722157, + "grad_norm": 1.156912922859192, + "learning_rate": 0.0002, + "loss": 1.0177, + "step": 6560 + }, + { + "epoch": 5.09895227008149, + "grad_norm": 1.405895709991455, + "learning_rate": 0.0002, + "loss": 1.0415, + "step": 6570 + }, + { + "epoch": 5.106713232440823, + "grad_norm": 1.2005155086517334, + "learning_rate": 0.0002, + "loss": 0.9811, + "step": 6580 + }, + { + "epoch": 5.114474194800155, + "grad_norm": 1.181443452835083, + "learning_rate": 0.0002, + "loss": 0.9862, + "step": 6590 + }, + { + "epoch": 5.122235157159488, + "grad_norm": 2.3444771766662598, + "learning_rate": 0.0002, + "loss": 1.0291, + "step": 6600 + }, + { + "epoch": 5.12999611951882, + "grad_norm": 1.216988444328308, + "learning_rate": 0.0002, + "loss": 1.0455, + "step": 6610 + }, + { + "epoch": 5.137757081878153, + "grad_norm": 1.369553565979004, + "learning_rate": 0.0002, + "loss": 1.0549, + "step": 6620 + }, + { + "epoch": 5.145518044237486, + "grad_norm": 1.177964687347412, + "learning_rate": 0.0002, + "loss": 1.0056, + "step": 6630 + }, + { + "epoch": 5.153279006596818, + "grad_norm": 1.1397041082382202, + "learning_rate": 0.0002, + "loss": 1.1025, + "step": 6640 + }, + { + "epoch": 5.161039968956151, + "grad_norm": 1.3976861238479614, + "learning_rate": 0.0002, + "loss": 1.0437, + "step": 6650 + }, + { + "epoch": 5.1688009313154835, + "grad_norm": 1.4824495315551758, + "learning_rate": 0.0002, + "loss": 1.0454, + "step": 6660 + }, + { + "epoch": 5.1765618936748155, + "grad_norm": 1.2653018236160278, + "learning_rate": 0.0002, + "loss": 1.0356, + "step": 6670 + }, + { + "epoch": 5.184322856034148, + "grad_norm": 1.3106069564819336, + "learning_rate": 0.0002, + "loss": 0.9971, + "step": 6680 + }, + { + "epoch": 5.192083818393481, + "grad_norm": 1.3140279054641724, + "learning_rate": 0.0002, + "loss": 1.0561, + "step": 6690 + }, + { + "epoch": 5.199844780752813, + "grad_norm": 1.3900256156921387, + "learning_rate": 0.0002, + "loss": 1.0618, + "step": 6700 + }, + { + "epoch": 5.207605743112146, + "grad_norm": 1.3191124200820923, + "learning_rate": 0.0002, + "loss": 1.0285, + "step": 6710 + }, + { + "epoch": 5.215366705471478, + "grad_norm": 1.176107406616211, + "learning_rate": 0.0002, + "loss": 0.9921, + "step": 6720 + }, + { + "epoch": 5.223127667830811, + "grad_norm": 1.2364883422851562, + "learning_rate": 0.0002, + "loss": 1.064, + "step": 6730 + }, + { + "epoch": 5.230888630190144, + "grad_norm": 1.343022108078003, + "learning_rate": 0.0002, + "loss": 0.9599, + "step": 6740 + }, + { + "epoch": 5.238649592549476, + "grad_norm": 1.2826898097991943, + "learning_rate": 0.0002, + "loss": 1.0342, + "step": 6750 + }, + { + "epoch": 5.246410554908809, + "grad_norm": 1.500257134437561, + "learning_rate": 0.0002, + "loss": 1.0703, + "step": 6760 + }, + { + "epoch": 5.254171517268142, + "grad_norm": 1.2605743408203125, + "learning_rate": 0.0002, + "loss": 1.0114, + "step": 6770 + }, + { + "epoch": 5.261932479627474, + "grad_norm": 1.2355525493621826, + "learning_rate": 0.0002, + "loss": 1.0825, + "step": 6780 + }, + { + "epoch": 5.2696934419868064, + "grad_norm": 1.2845789194107056, + "learning_rate": 0.0002, + "loss": 1.0436, + "step": 6790 + }, + { + "epoch": 5.277454404346139, + "grad_norm": 1.3696625232696533, + "learning_rate": 0.0002, + "loss": 0.989, + "step": 6800 + }, + { + "epoch": 5.285215366705471, + "grad_norm": 1.4051260948181152, + "learning_rate": 0.0002, + "loss": 1.0991, + "step": 6810 + }, + { + "epoch": 5.292976329064804, + "grad_norm": 1.266725778579712, + "learning_rate": 0.0002, + "loss": 1.0987, + "step": 6820 + }, + { + "epoch": 5.300737291424136, + "grad_norm": 1.3475236892700195, + "learning_rate": 0.0002, + "loss": 1.0489, + "step": 6830 + }, + { + "epoch": 5.308498253783469, + "grad_norm": 1.54409921169281, + "learning_rate": 0.0002, + "loss": 1.0264, + "step": 6840 + }, + { + "epoch": 5.316259216142802, + "grad_norm": 1.2391985654830933, + "learning_rate": 0.0002, + "loss": 1.033, + "step": 6850 + }, + { + "epoch": 5.324020178502134, + "grad_norm": 1.2435699701309204, + "learning_rate": 0.0002, + "loss": 1.1058, + "step": 6860 + }, + { + "epoch": 5.331781140861467, + "grad_norm": 1.8803037405014038, + "learning_rate": 0.0002, + "loss": 1.0179, + "step": 6870 + }, + { + "epoch": 5.3395421032208, + "grad_norm": 1.4195542335510254, + "learning_rate": 0.0002, + "loss": 0.997, + "step": 6880 + }, + { + "epoch": 5.347303065580132, + "grad_norm": 1.1853394508361816, + "learning_rate": 0.0002, + "loss": 1.0273, + "step": 6890 + }, + { + "epoch": 5.355064027939465, + "grad_norm": 1.4016530513763428, + "learning_rate": 0.0002, + "loss": 1.0668, + "step": 6900 + }, + { + "epoch": 5.3628249902987974, + "grad_norm": 1.294339895248413, + "learning_rate": 0.0002, + "loss": 1.1099, + "step": 6910 + }, + { + "epoch": 5.370585952658129, + "grad_norm": 1.2952708005905151, + "learning_rate": 0.0002, + "loss": 1.0724, + "step": 6920 + }, + { + "epoch": 5.378346915017462, + "grad_norm": 1.1361510753631592, + "learning_rate": 0.0002, + "loss": 1.0098, + "step": 6930 + }, + { + "epoch": 5.386107877376794, + "grad_norm": 1.125805377960205, + "learning_rate": 0.0002, + "loss": 1.0796, + "step": 6940 + }, + { + "epoch": 5.393868839736127, + "grad_norm": 1.1453300714492798, + "learning_rate": 0.0002, + "loss": 1.122, + "step": 6950 + }, + { + "epoch": 5.40162980209546, + "grad_norm": 1.4542768001556396, + "learning_rate": 0.0002, + "loss": 1.0977, + "step": 6960 + }, + { + "epoch": 5.409390764454792, + "grad_norm": 1.2360988855361938, + "learning_rate": 0.0002, + "loss": 1.0825, + "step": 6970 + }, + { + "epoch": 5.417151726814125, + "grad_norm": 1.2182754278182983, + "learning_rate": 0.0002, + "loss": 1.0631, + "step": 6980 + }, + { + "epoch": 5.424912689173458, + "grad_norm": 1.2018693685531616, + "learning_rate": 0.0002, + "loss": 1.0471, + "step": 6990 + }, + { + "epoch": 5.43267365153279, + "grad_norm": 1.346124291419983, + "learning_rate": 0.0002, + "loss": 1.108, + "step": 7000 + }, + { + "epoch": 5.440434613892123, + "grad_norm": 1.2534189224243164, + "learning_rate": 0.0002, + "loss": 1.0534, + "step": 7010 + }, + { + "epoch": 5.448195576251456, + "grad_norm": 1.2033339738845825, + "learning_rate": 0.0002, + "loss": 1.0696, + "step": 7020 + }, + { + "epoch": 5.4559565386107876, + "grad_norm": 1.2788134813308716, + "learning_rate": 0.0002, + "loss": 1.0714, + "step": 7030 + }, + { + "epoch": 5.46371750097012, + "grad_norm": 1.2751542329788208, + "learning_rate": 0.0002, + "loss": 1.1274, + "step": 7040 + }, + { + "epoch": 5.471478463329452, + "grad_norm": 1.3237019777297974, + "learning_rate": 0.0002, + "loss": 1.0767, + "step": 7050 + }, + { + "epoch": 5.479239425688785, + "grad_norm": 1.4932852983474731, + "learning_rate": 0.0002, + "loss": 1.1081, + "step": 7060 + }, + { + "epoch": 5.487000388048118, + "grad_norm": 1.4003876447677612, + "learning_rate": 0.0002, + "loss": 1.0197, + "step": 7070 + }, + { + "epoch": 5.49476135040745, + "grad_norm": 1.404799461364746, + "learning_rate": 0.0002, + "loss": 1.0662, + "step": 7080 + }, + { + "epoch": 5.502522312766783, + "grad_norm": 1.4486982822418213, + "learning_rate": 0.0002, + "loss": 1.0354, + "step": 7090 + }, + { + "epoch": 5.510283275126116, + "grad_norm": 1.1713480949401855, + "learning_rate": 0.0002, + "loss": 1.0645, + "step": 7100 + }, + { + "epoch": 5.518044237485448, + "grad_norm": 1.4062601327896118, + "learning_rate": 0.0002, + "loss": 1.006, + "step": 7110 + }, + { + "epoch": 5.525805199844781, + "grad_norm": 1.211629867553711, + "learning_rate": 0.0002, + "loss": 1.0459, + "step": 7120 + }, + { + "epoch": 5.533566162204114, + "grad_norm": 1.2523176670074463, + "learning_rate": 0.0002, + "loss": 1.102, + "step": 7130 + }, + { + "epoch": 5.541327124563446, + "grad_norm": 1.4467198848724365, + "learning_rate": 0.0002, + "loss": 1.1132, + "step": 7140 + }, + { + "epoch": 5.5490880869227786, + "grad_norm": 1.5961614847183228, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 7150 + }, + { + "epoch": 5.5568490492821105, + "grad_norm": 1.320656418800354, + "learning_rate": 0.0002, + "loss": 1.0859, + "step": 7160 + }, + { + "epoch": 5.564610011641443, + "grad_norm": 1.2423332929611206, + "learning_rate": 0.0002, + "loss": 1.109, + "step": 7170 + }, + { + "epoch": 5.572370974000776, + "grad_norm": 1.2919669151306152, + "learning_rate": 0.0002, + "loss": 1.0046, + "step": 7180 + }, + { + "epoch": 5.580131936360108, + "grad_norm": 1.1678385734558105, + "learning_rate": 0.0002, + "loss": 1.046, + "step": 7190 + }, + { + "epoch": 5.587892898719441, + "grad_norm": 1.4250764846801758, + "learning_rate": 0.0002, + "loss": 1.1011, + "step": 7200 + }, + { + "epoch": 5.595653861078774, + "grad_norm": 1.5308716297149658, + "learning_rate": 0.0002, + "loss": 1.1254, + "step": 7210 + }, + { + "epoch": 5.603414823438106, + "grad_norm": 1.2678815126419067, + "learning_rate": 0.0002, + "loss": 1.121, + "step": 7220 + }, + { + "epoch": 5.611175785797439, + "grad_norm": 1.127856969833374, + "learning_rate": 0.0002, + "loss": 1.0846, + "step": 7230 + }, + { + "epoch": 5.618936748156772, + "grad_norm": 1.3832560777664185, + "learning_rate": 0.0002, + "loss": 1.0647, + "step": 7240 + }, + { + "epoch": 5.626697710516104, + "grad_norm": 1.3226919174194336, + "learning_rate": 0.0002, + "loss": 1.0658, + "step": 7250 + }, + { + "epoch": 5.634458672875437, + "grad_norm": 1.3418006896972656, + "learning_rate": 0.0002, + "loss": 1.1175, + "step": 7260 + }, + { + "epoch": 5.642219635234769, + "grad_norm": 1.2625300884246826, + "learning_rate": 0.0002, + "loss": 1.0956, + "step": 7270 + }, + { + "epoch": 5.6499805975941015, + "grad_norm": 1.1579464673995972, + "learning_rate": 0.0002, + "loss": 1.067, + "step": 7280 + }, + { + "epoch": 5.657741559953434, + "grad_norm": 1.4998650550842285, + "learning_rate": 0.0002, + "loss": 1.0447, + "step": 7290 + }, + { + "epoch": 5.665502522312766, + "grad_norm": 1.2670758962631226, + "learning_rate": 0.0002, + "loss": 1.1256, + "step": 7300 + }, + { + "epoch": 5.673263484672099, + "grad_norm": 1.2959760427474976, + "learning_rate": 0.0002, + "loss": 1.1267, + "step": 7310 + }, + { + "epoch": 5.681024447031432, + "grad_norm": 1.2460671663284302, + "learning_rate": 0.0002, + "loss": 1.1387, + "step": 7320 + }, + { + "epoch": 5.688785409390764, + "grad_norm": 1.1313989162445068, + "learning_rate": 0.0002, + "loss": 1.0756, + "step": 7330 + }, + { + "epoch": 5.696546371750097, + "grad_norm": 1.282527208328247, + "learning_rate": 0.0002, + "loss": 1.0618, + "step": 7340 + }, + { + "epoch": 5.70430733410943, + "grad_norm": 1.3380206823349, + "learning_rate": 0.0002, + "loss": 1.1315, + "step": 7350 + }, + { + "epoch": 5.712068296468762, + "grad_norm": 1.1648279428482056, + "learning_rate": 0.0002, + "loss": 1.0949, + "step": 7360 + }, + { + "epoch": 5.719829258828095, + "grad_norm": 1.3059816360473633, + "learning_rate": 0.0002, + "loss": 1.1705, + "step": 7370 + }, + { + "epoch": 5.727590221187427, + "grad_norm": 1.1905046701431274, + "learning_rate": 0.0002, + "loss": 1.1496, + "step": 7380 + }, + { + "epoch": 5.73535118354676, + "grad_norm": 1.4089630842208862, + "learning_rate": 0.0002, + "loss": 1.1356, + "step": 7390 + }, + { + "epoch": 5.7431121459060925, + "grad_norm": 1.256721019744873, + "learning_rate": 0.0002, + "loss": 1.1349, + "step": 7400 + }, + { + "epoch": 5.7508731082654245, + "grad_norm": 1.1915162801742554, + "learning_rate": 0.0002, + "loss": 1.0682, + "step": 7410 + }, + { + "epoch": 5.758634070624757, + "grad_norm": 1.1935480833053589, + "learning_rate": 0.0002, + "loss": 1.1257, + "step": 7420 + }, + { + "epoch": 5.76639503298409, + "grad_norm": 1.1761008501052856, + "learning_rate": 0.0002, + "loss": 1.1348, + "step": 7430 + }, + { + "epoch": 5.774155995343422, + "grad_norm": 1.2540549039840698, + "learning_rate": 0.0002, + "loss": 1.0837, + "step": 7440 + }, + { + "epoch": 5.781916957702755, + "grad_norm": 1.5295120477676392, + "learning_rate": 0.0002, + "loss": 1.1527, + "step": 7450 + }, + { + "epoch": 5.789677920062088, + "grad_norm": 1.1081160306930542, + "learning_rate": 0.0002, + "loss": 1.1146, + "step": 7460 + }, + { + "epoch": 5.79743888242142, + "grad_norm": 1.4381253719329834, + "learning_rate": 0.0002, + "loss": 1.1304, + "step": 7470 + }, + { + "epoch": 5.805199844780753, + "grad_norm": 1.3079341650009155, + "learning_rate": 0.0002, + "loss": 1.0684, + "step": 7480 + }, + { + "epoch": 5.812960807140085, + "grad_norm": 1.1372792720794678, + "learning_rate": 0.0002, + "loss": 1.0544, + "step": 7490 + }, + { + "epoch": 5.820721769499418, + "grad_norm": 1.3221744298934937, + "learning_rate": 0.0002, + "loss": 1.1622, + "step": 7500 + }, + { + "epoch": 5.828482731858751, + "grad_norm": 1.3436939716339111, + "learning_rate": 0.0002, + "loss": 1.1515, + "step": 7510 + }, + { + "epoch": 5.8362436942180835, + "grad_norm": 1.3916879892349243, + "learning_rate": 0.0002, + "loss": 1.1154, + "step": 7520 + }, + { + "epoch": 5.8440046565774155, + "grad_norm": 1.2463704347610474, + "learning_rate": 0.0002, + "loss": 1.0816, + "step": 7530 + }, + { + "epoch": 5.851765618936748, + "grad_norm": 1.097051739692688, + "learning_rate": 0.0002, + "loss": 1.0745, + "step": 7540 + }, + { + "epoch": 5.85952658129608, + "grad_norm": 1.1554739475250244, + "learning_rate": 0.0002, + "loss": 1.1454, + "step": 7550 + }, + { + "epoch": 5.867287543655413, + "grad_norm": 1.2384694814682007, + "learning_rate": 0.0002, + "loss": 1.0953, + "step": 7560 + }, + { + "epoch": 5.875048506014746, + "grad_norm": 1.142815351486206, + "learning_rate": 0.0002, + "loss": 1.1734, + "step": 7570 + }, + { + "epoch": 5.882809468374078, + "grad_norm": 1.3637062311172485, + "learning_rate": 0.0002, + "loss": 1.162, + "step": 7580 + }, + { + "epoch": 5.890570430733411, + "grad_norm": 1.2449073791503906, + "learning_rate": 0.0002, + "loss": 1.0781, + "step": 7590 + }, + { + "epoch": 5.898331393092743, + "grad_norm": 1.358058214187622, + "learning_rate": 0.0002, + "loss": 1.1191, + "step": 7600 + }, + { + "epoch": 5.906092355452076, + "grad_norm": 1.264655351638794, + "learning_rate": 0.0002, + "loss": 1.0779, + "step": 7610 + }, + { + "epoch": 5.913853317811409, + "grad_norm": 1.3186019659042358, + "learning_rate": 0.0002, + "loss": 1.1538, + "step": 7620 + }, + { + "epoch": 5.921614280170742, + "grad_norm": 1.4111460447311401, + "learning_rate": 0.0002, + "loss": 1.1076, + "step": 7630 + }, + { + "epoch": 5.929375242530074, + "grad_norm": 1.1078972816467285, + "learning_rate": 0.0002, + "loss": 1.1765, + "step": 7640 + }, + { + "epoch": 5.9371362048894065, + "grad_norm": 1.2742213010787964, + "learning_rate": 0.0002, + "loss": 1.1305, + "step": 7650 + }, + { + "epoch": 5.9448971672487385, + "grad_norm": 1.3412781953811646, + "learning_rate": 0.0002, + "loss": 1.144, + "step": 7660 + }, + { + "epoch": 5.952658129608071, + "grad_norm": 1.123005986213684, + "learning_rate": 0.0002, + "loss": 1.1642, + "step": 7670 + }, + { + "epoch": 5.960419091967404, + "grad_norm": 1.2203444242477417, + "learning_rate": 0.0002, + "loss": 1.0732, + "step": 7680 + }, + { + "epoch": 5.968180054326736, + "grad_norm": 1.341011643409729, + "learning_rate": 0.0002, + "loss": 1.158, + "step": 7690 + }, + { + "epoch": 5.975941016686069, + "grad_norm": 1.2689454555511475, + "learning_rate": 0.0002, + "loss": 1.1144, + "step": 7700 + }, + { + "epoch": 5.983701979045401, + "grad_norm": 1.1518112421035767, + "learning_rate": 0.0002, + "loss": 1.2051, + "step": 7710 + }, + { + "epoch": 5.991462941404734, + "grad_norm": 1.3698320388793945, + "learning_rate": 0.0002, + "loss": 1.1868, + "step": 7720 + }, + { + "epoch": 5.999223903764067, + "grad_norm": 1.2812788486480713, + "learning_rate": 0.0002, + "loss": 1.0651, + "step": 7730 + }, + { + "epoch": 6.0, + "eval_loss": 2.252762794494629, + "eval_runtime": 114.8471, + "eval_samples_per_second": 4.415, + "eval_steps_per_second": 0.557, + "step": 7731 + }, + { + "epoch": 6.006984866123399, + "grad_norm": 1.8642009496688843, + "learning_rate": 0.0002, + "loss": 0.8629, + "step": 7740 + }, + { + "epoch": 6.014745828482732, + "grad_norm": 1.7081232070922852, + "learning_rate": 0.0002, + "loss": 0.8435, + "step": 7750 + }, + { + "epoch": 6.022506790842065, + "grad_norm": 1.6233899593353271, + "learning_rate": 0.0002, + "loss": 0.7729, + "step": 7760 + }, + { + "epoch": 6.030267753201397, + "grad_norm": 1.5111888647079468, + "learning_rate": 0.0002, + "loss": 0.7907, + "step": 7770 + }, + { + "epoch": 6.0380287155607295, + "grad_norm": 1.5278418064117432, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 7780 + }, + { + "epoch": 6.045789677920062, + "grad_norm": 1.5932185649871826, + "learning_rate": 0.0002, + "loss": 0.835, + "step": 7790 + }, + { + "epoch": 6.053550640279394, + "grad_norm": 1.5990597009658813, + "learning_rate": 0.0002, + "loss": 0.7682, + "step": 7800 + }, + { + "epoch": 6.061311602638727, + "grad_norm": 1.7498669624328613, + "learning_rate": 0.0002, + "loss": 0.8559, + "step": 7810 + }, + { + "epoch": 6.06907256499806, + "grad_norm": 1.6105555295944214, + "learning_rate": 0.0002, + "loss": 0.8069, + "step": 7820 + }, + { + "epoch": 6.076833527357392, + "grad_norm": 1.5214293003082275, + "learning_rate": 0.0002, + "loss": 0.8473, + "step": 7830 + }, + { + "epoch": 6.084594489716725, + "grad_norm": 1.6586973667144775, + "learning_rate": 0.0002, + "loss": 0.8328, + "step": 7840 + }, + { + "epoch": 6.092355452076057, + "grad_norm": 1.467391848564148, + "learning_rate": 0.0002, + "loss": 0.8415, + "step": 7850 + }, + { + "epoch": 6.10011641443539, + "grad_norm": 1.537361741065979, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 7860 + }, + { + "epoch": 6.107877376794723, + "grad_norm": 1.621764898300171, + "learning_rate": 0.0002, + "loss": 0.8011, + "step": 7870 + }, + { + "epoch": 6.115638339154055, + "grad_norm": 1.583751916885376, + "learning_rate": 0.0002, + "loss": 0.8556, + "step": 7880 + }, + { + "epoch": 6.123399301513388, + "grad_norm": 1.6199619770050049, + "learning_rate": 0.0002, + "loss": 0.8829, + "step": 7890 + }, + { + "epoch": 6.1311602638727205, + "grad_norm": 1.6163095235824585, + "learning_rate": 0.0002, + "loss": 0.8226, + "step": 7900 + }, + { + "epoch": 6.1389212262320525, + "grad_norm": 1.6120976209640503, + "learning_rate": 0.0002, + "loss": 0.8203, + "step": 7910 + }, + { + "epoch": 6.146682188591385, + "grad_norm": 1.7886850833892822, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 7920 + }, + { + "epoch": 6.154443150950718, + "grad_norm": 1.408303141593933, + "learning_rate": 0.0002, + "loss": 0.7808, + "step": 7930 + }, + { + "epoch": 6.16220411331005, + "grad_norm": 1.6048113107681274, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 7940 + }, + { + "epoch": 6.169965075669383, + "grad_norm": 1.424306869506836, + "learning_rate": 0.0002, + "loss": 0.8705, + "step": 7950 + }, + { + "epoch": 6.177726038028716, + "grad_norm": 1.4453672170639038, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 7960 + }, + { + "epoch": 6.185487000388048, + "grad_norm": 1.3157061338424683, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 7970 + }, + { + "epoch": 6.193247962747381, + "grad_norm": 1.330541729927063, + "learning_rate": 0.0002, + "loss": 0.891, + "step": 7980 + }, + { + "epoch": 6.201008925106713, + "grad_norm": 1.6306229829788208, + "learning_rate": 0.0002, + "loss": 0.8599, + "step": 7990 + }, + { + "epoch": 6.208769887466046, + "grad_norm": 1.6332136392593384, + "learning_rate": 0.0002, + "loss": 0.9069, + "step": 8000 + }, + { + "epoch": 6.216530849825379, + "grad_norm": 1.708613395690918, + "learning_rate": 0.0002, + "loss": 0.83, + "step": 8010 + }, + { + "epoch": 6.224291812184711, + "grad_norm": 1.6637346744537354, + "learning_rate": 0.0002, + "loss": 0.8509, + "step": 8020 + }, + { + "epoch": 6.2320527745440435, + "grad_norm": 1.5675315856933594, + "learning_rate": 0.0002, + "loss": 0.84, + "step": 8030 + }, + { + "epoch": 6.239813736903376, + "grad_norm": 1.5826327800750732, + "learning_rate": 0.0002, + "loss": 0.8491, + "step": 8040 + }, + { + "epoch": 6.247574699262708, + "grad_norm": 1.7382984161376953, + "learning_rate": 0.0002, + "loss": 0.8374, + "step": 8050 + }, + { + "epoch": 6.255335661622041, + "grad_norm": 1.5272295475006104, + "learning_rate": 0.0002, + "loss": 0.8795, + "step": 8060 + }, + { + "epoch": 6.263096623981374, + "grad_norm": 1.8195022344589233, + "learning_rate": 0.0002, + "loss": 0.8745, + "step": 8070 + }, + { + "epoch": 6.270857586340706, + "grad_norm": 1.679901361465454, + "learning_rate": 0.0002, + "loss": 0.8743, + "step": 8080 + }, + { + "epoch": 6.278618548700039, + "grad_norm": 1.4921348094940186, + "learning_rate": 0.0002, + "loss": 0.9006, + "step": 8090 + }, + { + "epoch": 6.286379511059371, + "grad_norm": 1.4627857208251953, + "learning_rate": 0.0002, + "loss": 0.899, + "step": 8100 + }, + { + "epoch": 6.294140473418704, + "grad_norm": 1.3528631925582886, + "learning_rate": 0.0002, + "loss": 0.8944, + "step": 8110 + }, + { + "epoch": 6.301901435778037, + "grad_norm": 1.6863102912902832, + "learning_rate": 0.0002, + "loss": 0.9355, + "step": 8120 + }, + { + "epoch": 6.309662398137369, + "grad_norm": 1.6178052425384521, + "learning_rate": 0.0002, + "loss": 0.8764, + "step": 8130 + }, + { + "epoch": 6.317423360496702, + "grad_norm": 1.7626280784606934, + "learning_rate": 0.0002, + "loss": 0.9182, + "step": 8140 + }, + { + "epoch": 6.3251843228560345, + "grad_norm": 1.7188845872879028, + "learning_rate": 0.0002, + "loss": 0.8886, + "step": 8150 + }, + { + "epoch": 6.3329452852153665, + "grad_norm": 1.5777133703231812, + "learning_rate": 0.0002, + "loss": 0.895, + "step": 8160 + }, + { + "epoch": 6.340706247574699, + "grad_norm": 1.7653207778930664, + "learning_rate": 0.0002, + "loss": 0.9247, + "step": 8170 + }, + { + "epoch": 6.348467209934032, + "grad_norm": 1.6861237287521362, + "learning_rate": 0.0002, + "loss": 0.8003, + "step": 8180 + }, + { + "epoch": 6.356228172293364, + "grad_norm": 1.6318124532699585, + "learning_rate": 0.0002, + "loss": 0.884, + "step": 8190 + }, + { + "epoch": 6.363989134652697, + "grad_norm": 1.6192939281463623, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 8200 + }, + { + "epoch": 6.371750097012029, + "grad_norm": 1.7641773223876953, + "learning_rate": 0.0002, + "loss": 0.8939, + "step": 8210 + }, + { + "epoch": 6.379511059371362, + "grad_norm": 1.6470493078231812, + "learning_rate": 0.0002, + "loss": 0.8582, + "step": 8220 + }, + { + "epoch": 6.387272021730695, + "grad_norm": 1.5898468494415283, + "learning_rate": 0.0002, + "loss": 0.9351, + "step": 8230 + }, + { + "epoch": 6.395032984090027, + "grad_norm": 1.8025981187820435, + "learning_rate": 0.0002, + "loss": 0.9658, + "step": 8240 + }, + { + "epoch": 6.40279394644936, + "grad_norm": 1.7035106420516968, + "learning_rate": 0.0002, + "loss": 0.8953, + "step": 8250 + }, + { + "epoch": 6.410554908808693, + "grad_norm": 1.5968799591064453, + "learning_rate": 0.0002, + "loss": 0.9193, + "step": 8260 + }, + { + "epoch": 6.418315871168025, + "grad_norm": 1.7492800951004028, + "learning_rate": 0.0002, + "loss": 0.929, + "step": 8270 + }, + { + "epoch": 6.4260768335273575, + "grad_norm": 1.6914138793945312, + "learning_rate": 0.0002, + "loss": 0.9297, + "step": 8280 + }, + { + "epoch": 6.43383779588669, + "grad_norm": 1.5761380195617676, + "learning_rate": 0.0002, + "loss": 0.8878, + "step": 8290 + }, + { + "epoch": 6.441598758246022, + "grad_norm": 1.5164411067962646, + "learning_rate": 0.0002, + "loss": 0.8761, + "step": 8300 + }, + { + "epoch": 6.449359720605355, + "grad_norm": 1.6600215435028076, + "learning_rate": 0.0002, + "loss": 0.88, + "step": 8310 + }, + { + "epoch": 6.457120682964687, + "grad_norm": 1.2477679252624512, + "learning_rate": 0.0002, + "loss": 0.9113, + "step": 8320 + }, + { + "epoch": 6.46488164532402, + "grad_norm": 1.3698599338531494, + "learning_rate": 0.0002, + "loss": 0.8822, + "step": 8330 + }, + { + "epoch": 6.472642607683353, + "grad_norm": 1.4847341775894165, + "learning_rate": 0.0002, + "loss": 0.9295, + "step": 8340 + }, + { + "epoch": 6.480403570042685, + "grad_norm": 1.4713412523269653, + "learning_rate": 0.0002, + "loss": 0.9243, + "step": 8350 + }, + { + "epoch": 6.488164532402018, + "grad_norm": 1.334523320198059, + "learning_rate": 0.0002, + "loss": 0.9102, + "step": 8360 + }, + { + "epoch": 6.495925494761351, + "grad_norm": 2.0054359436035156, + "learning_rate": 0.0002, + "loss": 0.8563, + "step": 8370 + }, + { + "epoch": 6.503686457120683, + "grad_norm": 1.560014247894287, + "learning_rate": 0.0002, + "loss": 0.9759, + "step": 8380 + }, + { + "epoch": 6.511447419480016, + "grad_norm": 1.518526554107666, + "learning_rate": 0.0002, + "loss": 0.8542, + "step": 8390 + }, + { + "epoch": 6.5192083818393485, + "grad_norm": 1.3841272592544556, + "learning_rate": 0.0002, + "loss": 0.937, + "step": 8400 + }, + { + "epoch": 6.5269693441986805, + "grad_norm": 1.5191527605056763, + "learning_rate": 0.0002, + "loss": 0.9576, + "step": 8410 + }, + { + "epoch": 6.534730306558013, + "grad_norm": 1.5275579690933228, + "learning_rate": 0.0002, + "loss": 0.8899, + "step": 8420 + }, + { + "epoch": 6.542491268917345, + "grad_norm": 1.621590256690979, + "learning_rate": 0.0002, + "loss": 0.9291, + "step": 8430 + }, + { + "epoch": 6.550252231276678, + "grad_norm": 1.7939082384109497, + "learning_rate": 0.0002, + "loss": 0.9011, + "step": 8440 + }, + { + "epoch": 6.558013193636011, + "grad_norm": 1.4542964696884155, + "learning_rate": 0.0002, + "loss": 0.8896, + "step": 8450 + }, + { + "epoch": 6.565774155995343, + "grad_norm": 1.5458455085754395, + "learning_rate": 0.0002, + "loss": 0.9393, + "step": 8460 + }, + { + "epoch": 6.573535118354676, + "grad_norm": 1.550359845161438, + "learning_rate": 0.0002, + "loss": 0.9028, + "step": 8470 + }, + { + "epoch": 6.581296080714009, + "grad_norm": 1.527757167816162, + "learning_rate": 0.0002, + "loss": 0.9271, + "step": 8480 + }, + { + "epoch": 6.589057043073341, + "grad_norm": 1.4683486223220825, + "learning_rate": 0.0002, + "loss": 0.966, + "step": 8490 + }, + { + "epoch": 6.596818005432674, + "grad_norm": 1.5057084560394287, + "learning_rate": 0.0002, + "loss": 0.9079, + "step": 8500 + }, + { + "epoch": 6.604578967792007, + "grad_norm": 1.648289442062378, + "learning_rate": 0.0002, + "loss": 0.9235, + "step": 8510 + }, + { + "epoch": 6.612339930151339, + "grad_norm": 1.578914761543274, + "learning_rate": 0.0002, + "loss": 0.9113, + "step": 8520 + }, + { + "epoch": 6.6201008925106715, + "grad_norm": 1.5064080953598022, + "learning_rate": 0.0002, + "loss": 0.8894, + "step": 8530 + }, + { + "epoch": 6.6278618548700035, + "grad_norm": 1.5717744827270508, + "learning_rate": 0.0002, + "loss": 0.8981, + "step": 8540 + }, + { + "epoch": 6.635622817229336, + "grad_norm": 1.7954767942428589, + "learning_rate": 0.0002, + "loss": 0.887, + "step": 8550 + }, + { + "epoch": 6.643383779588669, + "grad_norm": 1.6172343492507935, + "learning_rate": 0.0002, + "loss": 0.927, + "step": 8560 + }, + { + "epoch": 6.651144741948001, + "grad_norm": 1.6627886295318604, + "learning_rate": 0.0002, + "loss": 0.9384, + "step": 8570 + }, + { + "epoch": 6.658905704307334, + "grad_norm": 1.5264919996261597, + "learning_rate": 0.0002, + "loss": 0.959, + "step": 8580 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.609248161315918, + "learning_rate": 0.0002, + "loss": 0.9103, + "step": 8590 + }, + { + "epoch": 6.674427629025999, + "grad_norm": 1.5474581718444824, + "learning_rate": 0.0002, + "loss": 0.9395, + "step": 8600 + }, + { + "epoch": 6.682188591385332, + "grad_norm": 1.6294898986816406, + "learning_rate": 0.0002, + "loss": 0.9018, + "step": 8610 + }, + { + "epoch": 6.689949553744665, + "grad_norm": 1.612615942955017, + "learning_rate": 0.0002, + "loss": 0.9323, + "step": 8620 + }, + { + "epoch": 6.697710516103997, + "grad_norm": 1.741325855255127, + "learning_rate": 0.0002, + "loss": 0.9218, + "step": 8630 + }, + { + "epoch": 6.70547147846333, + "grad_norm": 1.5089004039764404, + "learning_rate": 0.0002, + "loss": 1.0475, + "step": 8640 + }, + { + "epoch": 6.713232440822662, + "grad_norm": 1.4725582599639893, + "learning_rate": 0.0002, + "loss": 1.0009, + "step": 8650 + }, + { + "epoch": 6.7209934031819945, + "grad_norm": 1.6992095708847046, + "learning_rate": 0.0002, + "loss": 0.9818, + "step": 8660 + }, + { + "epoch": 6.728754365541327, + "grad_norm": 1.5938470363616943, + "learning_rate": 0.0002, + "loss": 0.9229, + "step": 8670 + }, + { + "epoch": 6.736515327900659, + "grad_norm": 1.58723783493042, + "learning_rate": 0.0002, + "loss": 0.9411, + "step": 8680 + }, + { + "epoch": 6.744276290259992, + "grad_norm": 1.514389991760254, + "learning_rate": 0.0002, + "loss": 0.9738, + "step": 8690 + }, + { + "epoch": 6.752037252619325, + "grad_norm": 1.6799157857894897, + "learning_rate": 0.0002, + "loss": 0.9283, + "step": 8700 + }, + { + "epoch": 6.759798214978657, + "grad_norm": 1.5436359643936157, + "learning_rate": 0.0002, + "loss": 0.9138, + "step": 8710 + }, + { + "epoch": 6.76755917733799, + "grad_norm": 1.477137565612793, + "learning_rate": 0.0002, + "loss": 0.9678, + "step": 8720 + }, + { + "epoch": 6.775320139697323, + "grad_norm": 1.7383503913879395, + "learning_rate": 0.0002, + "loss": 1.0044, + "step": 8730 + }, + { + "epoch": 6.783081102056655, + "grad_norm": 1.8000324964523315, + "learning_rate": 0.0002, + "loss": 0.9492, + "step": 8740 + }, + { + "epoch": 6.790842064415988, + "grad_norm": 1.3099453449249268, + "learning_rate": 0.0002, + "loss": 0.8943, + "step": 8750 + }, + { + "epoch": 6.79860302677532, + "grad_norm": 1.8775172233581543, + "learning_rate": 0.0002, + "loss": 0.9709, + "step": 8760 + }, + { + "epoch": 6.806363989134653, + "grad_norm": 1.5832085609436035, + "learning_rate": 0.0002, + "loss": 0.9356, + "step": 8770 + }, + { + "epoch": 6.8141249514939854, + "grad_norm": 1.4903252124786377, + "learning_rate": 0.0002, + "loss": 0.9397, + "step": 8780 + }, + { + "epoch": 6.821885913853317, + "grad_norm": 1.6360470056533813, + "learning_rate": 0.0002, + "loss": 0.9602, + "step": 8790 + }, + { + "epoch": 6.82964687621265, + "grad_norm": 1.5457707643508911, + "learning_rate": 0.0002, + "loss": 0.957, + "step": 8800 + }, + { + "epoch": 6.837407838571983, + "grad_norm": 1.5449066162109375, + "learning_rate": 0.0002, + "loss": 0.943, + "step": 8810 + }, + { + "epoch": 6.845168800931315, + "grad_norm": 1.618337631225586, + "learning_rate": 0.0002, + "loss": 1.0007, + "step": 8820 + }, + { + "epoch": 6.852929763290648, + "grad_norm": 1.38296639919281, + "learning_rate": 0.0002, + "loss": 0.9314, + "step": 8830 + }, + { + "epoch": 6.860690725649981, + "grad_norm": 1.6427991390228271, + "learning_rate": 0.0002, + "loss": 0.9349, + "step": 8840 + }, + { + "epoch": 6.868451688009313, + "grad_norm": 1.4980270862579346, + "learning_rate": 0.0002, + "loss": 1.0194, + "step": 8850 + }, + { + "epoch": 6.876212650368646, + "grad_norm": 1.3800020217895508, + "learning_rate": 0.0002, + "loss": 0.9541, + "step": 8860 + }, + { + "epoch": 6.883973612727978, + "grad_norm": 1.5971838235855103, + "learning_rate": 0.0002, + "loss": 1.0102, + "step": 8870 + }, + { + "epoch": 6.891734575087311, + "grad_norm": 1.4429489374160767, + "learning_rate": 0.0002, + "loss": 1.0105, + "step": 8880 + }, + { + "epoch": 6.899495537446644, + "grad_norm": 1.4959166049957275, + "learning_rate": 0.0002, + "loss": 0.9143, + "step": 8890 + }, + { + "epoch": 6.907256499805976, + "grad_norm": 1.5776222944259644, + "learning_rate": 0.0002, + "loss": 0.9403, + "step": 8900 + }, + { + "epoch": 6.915017462165308, + "grad_norm": 1.510412573814392, + "learning_rate": 0.0002, + "loss": 0.9256, + "step": 8910 + }, + { + "epoch": 6.922778424524641, + "grad_norm": 1.7216295003890991, + "learning_rate": 0.0002, + "loss": 1.0095, + "step": 8920 + }, + { + "epoch": 6.930539386883973, + "grad_norm": 1.830762505531311, + "learning_rate": 0.0002, + "loss": 0.9464, + "step": 8930 + }, + { + "epoch": 6.938300349243306, + "grad_norm": 1.3472434282302856, + "learning_rate": 0.0002, + "loss": 0.9704, + "step": 8940 + }, + { + "epoch": 6.946061311602639, + "grad_norm": 1.5748040676116943, + "learning_rate": 0.0002, + "loss": 0.9718, + "step": 8950 + }, + { + "epoch": 6.953822273961971, + "grad_norm": 1.5317506790161133, + "learning_rate": 0.0002, + "loss": 0.9891, + "step": 8960 + }, + { + "epoch": 6.961583236321304, + "grad_norm": 1.5565721988677979, + "learning_rate": 0.0002, + "loss": 0.9513, + "step": 8970 + }, + { + "epoch": 6.969344198680636, + "grad_norm": 1.5288970470428467, + "learning_rate": 0.0002, + "loss": 0.9118, + "step": 8980 + }, + { + "epoch": 6.977105161039969, + "grad_norm": 1.562624454498291, + "learning_rate": 0.0002, + "loss": 0.9789, + "step": 8990 + }, + { + "epoch": 6.984866123399302, + "grad_norm": 1.3777633905410767, + "learning_rate": 0.0002, + "loss": 0.9929, + "step": 9000 + }, + { + "epoch": 6.992627085758635, + "grad_norm": 1.5868972539901733, + "learning_rate": 0.0002, + "loss": 0.9713, + "step": 9010 + }, + { + "epoch": 6.9996119518820334, + "eval_loss": 2.4372169971466064, + "eval_runtime": 113.8966, + "eval_samples_per_second": 4.451, + "eval_steps_per_second": 0.562, + "step": 9019 + } + ], + "logging_steps": 10, + "max_steps": 10304, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.632753036931891e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..8ca6e2c3ac58fa2af9f99747566f932f41a5a4d5 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-9019/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f7feb06ff53d5bf79374054a25b662309e705a2ca08dfa3b0bce7b8b4632fae +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..a4a305d1de4d8f47c0252b4d7fe65a10dd8e2c22 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7eee611703c5ce5d1eee32d9cdcfe465647b8aff0c1dfb3bed7ad7dbb05060 +size 34362873 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..8ca6e2c3ac58fa2af9f99747566f932f41a5a4d5 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f7feb06ff53d5bf79374054a25b662309e705a2ca08dfa3b0bce7b8b4632fae +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/training_log.jsonl b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..14a0a3d0bef9090dc1477c17625532da9ecc79a5 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/training_log.jsonl @@ -0,0 +1,10 @@ +{"epoch": 0.9996119518820333, "step": 1288, "epoch_duration": 4887.0092253685, "total_accumulated_duration": 4887.0092253685, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12786.0}, "peak_memory_reserved": {"GPU_0": 12786.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0855, "grad_norm": 1.0751162767410278, "learning_rate": 0.0002, "epoch": 0.007760962359332557, "step": 10}, {"loss": 2.4744, "grad_norm": 0.4697345793247223, "learning_rate": 0.0002, "epoch": 0.015521924718665115, "step": 20}, {"loss": 2.193, "grad_norm": 0.5370839238166809, "learning_rate": 0.0002, "epoch": 0.023282887077997673, "step": 30}, {"loss": 2.0599, "grad_norm": 0.46794816851615906, "learning_rate": 0.0002, "epoch": 0.03104384943733023, "step": 40}, {"loss": 1.9354, "grad_norm": 0.44624820351600647, "learning_rate": 0.0002, "epoch": 0.038804811796662786, "step": 50}, {"loss": 1.9319, "grad_norm": 0.3953201472759247, "learning_rate": 0.0002, "epoch": 0.046565774155995346, "step": 60}, {"loss": 1.9099, "grad_norm": 0.3935912549495697, "learning_rate": 0.0002, "epoch": 0.0543267365153279, "step": 70}, {"loss": 1.8795, "grad_norm": 0.4520699381828308, "learning_rate": 0.0002, "epoch": 0.06208769887466046, "step": 80}, {"loss": 1.8354, "grad_norm": 0.3801847994327545, "learning_rate": 0.0002, "epoch": 0.06984866123399301, "step": 90}, {"loss": 1.9053, "grad_norm": 0.4020165205001831, "learning_rate": 0.0002, "epoch": 0.07760962359332557, "step": 100}, {"loss": 1.8779, "grad_norm": 0.3860672116279602, "learning_rate": 0.0002, "epoch": 0.08537058595265813, "step": 110}, {"loss": 1.8731, "grad_norm": 0.3681113123893738, "learning_rate": 0.0002, "epoch": 0.09313154831199069, "step": 120}, {"loss": 1.8157, "grad_norm": 0.3594866991043091, "learning_rate": 0.0002, "epoch": 0.10089251067132324, "step": 130}, {"loss": 1.8266, "grad_norm": 0.3879193663597107, "learning_rate": 0.0002, "epoch": 0.1086534730306558, "step": 140}, {"loss": 1.8818, "grad_norm": 0.3270505666732788, "learning_rate": 0.0002, "epoch": 0.11641443538998836, "step": 150}, {"loss": 1.87, "grad_norm": 0.36824458837509155, "learning_rate": 0.0002, "epoch": 0.12417539774932092, "step": 160}, {"loss": 1.8305, "grad_norm": 0.383882075548172, "learning_rate": 0.0002, "epoch": 0.13193636010865348, "step": 170}, {"loss": 1.8584, "grad_norm": 0.3368665874004364, "learning_rate": 0.0002, "epoch": 0.13969732246798602, "step": 180}, {"loss": 1.7882, "grad_norm": 0.35961097478866577, "learning_rate": 0.0002, "epoch": 0.1474582848273186, "step": 190}, {"loss": 1.8467, "grad_norm": 0.3415963351726532, "learning_rate": 0.0002, "epoch": 0.15521924718665114, "step": 200}, {"loss": 1.8543, "grad_norm": 0.4100632071495056, "learning_rate": 0.0002, "epoch": 0.1629802095459837, "step": 210}, {"loss": 1.8226, "grad_norm": 0.3516307473182678, "learning_rate": 0.0002, "epoch": 0.17074117190531626, "step": 220}, {"loss": 1.7386, "grad_norm": 0.37919050455093384, "learning_rate": 0.0002, "epoch": 0.1785021342646488, "step": 230}, {"loss": 1.7937, "grad_norm": 0.33270683884620667, "learning_rate": 0.0002, "epoch": 0.18626309662398138, "step": 240}, {"loss": 1.7925, "grad_norm": 0.3348783254623413, "learning_rate": 0.0002, "epoch": 0.19402405898331393, "step": 250}, {"loss": 1.7774, "grad_norm": 0.3888475298881531, "learning_rate": 0.0002, "epoch": 0.20178502134264648, "step": 260}, {"loss": 1.8381, "grad_norm": 0.3554602861404419, "learning_rate": 0.0002, "epoch": 0.20954598370197905, "step": 270}, {"loss": 1.8359, "grad_norm": 0.33277708292007446, "learning_rate": 0.0002, "epoch": 0.2173069460613116, "step": 280}, {"loss": 1.7713, "grad_norm": 0.3281584680080414, "learning_rate": 0.0002, "epoch": 0.22506790842064417, "step": 290}, {"loss": 1.8181, "grad_norm": 0.3185969591140747, "learning_rate": 0.0002, "epoch": 0.23282887077997672, "step": 300}, {"loss": 1.8595, "grad_norm": 0.35335442423820496, "learning_rate": 0.0002, "epoch": 0.24058983313930926, "step": 310}, {"loss": 1.87, "grad_norm": 0.3119595944881439, "learning_rate": 0.0002, "epoch": 0.24835079549864184, "step": 320}, {"loss": 1.8357, "grad_norm": 0.36424458026885986, "learning_rate": 0.0002, "epoch": 0.2561117578579744, "step": 330}, {"loss": 1.8003, "grad_norm": 0.3618951141834259, "learning_rate": 0.0002, "epoch": 0.26387272021730696, "step": 340}, {"loss": 1.8221, "grad_norm": 0.312757670879364, "learning_rate": 0.0002, "epoch": 0.2716336825766395, "step": 350}, {"loss": 1.9031, "grad_norm": 0.326016366481781, "learning_rate": 0.0002, "epoch": 0.27939464493597205, "step": 360}, {"loss": 1.8214, "grad_norm": 0.34093883633613586, "learning_rate": 0.0002, "epoch": 0.2871556072953046, "step": 370}, {"loss": 1.7733, "grad_norm": 0.32325029373168945, "learning_rate": 0.0002, "epoch": 0.2949165696546372, "step": 380}, {"loss": 1.842, "grad_norm": 0.34105437994003296, "learning_rate": 0.0002, "epoch": 0.30267753201396974, "step": 390}, {"loss": 1.7926, "grad_norm": 0.32565295696258545, "learning_rate": 0.0002, "epoch": 0.3104384943733023, "step": 400}, {"loss": 1.8031, "grad_norm": 0.32742050290107727, "learning_rate": 0.0002, "epoch": 0.31819945673263483, "step": 410}, {"loss": 1.907, "grad_norm": 0.30233046412467957, "learning_rate": 0.0002, "epoch": 0.3259604190919674, "step": 420}, {"loss": 1.7623, "grad_norm": 0.32419222593307495, "learning_rate": 0.0002, "epoch": 0.3337213814513, "step": 430}, {"loss": 1.865, "grad_norm": 0.3653007745742798, "learning_rate": 0.0002, "epoch": 0.3414823438106325, "step": 440}, {"loss": 1.8044, "grad_norm": 0.31617099046707153, "learning_rate": 0.0002, "epoch": 0.3492433061699651, "step": 450}, {"loss": 1.7677, "grad_norm": 0.3305962085723877, "learning_rate": 0.0002, "epoch": 0.3570042685292976, "step": 460}, {"loss": 1.8155, "grad_norm": 0.3178933262825012, "learning_rate": 0.0002, "epoch": 0.36476523088863017, "step": 470}, {"loss": 1.7485, "grad_norm": 0.37163782119750977, "learning_rate": 0.0002, "epoch": 0.37252619324796277, "step": 480}, {"loss": 1.8804, "grad_norm": 0.469844788312912, "learning_rate": 0.0002, "epoch": 0.3802871556072953, "step": 490}, {"loss": 1.8343, "grad_norm": 0.3409338593482971, "learning_rate": 0.0002, "epoch": 0.38804811796662786, "step": 500}, {"loss": 1.8433, "grad_norm": 0.31943467259407043, "learning_rate": 0.0002, "epoch": 0.3958090803259604, "step": 510}, {"loss": 1.7873, "grad_norm": 0.32293614745140076, "learning_rate": 0.0002, "epoch": 0.40357004268529295, "step": 520}, {"loss": 1.8584, "grad_norm": 0.2994382977485657, "learning_rate": 0.0002, "epoch": 0.41133100504462555, "step": 530}, {"loss": 1.8153, "grad_norm": 0.3273141384124756, "learning_rate": 0.0002, "epoch": 0.4190919674039581, "step": 540}, {"loss": 1.8097, "grad_norm": 0.3020550012588501, "learning_rate": 0.0002, "epoch": 0.42685292976329064, "step": 550}, {"loss": 1.8551, "grad_norm": 0.30113112926483154, "learning_rate": 0.0002, "epoch": 0.4346138921226232, "step": 560}, {"loss": 1.8084, "grad_norm": 0.30274903774261475, "learning_rate": 0.0002, "epoch": 0.44237485448195574, "step": 570}, {"loss": 1.7673, "grad_norm": 0.3231128454208374, "learning_rate": 0.0002, "epoch": 0.45013581684128834, "step": 580}, {"loss": 1.7848, "grad_norm": 0.3255121409893036, "learning_rate": 0.0002, "epoch": 0.4578967792006209, "step": 590}, {"loss": 1.8227, "grad_norm": 0.30147507786750793, "learning_rate": 0.0002, "epoch": 0.46565774155995343, "step": 600}, {"loss": 1.7572, "grad_norm": 0.29781386256217957, "learning_rate": 0.0002, "epoch": 0.473418703919286, "step": 610}, {"loss": 1.8307, "grad_norm": 0.30914685130119324, "learning_rate": 0.0002, "epoch": 0.4811796662786185, "step": 620}, {"loss": 1.805, "grad_norm": 0.3110593855381012, "learning_rate": 0.0002, "epoch": 0.4889406286379511, "step": 630}, {"loss": 1.8228, "grad_norm": 0.3298132121562958, "learning_rate": 0.0002, "epoch": 0.49670159099728367, "step": 640}, {"loss": 1.7816, "grad_norm": 0.322122186422348, "learning_rate": 0.0002, "epoch": 0.5044625533566163, "step": 650}, {"loss": 1.8001, "grad_norm": 0.3504371643066406, "learning_rate": 0.0002, "epoch": 0.5122235157159488, "step": 660}, {"loss": 1.8682, "grad_norm": 0.3102182149887085, "learning_rate": 0.0002, "epoch": 0.5199844780752814, "step": 670}, {"loss": 1.7494, "grad_norm": 0.6113658547401428, "learning_rate": 0.0002, "epoch": 0.5277454404346139, "step": 680}, {"loss": 1.7096, "grad_norm": 0.31841862201690674, "learning_rate": 0.0002, "epoch": 0.5355064027939465, "step": 690}, {"loss": 1.7587, "grad_norm": 0.2830526530742645, "learning_rate": 0.0002, "epoch": 0.543267365153279, "step": 700}, {"loss": 1.7887, "grad_norm": 0.3048769533634186, "learning_rate": 0.0002, "epoch": 0.5510283275126115, "step": 710}, {"loss": 1.8416, "grad_norm": 0.2719033658504486, "learning_rate": 0.0002, "epoch": 0.5587892898719441, "step": 720}, {"loss": 1.786, "grad_norm": 0.3176722526550293, "learning_rate": 0.0002, "epoch": 0.5665502522312766, "step": 730}, {"loss": 1.7127, "grad_norm": 0.32491734623908997, "learning_rate": 0.0002, "epoch": 0.5743112145906092, "step": 740}, {"loss": 1.7892, "grad_norm": 0.32746851444244385, "learning_rate": 0.0002, "epoch": 0.5820721769499418, "step": 750}, {"loss": 1.7811, "grad_norm": 0.3055773973464966, "learning_rate": 0.0002, "epoch": 0.5898331393092744, "step": 760}, {"loss": 1.8597, "grad_norm": 0.30671584606170654, "learning_rate": 0.0002, "epoch": 0.5975941016686069, "step": 770}, {"loss": 1.7728, "grad_norm": 0.28770264983177185, "learning_rate": 0.0002, "epoch": 0.6053550640279395, "step": 780}, {"loss": 1.7025, "grad_norm": 0.2814285457134247, "learning_rate": 0.0002, "epoch": 0.613116026387272, "step": 790}, {"loss": 1.819, "grad_norm": 0.31554412841796875, "learning_rate": 0.0002, "epoch": 0.6208769887466046, "step": 800}, {"loss": 1.8335, "grad_norm": 0.2984226942062378, "learning_rate": 0.0002, "epoch": 0.6286379511059371, "step": 810}, {"loss": 1.7728, "grad_norm": 0.2859906554222107, "learning_rate": 0.0002, "epoch": 0.6363989134652697, "step": 820}, {"loss": 1.7408, "grad_norm": 0.2887928783893585, "learning_rate": 0.0002, "epoch": 0.6441598758246022, "step": 830}, {"loss": 1.8071, "grad_norm": 0.31287339329719543, "learning_rate": 0.0002, "epoch": 0.6519208381839348, "step": 840}, {"loss": 1.7673, "grad_norm": 0.32064181566238403, "learning_rate": 0.0002, "epoch": 0.6596818005432674, "step": 850}, {"loss": 1.7849, "grad_norm": 0.290981650352478, "learning_rate": 0.0002, "epoch": 0.6674427629026, "step": 860}, {"loss": 1.8089, "grad_norm": 0.33060121536254883, "learning_rate": 0.0002, "epoch": 0.6752037252619325, "step": 870}, {"loss": 1.7357, "grad_norm": 0.27032899856567383, "learning_rate": 0.0002, "epoch": 0.682964687621265, "step": 880}, {"loss": 1.8423, "grad_norm": 0.29031234979629517, "learning_rate": 0.0002, "epoch": 0.6907256499805976, "step": 890}, {"loss": 1.835, "grad_norm": 0.2845142185688019, "learning_rate": 0.0002, "epoch": 0.6984866123399301, "step": 900}, {"loss": 1.77, "grad_norm": 0.8638312816619873, "learning_rate": 0.0002, "epoch": 0.7062475746992627, "step": 910}, {"loss": 1.7757, "grad_norm": 0.3086668848991394, "learning_rate": 0.0002, "epoch": 0.7140085370585952, "step": 920}, {"loss": 1.7541, "grad_norm": 0.2724177837371826, "learning_rate": 0.0002, "epoch": 0.7217694994179278, "step": 930}, {"loss": 1.816, "grad_norm": 0.289559006690979, "learning_rate": 0.0002, "epoch": 0.7295304617772603, "step": 940}, {"loss": 1.7654, "grad_norm": 0.3000658452510834, "learning_rate": 0.0002, "epoch": 0.737291424136593, "step": 950}, {"loss": 1.7736, "grad_norm": 0.33544042706489563, "learning_rate": 0.0002, "epoch": 0.7450523864959255, "step": 960}, {"loss": 1.6979, "grad_norm": 0.28593236207962036, "learning_rate": 0.0002, "epoch": 0.7528133488552581, "step": 970}, {"loss": 1.8583, "grad_norm": 0.313634991645813, "learning_rate": 0.0002, "epoch": 0.7605743112145906, "step": 980}, {"loss": 1.7473, "grad_norm": 0.2949385941028595, "learning_rate": 0.0002, "epoch": 0.7683352735739232, "step": 990}, {"loss": 1.8689, "grad_norm": 0.2920108437538147, "learning_rate": 0.0002, "epoch": 0.7760962359332557, "step": 1000}, {"loss": 1.8401, "grad_norm": 0.3245100677013397, "learning_rate": 0.0002, "epoch": 0.7838571982925883, "step": 1010}, {"loss": 1.7109, "grad_norm": 0.3007619380950928, "learning_rate": 0.0002, "epoch": 0.7916181606519208, "step": 1020}, {"loss": 1.7427, "grad_norm": 0.3630852997303009, "learning_rate": 0.0002, "epoch": 0.7993791230112534, "step": 1030}, {"loss": 1.7655, "grad_norm": 0.2856379747390747, "learning_rate": 0.0002, "epoch": 0.8071400853705859, "step": 1040}, {"loss": 1.8371, "grad_norm": 0.32476478815078735, "learning_rate": 0.0002, "epoch": 0.8149010477299186, "step": 1050}, {"loss": 1.8039, "grad_norm": 0.5162565112113953, "learning_rate": 0.0002, "epoch": 0.8226620100892511, "step": 1060}, {"loss": 1.8862, "grad_norm": 0.316496342420578, "learning_rate": 0.0002, "epoch": 0.8304229724485837, "step": 1070}, {"loss": 1.8023, "grad_norm": 0.31977516412734985, "learning_rate": 0.0002, "epoch": 0.8381839348079162, "step": 1080}, {"loss": 1.8547, "grad_norm": 0.269509494304657, "learning_rate": 0.0002, "epoch": 0.8459448971672487, "step": 1090}, {"loss": 1.7811, "grad_norm": 0.31621453166007996, "learning_rate": 0.0002, "epoch": 0.8537058595265813, "step": 1100}, {"loss": 1.739, "grad_norm": 0.2946535050868988, "learning_rate": 0.0002, "epoch": 0.8614668218859138, "step": 1110}, {"loss": 1.7511, "grad_norm": 0.3088909983634949, "learning_rate": 0.0002, "epoch": 0.8692277842452464, "step": 1120}, {"loss": 1.8228, "grad_norm": 0.33033716678619385, "learning_rate": 0.0002, "epoch": 0.8769887466045789, "step": 1130}, {"loss": 1.7912, "grad_norm": 0.2954833507537842, "learning_rate": 0.0002, "epoch": 0.8847497089639115, "step": 1140}, {"loss": 1.8394, "grad_norm": 0.2950248122215271, "learning_rate": 0.0002, "epoch": 0.8925106713232441, "step": 1150}, {"loss": 1.7068, "grad_norm": 0.296661913394928, "learning_rate": 0.0002, "epoch": 0.9002716336825767, "step": 1160}, {"loss": 1.7967, "grad_norm": 0.35451310873031616, "learning_rate": 0.0002, "epoch": 0.9080325960419092, "step": 1170}, {"loss": 1.8202, "grad_norm": 0.32705947756767273, "learning_rate": 0.0002, "epoch": 0.9157935584012418, "step": 1180}, {"loss": 1.7396, "grad_norm": 0.3333960771560669, "learning_rate": 0.0002, "epoch": 0.9235545207605743, "step": 1190}, {"loss": 1.7801, "grad_norm": 0.3042232096195221, "learning_rate": 0.0002, "epoch": 0.9313154831199069, "step": 1200}, {"loss": 1.7586, "grad_norm": 0.281553715467453, "learning_rate": 0.0002, "epoch": 0.9390764454792394, "step": 1210}, {"loss": 1.7953, "grad_norm": 0.3096391558647156, "learning_rate": 0.0002, "epoch": 0.946837407838572, "step": 1220}, {"loss": 1.7401, "grad_norm": 0.2866271734237671, "learning_rate": 0.0002, "epoch": 0.9545983701979045, "step": 1230}, {"loss": 1.7211, "grad_norm": 0.28394097089767456, "learning_rate": 0.0002, "epoch": 0.962359332557237, "step": 1240}, {"loss": 1.7363, "grad_norm": 0.3249266743659973, "learning_rate": 0.0002, "epoch": 0.9701202949165697, "step": 1250}, {"loss": 1.7563, "grad_norm": 0.2896869480609894, "learning_rate": 0.0002, "epoch": 0.9778812572759022, "step": 1260}, {"loss": 1.6389, "grad_norm": 0.29224586486816406, "learning_rate": 0.0002, "epoch": 0.9856422196352348, "step": 1270}, {"loss": 1.7111, "grad_norm": 0.2820223569869995, "learning_rate": 0.0002, "epoch": 0.9934031819945673, "step": 1280}]} +{"epoch": 2.0, "step": 2577, "epoch_duration": 4024.7662012577057, "total_accumulated_duration": 8911.775426626205, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28774.0}, "peak_memory_reserved": {"GPU_0": 28774.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0855, "grad_norm": 1.0751162767410278, "learning_rate": 0.0002, "epoch": 0.007760962359332557, "step": 10}, {"loss": 2.4744, "grad_norm": 0.4697345793247223, "learning_rate": 0.0002, "epoch": 0.015521924718665115, "step": 20}, {"loss": 2.193, "grad_norm": 0.5370839238166809, "learning_rate": 0.0002, "epoch": 0.023282887077997673, "step": 30}, {"loss": 2.0599, "grad_norm": 0.46794816851615906, "learning_rate": 0.0002, "epoch": 0.03104384943733023, "step": 40}, {"loss": 1.9354, "grad_norm": 0.44624820351600647, "learning_rate": 0.0002, "epoch": 0.038804811796662786, "step": 50}, {"loss": 1.9319, "grad_norm": 0.3953201472759247, "learning_rate": 0.0002, "epoch": 0.046565774155995346, "step": 60}, {"loss": 1.9099, "grad_norm": 0.3935912549495697, "learning_rate": 0.0002, "epoch": 0.0543267365153279, "step": 70}, {"loss": 1.8795, "grad_norm": 0.4520699381828308, "learning_rate": 0.0002, "epoch": 0.06208769887466046, "step": 80}, {"loss": 1.8354, "grad_norm": 0.3801847994327545, "learning_rate": 0.0002, "epoch": 0.06984866123399301, "step": 90}, {"loss": 1.9053, "grad_norm": 0.4020165205001831, "learning_rate": 0.0002, "epoch": 0.07760962359332557, "step": 100}, {"loss": 1.8779, "grad_norm": 0.3860672116279602, "learning_rate": 0.0002, "epoch": 0.08537058595265813, "step": 110}, {"loss": 1.8731, "grad_norm": 0.3681113123893738, "learning_rate": 0.0002, "epoch": 0.09313154831199069, "step": 120}, {"loss": 1.8157, "grad_norm": 0.3594866991043091, "learning_rate": 0.0002, "epoch": 0.10089251067132324, "step": 130}, {"loss": 1.8266, "grad_norm": 0.3879193663597107, "learning_rate": 0.0002, "epoch": 0.1086534730306558, "step": 140}, {"loss": 1.8818, "grad_norm": 0.3270505666732788, "learning_rate": 0.0002, "epoch": 0.11641443538998836, "step": 150}, {"loss": 1.87, "grad_norm": 0.36824458837509155, "learning_rate": 0.0002, "epoch": 0.12417539774932092, "step": 160}, {"loss": 1.8305, "grad_norm": 0.383882075548172, "learning_rate": 0.0002, "epoch": 0.13193636010865348, "step": 170}, {"loss": 1.8584, "grad_norm": 0.3368665874004364, "learning_rate": 0.0002, "epoch": 0.13969732246798602, "step": 180}, {"loss": 1.7882, "grad_norm": 0.35961097478866577, "learning_rate": 0.0002, "epoch": 0.1474582848273186, "step": 190}, {"loss": 1.8467, "grad_norm": 0.3415963351726532, "learning_rate": 0.0002, "epoch": 0.15521924718665114, "step": 200}, {"loss": 1.8543, "grad_norm": 0.4100632071495056, "learning_rate": 0.0002, "epoch": 0.1629802095459837, "step": 210}, {"loss": 1.8226, "grad_norm": 0.3516307473182678, "learning_rate": 0.0002, "epoch": 0.17074117190531626, "step": 220}, {"loss": 1.7386, "grad_norm": 0.37919050455093384, "learning_rate": 0.0002, "epoch": 0.1785021342646488, "step": 230}, {"loss": 1.7937, "grad_norm": 0.33270683884620667, "learning_rate": 0.0002, "epoch": 0.18626309662398138, "step": 240}, {"loss": 1.7925, "grad_norm": 0.3348783254623413, "learning_rate": 0.0002, "epoch": 0.19402405898331393, "step": 250}, {"loss": 1.7774, "grad_norm": 0.3888475298881531, "learning_rate": 0.0002, "epoch": 0.20178502134264648, "step": 260}, {"loss": 1.8381, "grad_norm": 0.3554602861404419, "learning_rate": 0.0002, "epoch": 0.20954598370197905, "step": 270}, {"loss": 1.8359, "grad_norm": 0.33277708292007446, "learning_rate": 0.0002, "epoch": 0.2173069460613116, "step": 280}, {"loss": 1.7713, "grad_norm": 0.3281584680080414, "learning_rate": 0.0002, "epoch": 0.22506790842064417, "step": 290}, {"loss": 1.8181, "grad_norm": 0.3185969591140747, "learning_rate": 0.0002, "epoch": 0.23282887077997672, "step": 300}, {"loss": 1.8595, "grad_norm": 0.35335442423820496, "learning_rate": 0.0002, "epoch": 0.24058983313930926, "step": 310}, {"loss": 1.87, "grad_norm": 0.3119595944881439, "learning_rate": 0.0002, "epoch": 0.24835079549864184, "step": 320}, {"loss": 1.8357, "grad_norm": 0.36424458026885986, "learning_rate": 0.0002, "epoch": 0.2561117578579744, "step": 330}, {"loss": 1.8003, "grad_norm": 0.3618951141834259, "learning_rate": 0.0002, "epoch": 0.26387272021730696, "step": 340}, {"loss": 1.8221, "grad_norm": 0.312757670879364, "learning_rate": 0.0002, "epoch": 0.2716336825766395, "step": 350}, {"loss": 1.9031, "grad_norm": 0.326016366481781, "learning_rate": 0.0002, "epoch": 0.27939464493597205, "step": 360}, {"loss": 1.8214, "grad_norm": 0.34093883633613586, "learning_rate": 0.0002, "epoch": 0.2871556072953046, "step": 370}, {"loss": 1.7733, "grad_norm": 0.32325029373168945, "learning_rate": 0.0002, "epoch": 0.2949165696546372, "step": 380}, {"loss": 1.842, "grad_norm": 0.34105437994003296, "learning_rate": 0.0002, "epoch": 0.30267753201396974, "step": 390}, {"loss": 1.7926, "grad_norm": 0.32565295696258545, "learning_rate": 0.0002, "epoch": 0.3104384943733023, "step": 400}, {"loss": 1.8031, "grad_norm": 0.32742050290107727, "learning_rate": 0.0002, "epoch": 0.31819945673263483, "step": 410}, {"loss": 1.907, "grad_norm": 0.30233046412467957, "learning_rate": 0.0002, "epoch": 0.3259604190919674, "step": 420}, {"loss": 1.7623, "grad_norm": 0.32419222593307495, "learning_rate": 0.0002, "epoch": 0.3337213814513, "step": 430}, {"loss": 1.865, "grad_norm": 0.3653007745742798, "learning_rate": 0.0002, "epoch": 0.3414823438106325, "step": 440}, {"loss": 1.8044, "grad_norm": 0.31617099046707153, "learning_rate": 0.0002, "epoch": 0.3492433061699651, "step": 450}, {"loss": 1.7677, "grad_norm": 0.3305962085723877, "learning_rate": 0.0002, "epoch": 0.3570042685292976, "step": 460}, {"loss": 1.8155, "grad_norm": 0.3178933262825012, "learning_rate": 0.0002, "epoch": 0.36476523088863017, "step": 470}, {"loss": 1.7485, "grad_norm": 0.37163782119750977, "learning_rate": 0.0002, "epoch": 0.37252619324796277, "step": 480}, {"loss": 1.8804, "grad_norm": 0.469844788312912, "learning_rate": 0.0002, "epoch": 0.3802871556072953, "step": 490}, {"loss": 1.8343, "grad_norm": 0.3409338593482971, "learning_rate": 0.0002, "epoch": 0.38804811796662786, "step": 500}, {"loss": 1.8433, "grad_norm": 0.31943467259407043, "learning_rate": 0.0002, "epoch": 0.3958090803259604, "step": 510}, {"loss": 1.7873, "grad_norm": 0.32293614745140076, "learning_rate": 0.0002, "epoch": 0.40357004268529295, "step": 520}, {"loss": 1.8584, "grad_norm": 0.2994382977485657, "learning_rate": 0.0002, "epoch": 0.41133100504462555, "step": 530}, {"loss": 1.8153, "grad_norm": 0.3273141384124756, "learning_rate": 0.0002, "epoch": 0.4190919674039581, "step": 540}, {"loss": 1.8097, "grad_norm": 0.3020550012588501, "learning_rate": 0.0002, "epoch": 0.42685292976329064, "step": 550}, {"loss": 1.8551, "grad_norm": 0.30113112926483154, "learning_rate": 0.0002, "epoch": 0.4346138921226232, "step": 560}, {"loss": 1.8084, "grad_norm": 0.30274903774261475, "learning_rate": 0.0002, "epoch": 0.44237485448195574, "step": 570}, {"loss": 1.7673, "grad_norm": 0.3231128454208374, "learning_rate": 0.0002, "epoch": 0.45013581684128834, "step": 580}, {"loss": 1.7848, "grad_norm": 0.3255121409893036, "learning_rate": 0.0002, "epoch": 0.4578967792006209, "step": 590}, {"loss": 1.8227, "grad_norm": 0.30147507786750793, "learning_rate": 0.0002, "epoch": 0.46565774155995343, "step": 600}, {"loss": 1.7572, "grad_norm": 0.29781386256217957, "learning_rate": 0.0002, "epoch": 0.473418703919286, "step": 610}, {"loss": 1.8307, "grad_norm": 0.30914685130119324, "learning_rate": 0.0002, "epoch": 0.4811796662786185, "step": 620}, {"loss": 1.805, "grad_norm": 0.3110593855381012, "learning_rate": 0.0002, "epoch": 0.4889406286379511, "step": 630}, {"loss": 1.8228, "grad_norm": 0.3298132121562958, "learning_rate": 0.0002, "epoch": 0.49670159099728367, "step": 640}, {"loss": 1.7816, "grad_norm": 0.322122186422348, "learning_rate": 0.0002, "epoch": 0.5044625533566163, "step": 650}, {"loss": 1.8001, "grad_norm": 0.3504371643066406, "learning_rate": 0.0002, "epoch": 0.5122235157159488, "step": 660}, {"loss": 1.8682, "grad_norm": 0.3102182149887085, "learning_rate": 0.0002, "epoch": 0.5199844780752814, "step": 670}, {"loss": 1.7494, "grad_norm": 0.6113658547401428, "learning_rate": 0.0002, "epoch": 0.5277454404346139, "step": 680}, {"loss": 1.7096, "grad_norm": 0.31841862201690674, "learning_rate": 0.0002, "epoch": 0.5355064027939465, "step": 690}, {"loss": 1.7587, "grad_norm": 0.2830526530742645, "learning_rate": 0.0002, "epoch": 0.543267365153279, "step": 700}, {"loss": 1.7887, "grad_norm": 0.3048769533634186, "learning_rate": 0.0002, "epoch": 0.5510283275126115, "step": 710}, {"loss": 1.8416, "grad_norm": 0.2719033658504486, "learning_rate": 0.0002, "epoch": 0.5587892898719441, "step": 720}, {"loss": 1.786, "grad_norm": 0.3176722526550293, "learning_rate": 0.0002, "epoch": 0.5665502522312766, "step": 730}, {"loss": 1.7127, "grad_norm": 0.32491734623908997, "learning_rate": 0.0002, "epoch": 0.5743112145906092, "step": 740}, {"loss": 1.7892, "grad_norm": 0.32746851444244385, "learning_rate": 0.0002, "epoch": 0.5820721769499418, "step": 750}, {"loss": 1.7811, "grad_norm": 0.3055773973464966, "learning_rate": 0.0002, "epoch": 0.5898331393092744, "step": 760}, {"loss": 1.8597, "grad_norm": 0.30671584606170654, "learning_rate": 0.0002, "epoch": 0.5975941016686069, "step": 770}, {"loss": 1.7728, "grad_norm": 0.28770264983177185, "learning_rate": 0.0002, "epoch": 0.6053550640279395, "step": 780}, {"loss": 1.7025, "grad_norm": 0.2814285457134247, "learning_rate": 0.0002, "epoch": 0.613116026387272, "step": 790}, {"loss": 1.819, "grad_norm": 0.31554412841796875, "learning_rate": 0.0002, "epoch": 0.6208769887466046, "step": 800}, {"loss": 1.8335, "grad_norm": 0.2984226942062378, "learning_rate": 0.0002, "epoch": 0.6286379511059371, "step": 810}, {"loss": 1.7728, "grad_norm": 0.2859906554222107, "learning_rate": 0.0002, "epoch": 0.6363989134652697, "step": 820}, {"loss": 1.7408, "grad_norm": 0.2887928783893585, "learning_rate": 0.0002, "epoch": 0.6441598758246022, "step": 830}, {"loss": 1.8071, "grad_norm": 0.31287339329719543, "learning_rate": 0.0002, "epoch": 0.6519208381839348, "step": 840}, {"loss": 1.7673, "grad_norm": 0.32064181566238403, "learning_rate": 0.0002, "epoch": 0.6596818005432674, "step": 850}, {"loss": 1.7849, "grad_norm": 0.290981650352478, "learning_rate": 0.0002, "epoch": 0.6674427629026, "step": 860}, {"loss": 1.8089, "grad_norm": 0.33060121536254883, "learning_rate": 0.0002, "epoch": 0.6752037252619325, "step": 870}, {"loss": 1.7357, "grad_norm": 0.27032899856567383, "learning_rate": 0.0002, "epoch": 0.682964687621265, "step": 880}, {"loss": 1.8423, "grad_norm": 0.29031234979629517, "learning_rate": 0.0002, "epoch": 0.6907256499805976, "step": 890}, {"loss": 1.835, "grad_norm": 0.2845142185688019, "learning_rate": 0.0002, "epoch": 0.6984866123399301, "step": 900}, {"loss": 1.77, "grad_norm": 0.8638312816619873, "learning_rate": 0.0002, "epoch": 0.7062475746992627, "step": 910}, {"loss": 1.7757, "grad_norm": 0.3086668848991394, "learning_rate": 0.0002, "epoch": 0.7140085370585952, "step": 920}, {"loss": 1.7541, "grad_norm": 0.2724177837371826, "learning_rate": 0.0002, "epoch": 0.7217694994179278, "step": 930}, {"loss": 1.816, "grad_norm": 0.289559006690979, "learning_rate": 0.0002, "epoch": 0.7295304617772603, "step": 940}, {"loss": 1.7654, "grad_norm": 0.3000658452510834, "learning_rate": 0.0002, "epoch": 0.737291424136593, "step": 950}, {"loss": 1.7736, "grad_norm": 0.33544042706489563, "learning_rate": 0.0002, "epoch": 0.7450523864959255, "step": 960}, {"loss": 1.6979, "grad_norm": 0.28593236207962036, "learning_rate": 0.0002, "epoch": 0.7528133488552581, "step": 970}, {"loss": 1.8583, "grad_norm": 0.313634991645813, "learning_rate": 0.0002, "epoch": 0.7605743112145906, "step": 980}, {"loss": 1.7473, "grad_norm": 0.2949385941028595, "learning_rate": 0.0002, "epoch": 0.7683352735739232, "step": 990}, {"loss": 1.8689, "grad_norm": 0.2920108437538147, "learning_rate": 0.0002, "epoch": 0.7760962359332557, "step": 1000}, {"loss": 1.8401, "grad_norm": 0.3245100677013397, "learning_rate": 0.0002, "epoch": 0.7838571982925883, "step": 1010}, {"loss": 1.7109, "grad_norm": 0.3007619380950928, "learning_rate": 0.0002, "epoch": 0.7916181606519208, "step": 1020}, {"loss": 1.7427, "grad_norm": 0.3630852997303009, "learning_rate": 0.0002, "epoch": 0.7993791230112534, "step": 1030}, {"loss": 1.7655, "grad_norm": 0.2856379747390747, "learning_rate": 0.0002, "epoch": 0.8071400853705859, "step": 1040}, {"loss": 1.8371, "grad_norm": 0.32476478815078735, "learning_rate": 0.0002, "epoch": 0.8149010477299186, "step": 1050}, {"loss": 1.8039, "grad_norm": 0.5162565112113953, "learning_rate": 0.0002, "epoch": 0.8226620100892511, "step": 1060}, {"loss": 1.8862, "grad_norm": 0.316496342420578, "learning_rate": 0.0002, "epoch": 0.8304229724485837, "step": 1070}, {"loss": 1.8023, "grad_norm": 0.31977516412734985, "learning_rate": 0.0002, "epoch": 0.8381839348079162, "step": 1080}, {"loss": 1.8547, "grad_norm": 0.269509494304657, "learning_rate": 0.0002, "epoch": 0.8459448971672487, "step": 1090}, {"loss": 1.7811, "grad_norm": 0.31621453166007996, "learning_rate": 0.0002, "epoch": 0.8537058595265813, "step": 1100}, {"loss": 1.739, "grad_norm": 0.2946535050868988, "learning_rate": 0.0002, "epoch": 0.8614668218859138, "step": 1110}, {"loss": 1.7511, "grad_norm": 0.3088909983634949, "learning_rate": 0.0002, "epoch": 0.8692277842452464, "step": 1120}, {"loss": 1.8228, "grad_norm": 0.33033716678619385, "learning_rate": 0.0002, "epoch": 0.8769887466045789, "step": 1130}, {"loss": 1.7912, "grad_norm": 0.2954833507537842, "learning_rate": 0.0002, "epoch": 0.8847497089639115, "step": 1140}, {"loss": 1.8394, "grad_norm": 0.2950248122215271, "learning_rate": 0.0002, "epoch": 0.8925106713232441, "step": 1150}, {"loss": 1.7068, "grad_norm": 0.296661913394928, "learning_rate": 0.0002, "epoch": 0.9002716336825767, "step": 1160}, {"loss": 1.7967, "grad_norm": 0.35451310873031616, "learning_rate": 0.0002, "epoch": 0.9080325960419092, "step": 1170}, {"loss": 1.8202, "grad_norm": 0.32705947756767273, "learning_rate": 0.0002, "epoch": 0.9157935584012418, "step": 1180}, {"loss": 1.7396, "grad_norm": 0.3333960771560669, "learning_rate": 0.0002, "epoch": 0.9235545207605743, "step": 1190}, {"loss": 1.7801, "grad_norm": 0.3042232096195221, "learning_rate": 0.0002, "epoch": 0.9313154831199069, "step": 1200}, {"loss": 1.7586, "grad_norm": 0.281553715467453, "learning_rate": 0.0002, "epoch": 0.9390764454792394, "step": 1210}, {"loss": 1.7953, "grad_norm": 0.3096391558647156, "learning_rate": 0.0002, "epoch": 0.946837407838572, "step": 1220}, {"loss": 1.7401, "grad_norm": 0.2866271734237671, "learning_rate": 0.0002, "epoch": 0.9545983701979045, "step": 1230}, {"loss": 1.7211, "grad_norm": 0.28394097089767456, "learning_rate": 0.0002, "epoch": 0.962359332557237, "step": 1240}, {"loss": 1.7363, "grad_norm": 0.3249266743659973, "learning_rate": 0.0002, "epoch": 0.9701202949165697, "step": 1250}, {"loss": 1.7563, "grad_norm": 0.2896869480609894, "learning_rate": 0.0002, "epoch": 0.9778812572759022, "step": 1260}, {"loss": 1.6389, "grad_norm": 0.29224586486816406, "learning_rate": 0.0002, "epoch": 0.9856422196352348, "step": 1270}, {"loss": 1.7111, "grad_norm": 0.2820223569869995, "learning_rate": 0.0002, "epoch": 0.9934031819945673, "step": 1280}, {"eval_loss": 1.8081045150756836, "eval_runtime": 114.9794, "eval_samples_per_second": 4.409, "eval_steps_per_second": 0.557, "epoch": 0.9996119518820333, "step": 1288}, {"loss": 1.7518, "grad_norm": 0.3282551169395447, "learning_rate": 0.0002, "epoch": 1.0011641443538999, "step": 1290}, {"loss": 1.6806, "grad_norm": 0.30217495560646057, "learning_rate": 0.0002, "epoch": 1.0089251067132325, "step": 1300}, {"loss": 1.6777, "grad_norm": 0.30801767110824585, "learning_rate": 0.0002, "epoch": 1.016686069072565, "step": 1310}, {"loss": 1.7756, "grad_norm": 0.31816792488098145, "learning_rate": 0.0002, "epoch": 1.0244470314318976, "step": 1320}, {"loss": 1.6986, "grad_norm": 0.27794334292411804, "learning_rate": 0.0002, "epoch": 1.03220799379123, "step": 1330}, {"loss": 1.6931, "grad_norm": 0.3018926680088043, "learning_rate": 0.0002, "epoch": 1.0399689561505627, "step": 1340}, {"loss": 1.7033, "grad_norm": 0.3552975356578827, "learning_rate": 0.0002, "epoch": 1.0477299185098952, "step": 1350}, {"loss": 1.6782, "grad_norm": 0.32590144872665405, "learning_rate": 0.0002, "epoch": 1.0554908808692278, "step": 1360}, {"loss": 1.6479, "grad_norm": 0.3435460925102234, "learning_rate": 0.0002, "epoch": 1.0632518432285603, "step": 1370}, {"loss": 1.7451, "grad_norm": 0.35037797689437866, "learning_rate": 0.0002, "epoch": 1.071012805587893, "step": 1380}, {"loss": 1.7868, "grad_norm": 0.31398263573646545, "learning_rate": 0.0002, "epoch": 1.0787737679472253, "step": 1390}, {"loss": 1.6729, "grad_norm": 0.3134010434150696, "learning_rate": 0.0002, "epoch": 1.086534730306558, "step": 1400}, {"loss": 1.751, "grad_norm": 0.4599704444408417, "learning_rate": 0.0002, "epoch": 1.0942956926658907, "step": 1410}, {"loss": 1.6871, "grad_norm": 0.35852891206741333, "learning_rate": 0.0002, "epoch": 1.102056655025223, "step": 1420}, {"loss": 1.7083, "grad_norm": 0.35628634691238403, "learning_rate": 0.0002, "epoch": 1.1098176173845558, "step": 1430}, {"loss": 1.6166, "grad_norm": 0.3769161105155945, "learning_rate": 0.0002, "epoch": 1.1175785797438882, "step": 1440}, {"loss": 1.7344, "grad_norm": 1.3712416887283325, "learning_rate": 0.0002, "epoch": 1.1253395421032208, "step": 1450}, {"loss": 1.6542, "grad_norm": 0.38406670093536377, "learning_rate": 0.0002, "epoch": 1.1331005044625533, "step": 1460}, {"loss": 1.7104, "grad_norm": 0.3402116000652313, "learning_rate": 0.0002, "epoch": 1.140861466821886, "step": 1470}, {"loss": 1.7074, "grad_norm": 0.341189444065094, "learning_rate": 0.0002, "epoch": 1.1486224291812184, "step": 1480}, {"loss": 1.6468, "grad_norm": 0.36629995703697205, "learning_rate": 0.0002, "epoch": 1.156383391540551, "step": 1490}, {"loss": 1.6952, "grad_norm": 0.3499569296836853, "learning_rate": 0.0002, "epoch": 1.1641443538998835, "step": 1500}, {"loss": 1.6625, "grad_norm": 0.3663063943386078, "learning_rate": 0.0002, "epoch": 1.1719053162592161, "step": 1510}, {"loss": 1.7533, "grad_norm": 0.34851500391960144, "learning_rate": 0.0002, "epoch": 1.1796662786185488, "step": 1520}, {"loss": 1.6092, "grad_norm": 0.35071656107902527, "learning_rate": 0.0002, "epoch": 1.1874272409778812, "step": 1530}, {"loss": 1.7206, "grad_norm": 0.42783796787261963, "learning_rate": 0.0002, "epoch": 1.1951882033372139, "step": 1540}, {"loss": 1.7499, "grad_norm": 0.31830692291259766, "learning_rate": 0.0002, "epoch": 1.2029491656965463, "step": 1550}, {"loss": 1.7372, "grad_norm": 0.3597424626350403, "learning_rate": 0.0002, "epoch": 1.210710128055879, "step": 1560}, {"loss": 1.6386, "grad_norm": 0.35233765840530396, "learning_rate": 0.0002, "epoch": 1.2184710904152114, "step": 1570}, {"loss": 1.6766, "grad_norm": 0.35942912101745605, "learning_rate": 0.0002, "epoch": 1.226232052774544, "step": 1580}, {"loss": 1.6598, "grad_norm": 0.36159393191337585, "learning_rate": 0.0002, "epoch": 1.2339930151338767, "step": 1590}, {"loss": 1.6697, "grad_norm": 0.3328469693660736, "learning_rate": 0.0002, "epoch": 1.2417539774932091, "step": 1600}, {"loss": 1.7594, "grad_norm": 0.3089476525783539, "learning_rate": 0.0002, "epoch": 1.2495149398525418, "step": 1610}, {"loss": 1.6805, "grad_norm": 0.30947765707969666, "learning_rate": 0.0002, "epoch": 1.2572759022118742, "step": 1620}, {"loss": 1.6899, "grad_norm": 0.32154011726379395, "learning_rate": 0.0002, "epoch": 1.265036864571207, "step": 1630}, {"loss": 1.6621, "grad_norm": 0.3480297923088074, "learning_rate": 0.0002, "epoch": 1.2727978269305393, "step": 1640}, {"loss": 1.7087, "grad_norm": 0.39471694827079773, "learning_rate": 0.0002, "epoch": 1.280558789289872, "step": 1650}, {"loss": 1.7608, "grad_norm": 0.35728853940963745, "learning_rate": 0.0002, "epoch": 1.2883197516492044, "step": 1660}, {"loss": 1.7008, "grad_norm": 0.35223081707954407, "learning_rate": 0.0002, "epoch": 1.296080714008537, "step": 1670}, {"loss": 1.7253, "grad_norm": 0.3588867485523224, "learning_rate": 0.0002, "epoch": 1.3038416763678695, "step": 1680}, {"loss": 1.6505, "grad_norm": 0.3528042733669281, "learning_rate": 0.0002, "epoch": 1.3116026387272022, "step": 1690}, {"loss": 1.6945, "grad_norm": 0.35975801944732666, "learning_rate": 0.0002, "epoch": 1.3193636010865348, "step": 1700}, {"loss": 1.6631, "grad_norm": 0.36691880226135254, "learning_rate": 0.0002, "epoch": 1.3271245634458673, "step": 1710}, {"loss": 1.7593, "grad_norm": 0.3787977695465088, "learning_rate": 0.0002, "epoch": 1.3348855258052, "step": 1720}, {"loss": 1.7697, "grad_norm": 0.36614933609962463, "learning_rate": 0.0002, "epoch": 1.3426464881645324, "step": 1730}, {"loss": 1.6487, "grad_norm": 0.3484745919704437, "learning_rate": 0.0002, "epoch": 1.350407450523865, "step": 1740}, {"loss": 1.7054, "grad_norm": 0.36905673146247864, "learning_rate": 0.0002, "epoch": 1.3581684128831975, "step": 1750}, {"loss": 1.7679, "grad_norm": 0.41564738750457764, "learning_rate": 0.0002, "epoch": 1.36592937524253, "step": 1760}, {"loss": 1.6634, "grad_norm": 0.3345205783843994, "learning_rate": 0.0002, "epoch": 1.3736903376018628, "step": 1770}, {"loss": 1.7275, "grad_norm": 0.34926071763038635, "learning_rate": 0.0002, "epoch": 1.3814512999611952, "step": 1780}, {"loss": 1.685, "grad_norm": 0.42004233598709106, "learning_rate": 0.0002, "epoch": 1.3892122623205276, "step": 1790}, {"loss": 1.666, "grad_norm": 0.3576236963272095, "learning_rate": 0.0002, "epoch": 1.3969732246798603, "step": 1800}, {"loss": 1.8516, "grad_norm": 0.3586704432964325, "learning_rate": 0.0002, "epoch": 1.404734187039193, "step": 1810}, {"loss": 1.6171, "grad_norm": 0.3943439722061157, "learning_rate": 0.0002, "epoch": 1.4124951493985254, "step": 1820}, {"loss": 1.6865, "grad_norm": 0.3484877049922943, "learning_rate": 0.0002, "epoch": 1.420256111757858, "step": 1830}, {"loss": 1.7205, "grad_norm": 0.3344518840312958, "learning_rate": 0.0002, "epoch": 1.4280170741171905, "step": 1840}, {"loss": 1.6999, "grad_norm": 0.4345698356628418, "learning_rate": 0.0002, "epoch": 1.4357780364765231, "step": 1850}, {"loss": 1.6855, "grad_norm": 0.5525162220001221, "learning_rate": 0.0002, "epoch": 1.4435389988358556, "step": 1860}, {"loss": 1.7143, "grad_norm": 0.37194496393203735, "learning_rate": 0.0002, "epoch": 1.4512999611951882, "step": 1870}, {"loss": 1.7623, "grad_norm": 0.34570157527923584, "learning_rate": 0.0002, "epoch": 1.4590609235545209, "step": 1880}, {"loss": 1.7, "grad_norm": 0.3512282073497772, "learning_rate": 0.0002, "epoch": 1.4668218859138533, "step": 1890}, {"loss": 1.7225, "grad_norm": 0.3443922996520996, "learning_rate": 0.0002, "epoch": 1.4745828482731858, "step": 1900}, {"loss": 1.7393, "grad_norm": 0.3812018036842346, "learning_rate": 0.0002, "epoch": 1.4823438106325184, "step": 1910}, {"loss": 1.7277, "grad_norm": 0.39263492822647095, "learning_rate": 0.0002, "epoch": 1.490104772991851, "step": 1920}, {"loss": 1.6829, "grad_norm": 0.3146156072616577, "learning_rate": 0.0002, "epoch": 1.4978657353511835, "step": 1930}, {"loss": 1.6881, "grad_norm": 0.3653988540172577, "learning_rate": 0.0002, "epoch": 1.505626697710516, "step": 1940}, {"loss": 1.7064, "grad_norm": 0.3966596722602844, "learning_rate": 0.0002, "epoch": 1.5133876600698488, "step": 1950}, {"loss": 1.6942, "grad_norm": 0.3441697359085083, "learning_rate": 0.0002, "epoch": 1.5211486224291813, "step": 1960}, {"loss": 1.7175, "grad_norm": 0.3328564465045929, "learning_rate": 0.0002, "epoch": 1.5289095847885137, "step": 1970}, {"loss": 1.7394, "grad_norm": 0.34068772196769714, "learning_rate": 0.0002, "epoch": 1.5366705471478463, "step": 1980}, {"loss": 1.7016, "grad_norm": 0.3559795916080475, "learning_rate": 0.0002, "epoch": 1.544431509507179, "step": 1990}, {"loss": 1.7102, "grad_norm": 0.37888768315315247, "learning_rate": 0.0002, "epoch": 1.5521924718665114, "step": 2000}, {"loss": 1.7094, "grad_norm": 0.36128363013267517, "learning_rate": 0.0002, "epoch": 1.5599534342258439, "step": 2010}, {"loss": 1.6407, "grad_norm": 0.3643714487552643, "learning_rate": 0.0002, "epoch": 1.5677143965851765, "step": 2020}, {"loss": 1.6777, "grad_norm": 0.3863612115383148, "learning_rate": 0.0002, "epoch": 1.5754753589445092, "step": 2030}, {"loss": 1.6575, "grad_norm": 0.32831457257270813, "learning_rate": 0.0002, "epoch": 1.5832363213038416, "step": 2040}, {"loss": 1.7404, "grad_norm": 0.36098113656044006, "learning_rate": 0.0002, "epoch": 1.5909972836631743, "step": 2050}, {"loss": 1.7065, "grad_norm": 1.1079334020614624, "learning_rate": 0.0002, "epoch": 1.598758246022507, "step": 2060}, {"loss": 1.6824, "grad_norm": 0.35615381598472595, "learning_rate": 0.0002, "epoch": 1.6065192083818394, "step": 2070}, {"loss": 1.7262, "grad_norm": 0.369711309671402, "learning_rate": 0.0002, "epoch": 1.6142801707411718, "step": 2080}, {"loss": 1.6995, "grad_norm": 0.390658438205719, "learning_rate": 0.0002, "epoch": 1.6220411331005045, "step": 2090}, {"loss": 1.6996, "grad_norm": 0.3422999382019043, "learning_rate": 0.0002, "epoch": 1.6298020954598371, "step": 2100}, {"loss": 1.7135, "grad_norm": 0.372475266456604, "learning_rate": 0.0002, "epoch": 1.6375630578191696, "step": 2110}, {"loss": 1.7216, "grad_norm": 0.35660576820373535, "learning_rate": 0.0002, "epoch": 1.645324020178502, "step": 2120}, {"loss": 1.6991, "grad_norm": 0.35754942893981934, "learning_rate": 0.0002, "epoch": 1.6530849825378346, "step": 2130}, {"loss": 1.6779, "grad_norm": 0.34572410583496094, "learning_rate": 0.0002, "epoch": 1.6608459448971673, "step": 2140}, {"loss": 1.6707, "grad_norm": 0.42059701681137085, "learning_rate": 0.0002, "epoch": 1.6686069072564997, "step": 2150}, {"loss": 1.6782, "grad_norm": 0.35200759768486023, "learning_rate": 0.0002, "epoch": 1.6763678696158324, "step": 2160}, {"loss": 1.6869, "grad_norm": 0.3704029321670532, "learning_rate": 0.0002, "epoch": 1.684128831975165, "step": 2170}, {"loss": 1.7192, "grad_norm": 0.40450501441955566, "learning_rate": 0.0002, "epoch": 1.6918897943344975, "step": 2180}, {"loss": 1.6228, "grad_norm": 0.362966924905777, "learning_rate": 0.0002, "epoch": 1.69965075669383, "step": 2190}, {"loss": 1.6935, "grad_norm": 0.36586204171180725, "learning_rate": 0.0002, "epoch": 1.7074117190531626, "step": 2200}, {"loss": 1.6088, "grad_norm": 0.3295372426509857, "learning_rate": 0.0002, "epoch": 1.7151726814124952, "step": 2210}, {"loss": 1.7844, "grad_norm": 0.3892575800418854, "learning_rate": 0.0002, "epoch": 1.7229336437718277, "step": 2220}, {"loss": 1.7805, "grad_norm": 0.34712135791778564, "learning_rate": 0.0002, "epoch": 1.73069460613116, "step": 2230}, {"loss": 1.7353, "grad_norm": 0.34801796078681946, "learning_rate": 0.0002, "epoch": 1.738455568490493, "step": 2240}, {"loss": 1.7009, "grad_norm": 0.3822397291660309, "learning_rate": 0.0002, "epoch": 1.7462165308498254, "step": 2250}, {"loss": 1.6546, "grad_norm": 0.38933250308036804, "learning_rate": 0.0002, "epoch": 1.7539774932091579, "step": 2260}, {"loss": 1.7245, "grad_norm": 0.3798373341560364, "learning_rate": 0.0002, "epoch": 1.7617384555684905, "step": 2270}, {"loss": 1.6508, "grad_norm": 0.35151317715644836, "learning_rate": 0.0002, "epoch": 1.7694994179278232, "step": 2280}, {"loss": 1.6894, "grad_norm": 0.44981494545936584, "learning_rate": 0.0002, "epoch": 1.7772603802871556, "step": 2290}, {"loss": 1.7271, "grad_norm": 0.3992624580860138, "learning_rate": 0.0002, "epoch": 1.785021342646488, "step": 2300}, {"loss": 1.7252, "grad_norm": 0.3772512376308441, "learning_rate": 0.0002, "epoch": 1.7927823050058207, "step": 2310}, {"loss": 1.7057, "grad_norm": 0.3511589467525482, "learning_rate": 0.0002, "epoch": 1.8005432673651534, "step": 2320}, {"loss": 1.764, "grad_norm": 0.3805285394191742, "learning_rate": 0.0002, "epoch": 1.8083042297244858, "step": 2330}, {"loss": 1.6986, "grad_norm": 0.3792071044445038, "learning_rate": 0.0002, "epoch": 1.8160651920838184, "step": 2340}, {"loss": 1.7759, "grad_norm": 0.36430829763412476, "learning_rate": 0.0002, "epoch": 1.823826154443151, "step": 2350}, {"loss": 1.6773, "grad_norm": 0.36502477526664734, "learning_rate": 0.0002, "epoch": 1.8315871168024835, "step": 2360}, {"loss": 1.8072, "grad_norm": 0.35015153884887695, "learning_rate": 0.0002, "epoch": 1.839348079161816, "step": 2370}, {"loss": 1.7734, "grad_norm": 0.3710903823375702, "learning_rate": 0.0002, "epoch": 1.8471090415211486, "step": 2380}, {"loss": 1.6737, "grad_norm": 0.3542828857898712, "learning_rate": 0.0002, "epoch": 1.8548700038804813, "step": 2390}, {"loss": 1.6783, "grad_norm": 0.35467568039894104, "learning_rate": 0.0002, "epoch": 1.8626309662398137, "step": 2400}, {"loss": 1.7773, "grad_norm": 0.3638560473918915, "learning_rate": 0.0002, "epoch": 1.8703919285991462, "step": 2410}, {"loss": 1.7019, "grad_norm": 0.3823298215866089, "learning_rate": 0.0002, "epoch": 1.8781528909584788, "step": 2420}, {"loss": 1.6935, "grad_norm": 0.3926416337490082, "learning_rate": 0.0002, "epoch": 1.8859138533178115, "step": 2430}, {"loss": 1.71, "grad_norm": 0.3608079254627228, "learning_rate": 0.0002, "epoch": 1.893674815677144, "step": 2440}, {"loss": 1.6654, "grad_norm": 0.3426613509654999, "learning_rate": 0.0002, "epoch": 1.9014357780364766, "step": 2450}, {"loss": 1.6892, "grad_norm": 0.3522338569164276, "learning_rate": 0.0002, "epoch": 1.9091967403958092, "step": 2460}, {"loss": 1.7307, "grad_norm": 0.3608049154281616, "learning_rate": 0.0002, "epoch": 1.9169577027551417, "step": 2470}, {"loss": 1.6823, "grad_norm": 0.3849755525588989, "learning_rate": 0.0002, "epoch": 1.924718665114474, "step": 2480}, {"loss": 1.7518, "grad_norm": 0.4154011011123657, "learning_rate": 0.0002, "epoch": 1.9324796274738067, "step": 2490}, {"loss": 1.7381, "grad_norm": 0.3602796792984009, "learning_rate": 0.0002, "epoch": 1.9402405898331394, "step": 2500}, {"loss": 1.7843, "grad_norm": 0.3702992796897888, "learning_rate": 0.0002, "epoch": 1.9480015521924718, "step": 2510}, {"loss": 1.6669, "grad_norm": 0.3657735288143158, "learning_rate": 0.0002, "epoch": 1.9557625145518043, "step": 2520}, {"loss": 1.5964, "grad_norm": 0.41031739115715027, "learning_rate": 0.0002, "epoch": 1.963523476911137, "step": 2530}, {"loss": 1.6745, "grad_norm": 0.34578680992126465, "learning_rate": 0.0002, "epoch": 1.9712844392704696, "step": 2540}, {"loss": 1.723, "grad_norm": 0.3361521065235138, "learning_rate": 0.0002, "epoch": 1.979045401629802, "step": 2550}, {"loss": 1.6868, "grad_norm": 0.34342363476753235, "learning_rate": 0.0002, "epoch": 1.9868063639891347, "step": 2560}, {"loss": 1.6577, "grad_norm": 0.32954007387161255, "learning_rate": 0.0002, "epoch": 1.9945673263484673, "step": 2570}]} +{"epoch": 2.9996119518820334, "step": 3865, "epoch_duration": 2179.5441098213196, "total_accumulated_duration": 11091.319536447525, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28774.0}, "peak_memory_reserved": {"GPU_0": 28774.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0855, "grad_norm": 1.0751162767410278, "learning_rate": 0.0002, "epoch": 0.007760962359332557, "step": 10}, {"loss": 2.4744, "grad_norm": 0.4697345793247223, "learning_rate": 0.0002, "epoch": 0.015521924718665115, "step": 20}, {"loss": 2.193, "grad_norm": 0.5370839238166809, "learning_rate": 0.0002, "epoch": 0.023282887077997673, "step": 30}, {"loss": 2.0599, "grad_norm": 0.46794816851615906, "learning_rate": 0.0002, "epoch": 0.03104384943733023, "step": 40}, {"loss": 1.9354, "grad_norm": 0.44624820351600647, "learning_rate": 0.0002, "epoch": 0.038804811796662786, "step": 50}, {"loss": 1.9319, "grad_norm": 0.3953201472759247, "learning_rate": 0.0002, "epoch": 0.046565774155995346, "step": 60}, {"loss": 1.9099, "grad_norm": 0.3935912549495697, "learning_rate": 0.0002, "epoch": 0.0543267365153279, "step": 70}, {"loss": 1.8795, "grad_norm": 0.4520{"epoch": 0.9996119518820333, "step": 1288, "epoch_duration": 3340.132976293564, "total_accumulated_duration": 3340.132976293564, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12786.0}, "peak_memory_reserved": {"GPU_0": 12786.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0855, "grad_norm": 1.0751162767410278, "learning_rate": 0.0002, "epoch": 0.007760962359332557, "step": 10}, {"loss": 2.4744, "grad_norm": 0.4697345793247223, "learning_rate": 0.0002, "epoch": 0.015521924718665115, "step": 20}, {"loss": 2.193, "grad_norm": 0.5370839238166809, "learning_rate": 0.0002, "epoch": 0.023282887077997673, "step": 30}, {"loss": 2.0599, "grad_norm": 0.46794816851615906, "learning_rate": 0.0002, "epoch": 0.03104384943733023, "step": 40}, {"loss": 1.9354, "grad_norm": 0.44624820351600647, "learning_rate": 0.0002, "epoch": 0.038804811796662786, "step": 50}, {"loss": 1.9319, "grad_norm": 0.3953201472759247, "learning_rate": 0.0002, "epoch": 0.046565774155995346, "step": 60}, {"loss": 1.9099, "grad_norm": 0.3935912549495697, "learning_rate": 0.0002, "epoch": 0.0543267365153279, "step": 70}, {"loss": 1.8795, "grad_norm": 0.4520699381828308, "learning_rate": 0.0002, "epoch": 0.06208769887466046, "step": 80}, {"loss": 1.8354, "grad_norm": 0.3801847994327545, "learning_rate": 0.0002, "epoch": 0.06984866123399301, "step": 90}, {"loss": 1.9053, "grad_norm": 0.4020165205001831, "learning_rate": 0.0002, "epoch": 0.07760962359332557, "step": 100}, {"loss": 1.8779, "grad_norm": 0.3860672116279602, "learning_rate": 0.0002, "epoch": 0.08537058595265813, "step": 110}, {"loss": 1.8731, "grad_norm": 0.3681113123893738, "learning_rate": 0.0002, "epoch": 0.09313154831199069, "step": 120}, {"loss": 1.8157, "grad_norm": 0.3594866991043091, "learning_rate": 0.0002, "epoch": 0.10089251067132324, "step": 130}, {"loss": 1.8266, "grad_norm": 0.3879193663597107, "learning_rate": 0.0002, "epoch": 0.1086534730306558, "step": 140}, {"loss": 1.8818, "grad_norm": 0.3270505666732788, "learning_rate": 0.0002, "epoch": 0.11641443538998836, "step": 150}, {"loss": 1.87, "grad_norm": 0.36824458837509155, "learning_rate": 0.0002, "epoch": 0.12417539774932092, "step": 160}, {"loss": 1.8305, "grad_norm": 0.383882075548172, "learning_rate": 0.0002, "epoch": 0.13193636010865348, "step": 170}, {"loss": 1.8584, "grad_norm": 0.3368665874004364, "learning_rate": 0.0002, "epoch": 0.13969732246798602, "step": 180}, {"loss": 1.7882, "grad_norm": 0.35961097478866577, "learning_rate": 0.0002, "epoch": 0.1474582848273186, "step": 190}, {"loss": 1.8467, "grad_norm": 0.3415963351726532, "learning_rate": 0.0002, "epoch": 0.15521924718665114, "step": 200}, {"loss": 1.8543, "grad_norm": 0.4100632071495056, "learning_rate": 0.0002, "epoch": 0.1629802095459837, "step": 210}, {"loss": 1.8226, "grad_norm": 0.3516307473182678, "learning_rate": 0.0002, "epoch": 0.17074117190531626, "step": 220}, {"loss": 1.7386, "grad_norm": 0.37919050455093384, "learning_rate": 0.0002, "epoch": 0.1785021342646488, "step": 230}, {"loss": 1.7937, "grad_norm": 0.33270683884620667, "learning_rate": 0.0002, "epoch": 0.18626309662398138, "step": 240}, {"loss": 1.7925, "grad_norm": 0.3348783254623413, "learning_rate": 0.0002, "epoch": 0.19402405898331393, "step": 250}, {"loss": 1.7774, "grad_norm": 0.3888475298881531, "learning_rate": 0.0002, "epoch": 0.20178502134264648, "step": 260}, {"loss": 1.8381, "grad_norm": 0.3554602861404419, "learning_rate": 0.0002, "epoch": 0.20954598370197905, "step": 270}, {"loss": 1.8359, "grad_norm": 0.33277708292007446, "learning_rate": 0.0002, "epoch": 0.2173069460613116, "step": 280}, {"loss": 1.7713, "grad_norm": 0.3281584680080414, "learning_rate": 0.0002, "epoch": 0.22506790842064417, "step": 290}, {"loss": 1.8181, "grad_norm": 0.3185969591140747, "learning_rate": 0.0002, "epoch": 0.23282887077997672, "step": 300}, {"loss": 1.8595, "grad_norm": 0.35335442423820496, "learning_rate": 0.0002, "epoch": 0.24058983313930926, "step": 310}, {"loss": 1.87, "grad_norm": 0.3119595944881439, "learning_rate": 0.0002, "epoch": 0.24835079549864184, "step": 320}, {"loss": 1.8357, "grad_norm": 0.36424458026885986, "learning_rate": 0.0002, "epoch": 0.2561117578579744, "step": 330}, {"loss": 1.8003, "grad_norm": 0.3618951141834259, "learning_rate": 0.0002, "epoch": 0.26387272021730696, "step": 340}, {"loss": 1.8221, "grad_norm": 0.312757670879364, "learning_rate": 0.0002, "epoch": 0.2716336825766395, "step": 350}, {"loss": 1.9031, "grad_norm": 0.326016366481781, "learning_rate": 0.0002, "epoch": 0.27939464493597205, "step": 360}, {"loss": 1.8214, "grad_norm": 0.34093883633613586, "learning_rate": 0.0002, "epoch": 0.2871556072953046, "step": 370}, {"loss": 1.7733, "grad_norm": 0.32325029373168945, "learning_rate": 0.0002, "epoch": 0.2949165696546372, "step": 380}, {"loss": 1.842, "grad_norm": 0.34105437994003296, "learning_rate": 0.0002, "epoch": 0.30267753201396974, "step": 390}, {"loss": 1.7926, "grad_norm": 0.32565295696258545, "learning_rate": 0.0002, "epoch": 0.3104384943733023, "step": 400}, {"loss": 1.8031, "grad_norm": 0.32742050290107727, "learning_rate": 0.0002, "epoch": 0.31819945673263483, "step": 410}, {"loss": 1.907, "grad_norm": 0.30233046412467957, "learning_rate": 0.0002, "epoch": 0.3259604190919674, "step": 420}, {"loss": 1.7623, "grad_norm": 0.32419222593307495, "learning_rate": 0.0002, "epoch": 0.3337213814513, "step": 430}, {"loss": 1.865, "grad_norm": 0.3653007745742798, "learning_rate": 0.0002, "epoch": 0.3414823438106325, "step": 440}, {"loss": 1.8044, "grad_norm": 0.31617099046707153, "learning_rate": 0.0002, "epoch": 0.3492433061699651, "step": 450}, {"loss": 1.7677, "grad_norm": 0.3305962085723877, "learning_rate": 0.0002, "epoch": 0.3570042685292976, "step": 460}, {"loss": 1.8155, "grad_norm": 0.3178933262825012, "learning_rate": 0.0002, "epoch": 0.36476523088863017, "step": 470}, {"loss": 1.7485, "grad_norm": 0.37163782119750977, "learning_rate": 0.0002, "epoch": 0.37252619324796277, "step": 480}, {"loss": 1.8804, "grad_norm": 0.469844788312912, "learning_rate": 0.0002, "epoch": 0.3802871556072953, "step": 490}, {"loss": 1.8343, "grad_norm": 0.3409338593482971, "learning_rate": 0.0002, "epoch": 0.38804811796662786, "step": 500}, {"loss": 1.8433, "grad_norm": 0.31943467259407043, "learning_rate": 0.0002, "epoch": 0.3958090803259604, "step": 510}, {"loss": 1.7873, "grad_norm": 0.32293614745140076, "learning_rate": 0.0002, "epoch": 0.40357004268529295, "step": 520}, {"loss": 1.8584, "grad_norm": 0.2994382977485657, "learning_rate": 0.0002, "epoch": 0.41133100504462555, "step": 530}, {"loss": 1.8153, "grad_norm": 0.3273141384124756, "learning_rate": 0.0002, "epoch": 0.4190919674039581, "step": 540}, {"loss": 1.8097, "grad_norm": 0.3020550012588501, "learning_rate": 0.0002, "epoch": 0.42685292976329064, "step": 550}, {"loss": 1.8551, "grad_norm": 0.30113112926483154, "learning_rate": 0.0002, "epoch": 0.4346138921226232, "step": 560}, {"loss": 1.8084, "grad_norm": 0.30274903774261475, "learning_rate": 0.0002, "epoch": 0.44237485448195574, "step": 570}, {"loss": 1.7673, "grad_norm": 0.3231128454208374, "learning_rate": 0.0002, "epoch": 0.45013581684128834, "step": 580}, {"loss": 1.7848, "grad_norm": 0.3255121409893036, "learning_rate": 0.0002, "epoch": 0.4578967792006209, "step": 590}, {"loss": 1.8227, "grad_norm": 0.30147507786750793, "learning_rate": 0.0002, "epoch": 0.46565774155995343, "step": 600}, {"loss": 1.7572, "grad_norm": 0.29781386256217957, "learning_rate": 0.0002, "epoch": 0.473418703919286, "step": 610}, {"loss": 1.8307, "grad_norm": 0.30914685130119324, "learning_rate": 0.0002, "epoch": 0.4811796662786185, "step": 620}, {"loss": 1.805, "grad_norm": 0.3110593855381012, "learning_rate": 0.0002, "epoch": 0.4889406286379511, "step": 630}, {"loss": 1.8228, "grad_norm": 0.3298132121562958, "learning_rate": 0.0002, "epoch": 0.49670159099728367, "step": 640}, {"loss": 1.7816, "grad_norm": 0.322122186422348, "learning_rate": 0.0002, "epoch": 0.5044625533566163, "step": 650}, {"loss": 1.8001, "grad_norm": 0.3504371643066406, "learning_rate": 0.0002, "epoch": 0.5122235157159488, "step": 660}, {"loss": 1.8682, "grad_norm": 0.3102182149887085, "learning_rate": 0.0002, "epoch": 0.5199844780752814, "step": 670}, {"loss": 1.7494, "grad_norm": 0.6113658547401428, "learning_rate": 0.0002, "epoch": 0.5277454404346139, "step": 680}, {"loss": 1.7096, "grad_norm": 0.31841862201690674, "learning_rate": 0.0002, "epoch": 0.5355064027939465, "step": 690}, {"loss": 1.7587, "grad_norm": 0.2830526530742645, "learning_rate": 0.0002, "epoch": 0.543267365153279, "step": 700}, {"loss": 1.7887, "grad_norm": 0.3048769533634186, "learning_rate": 0.0002, "epoch": 0.5510283275126115, "step": 710}, {"loss": 1.8416, "grad_norm": 0.2719033658504486, "learning_rate": 0.0002, "epoch": 0.5587892898719441, "step": 720}, {"loss": 1.786, "grad_norm": 0.3176722526550293, "learning_rate": 0.0002, "epoch": 0.5665502522312766, "step": 730}, {"loss": 1.7127, "grad_norm": 0.32491734623908997, "learning_rate": 0.0002, "epoch": 0.5743112145906092, "step": 740}, {"loss": 1.7892, "grad_norm": 0.32746851444244385, "learning_rate": 0.0002, "epoch": 0.5820721769499418, "step": 750}, {"loss": 1.7811, "grad_norm": 0.3055773973464966, "learning_rate": 0.0002, "epoch": 0.5898331393092744, "step": 760}, {"loss": 1.8597, "grad_norm": 0.30671584606170654, "learning_rate": 0.0002, "epoch": 0.5975941016686069, "step": 770}, {"loss": 1.7728, "grad_norm": 0.28770264983177185, "learning_rate": 0.0002, "epoch": 0.6053550640279395, "step": 780}, {"loss": 1.7025, "grad_norm": 0.2814285457134247, "learning_rate": 0.0002, "epoch": 0.613116026387272, "step": 790}, {"loss": 1.819, "grad_norm": 0.31554412841796875, "learning_rate": 0.0002, "epoch": 0.6208769887466046, "step": 800}, {"loss": 1.8335, "grad_norm": 0.2984226942062378, "learning_rate": 0.0002, "epoch": 0.6286379511059371, "step": 810}, {"loss": 1.7728, "grad_norm": 0.2859906554222107, "learning_rate": 0.0002, "epoch": 0.6363989134652697, "step": 820}, {"loss": 1.7408, "grad_norm": 0.2887928783893585, "learning_rate": 0.0002, "epoch": 0.6441598758246022, "step": 830}, {"loss": 1.8071, "grad_norm": 0.31287339329719543, "learning_rate": 0.0002, "epoch": 0.6519208381839348, "step": 840}, {"loss": 1.7673, "grad_norm": 0.32064181566238403, "learning_rate": 0.0002, "epoch": 0.6596818005432674, "step": 850}, {"loss": 1.7849, "grad_norm": 0.290981650352478, "learning_rate": 0.0002, "epoch": 0.6674427629026, "step": 860}, {"loss": 1.8089, "grad_norm": 0.33060121536254883, "learning_rate": 0.0002, "epoch": 0.6752037252619325, "step": 870}, {"loss": 1.7357, "grad_norm": 0.27032899856567383, "learning_rate": 0.0002, "epoch": 0.682964687621265, "step": 880}, {"loss": 1.8423, "grad_norm": 0.29031234979629517, "learning_rate": 0.0002, "epoch": 0.6907256499805976, "step": 890}, {"loss": 1.835, "grad_norm": 0.2845142185688019, "learning_rate": 0.0002, "epoch": 0.6984866123399301, "step": 900}, {"loss": 1.77, "grad_norm": 0.8638312816619873, "learning_rate": 0.0002, "epoch": 0.7062475746992627, "step": 910}, {"loss": 1.7757, "grad_norm": 0.3086668848991394, "learning_rate": 0.0002, "epoch": 0.7140085370585952, "step": 920}, {"loss": 1.7541, "grad_norm": 0.2724177837371826, "learning_rate": 0.0002, "epoch": 0.7217694994179278, "step": 930}, {"loss": 1.816, "grad_norm": 0.289559006690979, "learning_rate": 0.0002, "epoch": 0.7295304617772603, "step": 940}, {"loss": 1.7654, "grad_norm": 0.3000658452510834, "learning_rate": 0.0002, "epoch": 0.737291424136593, "step": 950}, {"loss": 1.7736, "grad_norm": 0.33544042706489563, "learning_rate": 0.0002, "epoch": 0.7450523864959255, "step": 960}, {"loss": 1.6979, "grad_norm": 0.28593236207962036, "learning_rate": 0.0002, "epoch": 0.7528133488552581, "step": 970}, {"loss": 1.8583, "grad_norm": 0.313634991645813, "learning_rate": 0.0002, "epoch": 0.7605743112145906, "step": 980}, {"loss": 1.7473, "grad_norm": 0.2949385941028595, "learning_rate": 0.0002, "epoch": 0.7683352735739232, "step": 990}, {"loss": 1.8689, "grad_norm": 0.2920108437538147, "learning_rate": 0.0002, "epoch": 0.7760962359332557, "step": 1000}, {"loss": 1.8401, "grad_norm": 0.3245100677013397, "learning_rate": 0.0002, "epoch": 0.7838571982925883, "step": 1010}, {"loss": 1.7109, "grad_norm": 0.3007619380950928, "learning_rate": 0.0002, "epoch": 0.7916181606519208, "step": 1020}, {"loss": 1.7427, "grad_norm": 0.3630852997303009, "learning_rate": 0.0002, "epoch": 0.7993791230112534, "step": 1030}, {"loss": 1.7655, "grad_norm": 0.2856379747390747, "learning_rate": 0.0002, "epoch": 0.8071400853705859, "step": 1040}, {"loss": 1.8371, "grad_norm": 0.32476478815078735, "learning_rate": 0.0002, "epoch": 0.8149010477299186, "step": 1050}, {"loss": 1.8039, "grad_norm": 0.5162565112113953, "learning_rate": 0.0002, "epoch": 0.8226620100892511, "step": 1060}, {"loss": 1.8862, "grad_norm": 0.316496342420578, "learning_rate": 0.0002, "epoch": 0.8304229724485837, "step": 1070}, {"loss": 1.8023, "grad_norm": 0.31977516412734985, "learning_rate": 0.0002, "epoch": 0.8381839348079162, "step": 1080}, {"loss": 1.8547, "grad_norm": 0.269509494304657, "learning_rate": 0.0002, "epoch": 0.8459448971672487, "step": 1090}, {"loss": 1.7811, "grad_norm": 0.31621453166007996, "learning_rate": 0.0002, "epoch": 0.8537058595265813, "step": 1100}, {"loss": 1.739, "grad_norm": 0.2946535050868988, "learning_rate": 0.0002, "epoch": 0.8614668218859138, "step": 1110}, {"loss": 1.7511, "grad_norm": 0.3088909983634949, "learning_rate": 0.0002, "epoch": 0.8692277842452464, "step": 1120}, {"loss": 1.8228, "grad_norm": 0.33033716678619385, "learning_rate": 0.0002, "epoch": 0.8769887466045789, "step": 1130}, {"loss": 1.7912, "grad_norm": 0.2954833507537842, "learning_rate": 0.0002, "epoch": 0.8847497089639115, "step": 1140}, {"loss": 1.8394, "grad_norm": 0.2950248122215271, "learning_rate": 0.0002, "epoch": 0.8925106713232441, "step": 1150}, {"loss": 1.7068, "grad_norm": 0.296661913394928, "learning_rate": 0.0002, "epoch": 0.9002716336825767, "step": 1160}, {"loss": 1.7967, "grad_norm": 0.35451310873031616, "learning_rate": 0.0002, "epoch": 0.9080325960419092, "step": 1170}, {"loss": 1.8202, "grad_norm": 0.32705947756767273, "learning_rate": 0.0002, "epoch": 0.9157935584012418, "step": 1180}, {"loss": 1.7396, "grad_norm": 0.3333960771560669, "learning_rate": 0.0002, "epoch": 0.9235545207605743, "step": 1190}, {"loss": 1.7801, "grad_norm": 0.3042232096195221, "learning_rate": 0.0002, "epoch": 0.9313154831199069, "step": 1200}, {"loss": 1.7586, "grad_norm": 0.281553715467453, "learning_rate": 0.0002, "epoch": 0.9390764454792394, "step": 1210}, {"loss": 1.7953, "grad_norm": 0.3096391558647156, "learning_rate": 0.0002, "epoch": 0.946837407838572, "step": 1220}, {"loss": 1.7401, "grad_norm": 0.2866271734237671, "learning_rate": 0.0002, "epoch": 0.9545983701979045, "step": 1230}, {"loss": 1.7211, "grad_norm": 0.28394097089767456, "learning_rate": 0.0002, "epoch": 0.962359332557237, "step": 1240}, {"loss": 1.7363, "grad_norm": 0.3249266743659973, "learning_rate": 0.0002, "epoch": 0.9701202949165697, "step": 1250}, {"loss": 1.7563, "grad_norm": 0.2896869480609894, "learning_rate": 0.0002, "epoch": 0.9778812572759022, "step": 1260}, {"loss": 1.6389, "grad_norm": 0.29224586486816406, "learning_rate": 0.0002, "epoch": 0.9856422196352348, "step": 1270}, {"loss": 1.7111, "grad_norm": 0.2820223569869995, "learning_rate": 0.0002, "epoch": 0.9934031819945673, "step": 1280}]} +{"epoch": 2.0, "step": 2577, "epoch_duration": 3629.9410111904144, "total_accumulated_duration": 6970.073987483978, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28774.0}, "peak_memory_reserved": {"GPU_0": 28774.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-1288", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0855, "grad_norm": 1.0751162767410278, "learning_rate": 0.0002, "epoch": 0.007760962359332557, "step": 10}, {"loss": 2.4744, "grad_norm": 0.4697345793247223, "learning_rate": 0.0002, "epoch": 0.015521924718665115, "step": 20}, {"loss": 2.193, "grad_norm": 0.5370839238166809, "learning_rate": 0.0002, "epoch": 0.023282887077997673, "step": 30}, {"loss": 2.0599, "grad_norm": 0.46794816851615906, "learning_rate": 0.0002, "epoch": 0.03104384943733023, "step": 40}, {"loss": 1.9354, "grad_norm": 0.44624820351600647, "learning_rate": 0.0002, "epoch": 0.038804811796662786, "step": 50}, {"loss": 1.9319, "grad_norm": 0.3953201472759247, "learning_rate": 0.0002, "epoch": 0.046565774155995346, "step": 60}, {"loss": 1.9099, "grad_norm": 0.3935912549495697, "learning_rate": 0.0002, "epoch": 0.0543267365153279, "step": 70}, {"loss": 1.8795, "grad_norm": 0.4520699381828308, "learning_rate": 0.0002, "epoch": 0.06208769887466046, "step": 80}, {"loss": 1.8354, "grad_norm": 0.3801847994327545, "learning_rate": 0.0002, "epoch": 0.06984866123399301, "step": 90}, {"loss": 1.9053, "grad_norm": 0.4020165205001831, "learning_rate": 0.0002, "epoch": 0.07760962359332557, "step": 100}, {"loss": 1.8779, "grad_norm": 0.3860672116279602, "learning_rate": 0.0002, "epoch": 0.08537058595265813, "step": 110}, {"loss": 1.8731, "grad_norm": 0.3681113123893738, "learning_rate": 0.0002, "epoch": 0.09313154831199069, "step": 120}, {"loss": 1.8157, "grad_norm": 0.3594866991043091, "learning_rate": 0.0002, "epoch": 0.10089251067132324, "step": 130}, {"loss": 1.8266, "grad_norm": 0.3879193663597107, "learning_rate": 0.0002, "epoch": 0.1086534730306558, "step": 140}, {"loss": 1.8818, "grad_norm": 0.3270505666732788, "learning_rate": 0.0002, "epoch": 0.11641443538998836, "step": 150}, {"loss": 1.87, "grad_norm": 0.36824458837509155, "learning_rate": 0.0002, "epoch": 0.12417539774932092, "step": 160}, {"loss": 1.8305, "grad_norm": 0.383882075548172, "learning_rate": 0.0002, "epoch": 0.13193636010865348, "step": 170}, {"loss": 1.8584, "grad_norm": 0.3368665874004364, "learning_rate": 0.0002, "epoch": 0.13969732246798602, "step": 180}, {"loss": 1.7882, "grad_norm": 0.35961097478866577, "learning_rate": 0.0002, "epoch": 0.1474582848273186, "step": 190}, {"loss": 1.8467, "grad_norm": 0.3415963351726532, "learning_rate": 0.0002, "epoch": 0.15521924718665114, "step": 200}, {"loss": 1.8543, "grad_norm": 0.4100632071495056, "learning_rate": 0.0002, "epoch": 0.1629802095459837, "step": 210}, {"loss": 1.8226, "grad_norm": 0.3516307473182678, "learning_rate": 0.0002, "epoch": 0.17074117190531626, "step": 220}, {"loss": 1.7386, "grad_norm": 0.37919050455093384, "learning_rate": 0.0002, "epoch": 0.1785021342646488, "step": 230}, {"loss": 1.7937, "grad_norm": 0.33270683884620667, "learning_rate": 0.0002, "epoch": 0.18626309662398138, "step": 240}, {"loss": 1.7925, "grad_norm": 0.3348783254623413, "learning_rate": 0.0002, "epoch": 0.19402405898331393, "step": 250}, {"loss": 1.7774, "grad_norm": 0.3888475298881531, "learning_rate": 0.0002, "epoch": 0.20178502134264648, "step": 260}, {"loss": 1.8381, "grad_norm": 0.3554602861404419, "learning_rate": 0.0002, "epoch": 0.20954598370197905, "step": 270}, {"loss": 1.8359, "grad_norm": 0.33277708292007446, "learning_rate": 0.0002, "epoch": 0.2173069460613116, "step": 280}, {"loss": 1.7713, "grad_norm": 0.3281584680080414, "learning_rate": 0.0002, "epoch": 0.22506790842064417, "step": 290}, {"loss": 1.8181, "grad_norm": 0.3185969591140747, "learning_rate": 0.0002, "epoch": 0.23282887077997672, "step": 300}, {"loss": 1.8595, "grad_norm": 0.35335442423820496, "learning_rate": 0.0002, "epoch": 0.24058983313930926, "step": 310}, {"loss": 1.87, "grad_norm": 0.3119595944881439, "learning_rate": 0.0002, "epoch": 0.24835079549864184, "step": 320}, {"loss": 1.8357, "grad_norm": 0.36424458026885986, "learning_rate": 0.0002, "epoch": 0.2561117578579744, "step": 330}, {"loss": 1.8003, "grad_norm": 0.3618951141834259, "learning_rate": 0.0002, "epoch": 0.26387272021730696, "step": 340}, {"loss": 1.8221, "grad_norm": 0.312757670879364, "learning_rate": 0.0002, "epoch": 0.2716336825766395, "step": 350}, {"loss": 1.9031, "grad_norm": 0.326016366481781, "learning_rate": 0.0002, "epoch": 0.27939464493597205, "step": 360}, {"loss": 1.8214, "grad_norm": 0.34093883633613586, "learning_rate": 0.0002, "epoch": 0.2871556072953046, "step": 370}, {"loss": 1.7733, "grad_norm": 0.32325029373168945, "learning_rate": 0.0002, "epoch": 0.2949165696546372, "step": 380}, {"loss": 1.842, "grad_norm": 0.34105437994003296, "learning_rate": 0.0002, "epoch": 0.30267753201396974, "step": 390}, {"loss": 1.7926, "grad_norm": 0.32565295696258545, "learning_rate": 0.0002, "epoch": 0.3104384943733023, "step": 400}, {"loss": 1.8031, "grad_norm": 0.32742050290107727, "learning_rate": 0.0002, "epoch": 0.31819945673263483, "step": 410}, {"loss": 1.907, "grad_norm": 0.30233046412467957, "learning_rate": 0.0002, "epoch": 0.3259604190919674, "step": 420}, {"loss": 1.7623, "grad_norm": 0.32419222593307495, "learning_rate": 0.0002, "epoch": 0.3337213814513, "step": 430}, {"loss": 1.865, "grad_norm": 0.3653007745742798, "learning_rate": 0.0002, "epoch": 0.3414823438106325, "step": 440}, {"loss": 1.8044, "grad_norm": 0.31617099046707153, "learning_rate": 0.0002, "epoch": 0.3492433061699651, "step": 450}, {"loss": 1.7677, "grad_norm": 0.3305962085723877, "learning_rate": 0.0002, "epoch": 0.3570042685292976, "step": 460}, {"loss": 1.8155, "grad_norm": 0.3178933262825012, "learning_rate": 0.0002, "epoch": 0.36476523088863017, "step": 470}, {"loss": 1.7485, "grad_norm": 0.37163782119750977, "learning_rate": 0.0002, "epoch": 0.37252619324796277, "step": 480}, {"loss": 1.8804, "grad_norm": 0.469844788312912, "learning_rate": 0.0002, "epoch": 0.3802871556072953, "step": 490}, {"loss": 1.8343, "grad_norm": 0.3409338593482971, "learning_rate": 0.0002, "epoch": 0.38804811796662786, "step": 500}, {"loss": 1.8433, "grad_norm": 0.31943467259407043, "learning_rate": 0.0002, "epoch": 0.3958090803259604, "step": 510}, {"loss": 1.7873, "grad_norm": 0.32293614745140076, "learning_rate": 0.0002, "epoch": 0.40357004268529295, "step": 520}, {"loss": 1.8584, "grad_norm": 0.2994382977485657, "learning_rate": 0.0002, "epoch": 0.41133100504462555, "step": 530}, {"loss": 1.8153, "grad_norm": 0.3273141384124756, "learning_rate": 0.0002, "epoch": 0.4190919674039581, "step": 540}, {"loss": 1.8097, "grad_norm": 0.3020550012588501, "learning_rate": 0.0002, "epoch": 0.42685292976329064, "step": 550}, {"loss": 1.8551, "grad_norm": 0.30113112926483154, "learning_rate": 0.0002, "epoch": 0.4346138921226232, "step": 560}, {"loss": 1.8084, "grad_norm": 0.30274903774261475, "learning_rate": 0.0002, "epoch": 0.44237485448195574, "step": 570}, {"loss": 1.7673, "grad_norm": 0.3231128454208374, "learning_rate": 0.0002, "epoch": 0.45013581684128834, "step": 580}, {"loss": 1.7848, "grad_norm": 0.3255121409893036, "learning_rate": 0.0002, "epoch": 0.4578967792006209, "step": 590}, {"loss": 1.8227, "grad_norm": 0.30147507786750793, "learning_rate": 0.0002, "epoch": 0.46565774155995343, "step": 600}, {"loss": 1.7572, "grad_norm": 0.29781386256217957, "learning_rate": 0.0002, "epoch": 0.473418703919286, "step": 610}, {"loss": 1.8307, "grad_norm": 0.30914685130119324, "learning_rate": 0.0002, "epoch": 0.4811796662786185, "step": 620}, {"loss": 1.805, "grad_norm": 0.3110593855381012, "learning_rate": 0.0002, "epoch": 0.4889406286379511, "step": 630}, {"loss": 1.8228, "grad_norm": 0.3298132121562958, "learning_rate": 0.0002, "epoch": 0.49670159099728367, "step": 640}, {"loss": 1.7816, "grad_norm": 0.322122186422348, "learning_rate": 0.0002, "epoch": 0.5044625533566163, "step": 650}, {"loss": 1.8001, "grad_norm": 0.3504371643066406, "learning_rate": 0.0002, "epoch": 0.5122235157159488, "step": 660}, {"loss": 1.8682, "grad_norm": 0.3102182149887085, "learning_rate": 0.0002, "epoch": 0.5199844780752814, "step": 670}, {"loss": 1.7494, "grad_norm": 0.6113658547401428, "learning_rate": 0.0002, "epoch": 0.5277454404346139, "step": 680}, {"loss": 1.7096, "grad_norm": 0.31841862201690674, "learning_rate": 0.0002, "epoch": 0.5355064027939465, "step": 690}, {"loss": 1.7587, "grad_norm": 0.2830526530742645, "learning_rate": 0.0002, "epoch": 0.543267365153279, "step": 700}, {"loss": 1.7887, "grad_norm": 0.3048769533634186, "learning_rate": 0.0002, "epoch": 0.5510283275126115, "step": 710}, {"loss": 1.8416, "grad_norm": 0.2719033658504486, "learning_rate": 0.0002, "epoch": 0.5587892898719441, "step": 720}, {"loss": 1.786, "grad_norm": 0.3176722526550293, "learning_rate": 0.0002, "epoch": 0.5665502522312766, "step": 730}, {"loss": 1.7127, "grad_norm": 0.32491734623908997, "learning_rate": 0.0002, "epoch": 0.5743112145906092, "step": 740}, {"loss": 1.7892, "grad_norm": 0.32746851444244385, "learning_rate": 0.0002, "epoch": 0.5820721769499418, "step": 750}, {"loss": 1.7811, "grad_norm": 0.3055773973464966, "learning_rate": 0.0002, "epoch": 0.5898331393092744, "step": 760}, {"loss": 1.8597, "grad_norm": 0.30671584606170654, "learning_rate": 0.0002, "epoch": 0.5975941016686069, "step": 770}, {"loss": 1.7728, "grad_norm": 0.28770264983177185, "learning_rate": 0.0002, "epoch": 0.6053550640279395, "step": 780}, {"loss": 1.7025, "grad_norm": 0.2814285457134247, "learning_rate": 0.0002, "epoch": 0.613116026387272, "step": 790}, {"loss": 1.819, "grad_norm": 0.31554412841796875, "learning_rate": 0.0002, "epoch": 0.6208769887466046, "step": 800}, {"loss": 1.8335, "grad_norm": 0.2984226942062378, "learning_rate": 0.0002, "epoch": 0.6286379511059371, "step": 810}, {"loss": 1.7728, "grad_norm": 0.2859906554222107, "learning_rate": 0.0002, "epoch": 0.6363989134652697, "step": 820}, {"loss": 1.7408, "grad_norm": 0.2887928783893585, "learning_rate": 0.0002, "epoch": 0.6441598758246022, "step": 830}, {"loss": 1.8071, "grad_norm": 0.31287339329719543, "learning_rate": 0.0002, "epoch": 0.6519208381839348, "step": 840}, {"loss": 1.7673, "grad_norm": 0.32064181566238403, "learning_rate": 0.0002, "epoch": 0.6596818005432674, "step": 850}, {"loss": 1.7849, "grad_norm": 0.290981650352478, "learning_rate": 0.0002, "epoch": 0.6674427629026, "step": 860}, {"loss": 1.8089, "grad_norm": 0.33060121536254883, "learning_rate": 0.0002, "epoch": 0.6752037252619325, "step": 870}, {"loss": 1.7357, "grad_norm": 0.27032899856567383, "learning_rate": 0.0002, "epoch": 0.682964687621265, "step": 880}, {"loss": 1.8423, "grad_norm": 0.29031234979629517, "learning_rate": 0.0002, "epoch": 0.6907256499805976, "step": 890}, {"loss": 1.835, "grad_norm": 0.2845142185688019, "learning_rate": 0.0002, "epoch": 0.6984866123399301, "step": 900}, {"loss": 1.77, "grad_norm": 0.8638312816619873, "learning_rate": 0.0002, "epoch": 0.7062475746992627, "step": 910}, {"loss": 1.7757, "grad_norm": 0.3086668848991394, "learning_rate": 0.0002, "epoch": 0.7140085370585952, "step": 920}, {"loss": 1.7541, "grad_norm": 0.2724177837371826, "learning_rate": 0.0002, "epoch": 0.7217694994179278, "step": 930}, {"loss": 1.816, "grad_norm": 0.289559006690979, "learning_rate": 0.0002, "epoch": 0.7295304617772603, "step": 940}, {"loss": 1.7654, "grad_norm": 0.3000658452510834, "learning_rate": 0.0002, "epoch": 0.737291424136593, "step": 950}, {"loss": 1.7736, "grad_norm": 0.33544042706489563, "learning_rate": 0.0002, "epoch": 0.7450523864959255, "step": 960}, {"loss": 1.6979, "grad_norm": 0.28593236207962036, "learning_rate": 0.0002, "epoch": 0.7528133488552581, "step": 970}, {"loss": 1.8583, "grad_norm": 0.313634991645813, "learning_rate": 0.0002, "epoch": 0.7605743112145906, "step": 980}, {"loss": 1.7473, "grad_norm": 0.2949385941028595, "learning_rate": 0.0002, "epoch": 0.7683352735739232, "step": 990}, {"loss": 1.8689, "grad_norm": 0.2920108437538147, "learning_rate": 0.0002, "epoch": 0.7760962359332557, "step": 1000}, {"loss": 1.8401, "grad_norm": 0.3245100677013397, "learning_rate": 0.0002, "epoch": 0.7838571982925883, "step": 1010}, {"loss": 1.7109, "grad_norm": 0.3007619380950928, "learning_rate": 0.0002, "epoch": 0.7916181606519208, "step": 1020}, {"loss": 1.7427, "grad_norm": 0.3630852997303009, "learning_rate": 0.0002, "epoch": 0.7993791230112534, "step": 1030}, {"loss": 1.7655, "grad_norm": 0.2856379747390747, "learning_rate": 0.0002, "epoch": 0.8071400853705859, "step": 1040}, {"loss": 1.8371, "grad_norm": 0.32476478815078735, "learning_rate": 0.0002, "epoch": 0.8149010477299186, "step": 1050}, {"loss": 1.8039, "grad_norm": 0.5162565112113953, "learning_rate": 0.0002, "epoch": 0.8226620100892511, "step": 1060}, {"loss": 1.8862, "grad_norm": 0.316496342420578, "learning_rate": 0.0002, "epoch": 0.8304229724485837, "step": 1070}, {"loss": 1.8023, "grad_norm": 0.31977516412734985, "learning_rate": 0.0002, "epoch": 0.8381839348079162, "step": 1080}, {"loss": 1.8547, "grad_norm": 0.269509494304657, "learning_rate": 0.0002, "epoch": 0.8459448971672487, "step": 1090}, {"loss": 1.7811, "grad_norm": 0.31621453166007996, "learning_rate": 0.0002, "epoch": 0.8537058595265813, "step": 1100}, {"loss": 1.739, "grad_norm": 0.2946535050868988, "learning_rate": 0.0002, "epoch": 0.8614668218859138, "step": 1110}, {"loss": 1.7511, "grad_norm": 0.3088909983634949, "learning_rate": 0.0002, "epoch": 0.8692277842452464, "step": 1120}, {"loss": 1.8228, "grad_norm": 0.33033716678619385, "learning_rate": 0.0002, "epoch": 0.8769887466045789, "step": 1130}, {"loss": 1.7912, "grad_norm": 0.2954833507537842, "learning_rate": 0.0002, "epoch": 0.8847497089639115, "step": 1140}, {"loss": 1.8394, "grad_norm": 0.2950248122215271, "learning_rate": 0.0002, "epoch": 0.8925106713232441, "step": 1150}, {"loss": 1.7068, "grad_norm": 0.296661913394928, "learning_rate": 0.0002, "epoch": 0.9002716336825767, "step": 1160}, {"loss": 1.7967, "grad_norm": 0.35451310873031616, "learning_rate": 0.0002, "epoch": 0.9080325960419092, "step": 1170}, {"loss": 1.8202, "grad_norm": 0.32705947756767273, "learning_rate": 0.0002, "epoch": 0.9157935584012418, "step": 1180}, {"loss": 1.7396, "grad_norm": 0.3333960771560669, "learning_rate": 0.0002, "epoch": 0.9235545207605743, "step": 1190}, {"loss": 1.7801, "grad_norm": 0.3042232096195221, "learning_rate": 0.0002, "epoch": 0.9313154831199069, "step": 1200}, {"loss": 1.7586, "grad_norm": 0.281553715467453, "learning_rate": 0.0002, "epoch": 0.9390764454792394, "step": 1210}, {"loss": 1.7953, "grad_norm": 0.3096391558647156, "learning_rate": 0.0002, "epoch": 0.946837407838572, "step": 1220}, {"loss": 1.7401, "grad_norm": 0.2866271734237671, "learning_rate": 0.0002, "epoch": 0.9545983701979045, "step": 1230}, {"loss": 1.7211, "grad_norm": 0.28394097089767456, "learning_rate": 0.0002, "epoch": 0.962359332557237, "step": 1240}, {"loss": 1.7363, "grad_norm": 0.3249266743659973, "learning_rate": 0.0002, "epoch": 0.9701202949165697, "step": 1250}, {"loss": 1.7563, "grad_norm": 0.2896869480609894, "learning_rate": 0.0002, "epoch": 0.9778812572759022, "step": 1260}, {"loss": 1.6389, "grad_norm": 0.29224586486816406, "learning_rate": 0.0002, "epoch": 0.9856422196352348, "step": 1270}, {"loss": 1.7111, "grad_norm": 0.2820223569869995, "learning_rate": 0.0002, "epoch": 0.9934031819945673, "step": 1280}, {"eval_loss": 1.8081045150756836, "eval_runtime": 102.3056, "eval_samples_per_second": 4.956, "eval_steps_per_second": 0.626, "epoch": 0.9996119518820333, "step": 1288}, {"loss": 1.7518, "grad_norm": 0.3282551169395447, "learning_rate": 0.0002, "epoch": 1.0011641443538999, "step": 1290}, {"loss": 1.6806, "grad_norm": 0.30217495560646057, "learning_rate": 0.0002, "epoch": 1.0089251067132325, "step": 1300}, {"loss": 1.6777, "grad_norm": 0.30801767110824585, "learning_rate": 0.0002, "epoch": 1.016686069072565, "step": 1310}, {"loss": 1.7756, "grad_norm": 0.31816792488098145, "learning_rate": 0.0002, "epoch": 1.0244470314318976, "step": 1320}, {"loss": 1.6986, "grad_norm": 0.27794334292411804, "learning_rate": 0.0002, "epoch": 1.03220799379123, "step": 1330}, {"loss": 1.6931, "grad_norm": 0.3018926680088043, "learning_rate": 0.0002, "epoch": 1.0399689561505627, "step": 1340}, {"loss": 1.7033, "grad_norm": 0.3552975356578827, "learning_rate": 0.0002, "epoch": 1.0477299185098952, "step": 1350}, {"loss": 1.6782, "grad_norm": 0.32590144872665405, "learning_rate": 0.0002, "epoch": 1.0554908808692278, "step": 1360}, {"loss": 1.6479, "grad_norm": 0.3435460925102234, "learning_rate": 0.0002, "epoch": 1.0632518432285603, "step": 1370}, {"loss": 1.7451, "grad_norm": 0.35037797689437866, "learning_rate": 0.0002, "epoch": 1.071012805587893, "step": 1380}, {"loss": 1.7868, "grad_norm": 0.31398263573646545, "learning_rate": 0.0002, "epoch": 1.0787737679472253, "step": 1390}, {"loss": 1.6729, "grad_norm": 0.3134010434150696, "learning_rate": 0.0002, "epoch": 1.086534730306558, "step": 1400}, {"loss": 1.751, "grad_norm": 0.4599704444408417, "learning_rate": 0.0002, "epoch": 1.0942956926658907, "step": 1410}, {"loss": 1.6871, "grad_norm": 0.35852891206741333, "learning_rate": 0.0002, "epoch": 1.102056655025223, "step": 1420}, {"loss": 1.7083, "grad_norm": 0.35628634691238403, "learning_rate": 0.0002, "epoch": 1.1098176173845558, "step": 1430}, {"loss": 1.6166, "grad_norm": 0.3769161105155945, "learning_rate": 0.0002, "epoch": 1.1175785797438882, "step": 1440}, {"loss": 1.7344, "grad_norm": 1.3712416887283325, "learning_rate": 0.0002, "epoch": 1.1253395421032208, "step": 1450}, {"loss": 1.6542, "grad_norm": 0.38406670093536377, "learning_rate": 0.0002, "epoch": 1.1331005044625533, "step": 1460}, {"loss": 1.7104, "grad_norm": 0.3402116000652313, "learning_rate": 0.0002, "epoch": 1.140861466821886, "step": 1470}, {"loss": 1.7074, "grad_norm": 0.341189444065094, "learning_rate": 0.0002, "epoch": 1.1486224291812184, "step": 1480}, {"loss": 1.6468, "grad_norm": 0.36629995703697205, "learning_rate": 0.0002, "epoch": 1.156383391540551, "step": 1490}, {"loss": 1.6952, "grad_norm": 0.3499569296836853, "learning_rate": 0.0002, "epoch": 1.1641443538998835, "step": 1500}, {"loss": 1.6625, "grad_norm": 0.3663063943386078, "learning_rate": 0.0002, "epoch": 1.1719053162592161, "step": 1510}, {"loss": 1.7533, "grad_norm": 0.34851500391960144, "learning_rate": 0.0002, "epoch": 1.1796662786185488, "step": 1520}, {"loss": 1.6092, "grad_norm": 0.35071656107902527, "learning_rate": 0.0002, "epoch": 1.1874272409778812, "step": 1530}, {"loss": 1.7206, "grad_norm": 0.42783796787261963, "learning_rate": 0.0002, "epoch": 1.1951882033372139, "step": 1540}, {"loss": 1.7499, "grad_norm": 0.31830692291259766, "learning_rate": 0.0002, "epoch": 1.2029491656965463, "step": 1550}, {"loss": 1.7372, "grad_norm": 0.3597424626350403, "learning_rate": 0.0002, "epoch": 1.210710128055879, "step": 1560}, {"loss": 1.6386, "grad_norm": 0.35233765840530396, "learning_rate": 0.0002, "epoch": 1.2184710904152114, "step": 1570}, {"loss": 1.6766, "grad_norm": 0.35942912101745605, "learning_rate": 0.0002, "epoch": 1.226232052774544, "step": 1580}, {"loss": 1.6598, "grad_norm": 0.36159393191337585, "learning_rate": 0.0002, "epoch": 1.2339930151338767, "step": 1590}, {"loss": 1.6697, "grad_norm": 0.3328469693660736, "learning_rate": 0.0002, "epoch": 1.2417539774932091, "step": 1600}, {"loss": 1.7594, "grad_norm": 0.3089476525783539, "learning_rate": 0.0002, "epoch": 1.2495149398525418, "step": 1610}, {"loss": 1.6805, "grad_norm": 0.30947765707969666, "learning_rate": 0.0002, "epoch": 1.2572759022118742, "step": 1620}, {"loss": 1.6899, "grad_norm": 0.32154011726379395, "learning_rate": 0.0002, "epoch": 1.265036864571207, "step": 1630}, {"loss": 1.6621, "grad_norm": 0.3480297923088074, "learning_rate": 0.0002, "epoch": 1.2727978269305393, "step": 1640}, {"loss": 1.7087, "grad_norm": 0.39471694827079773, "learning_rate": 0.0002, "epoch": 1.280558789289872, "step": 1650}, {"loss": 1.7608, "grad_norm": 0.35728853940963745, "learning_rate": 0.0002, "epoch": 1.2883197516492044, "step": 1660}, {"loss": 1.7008, "grad_norm": 0.35223081707954407, "learning_rate": 0.0002, "epoch": 1.296080714008537, "step": 1670}, {"loss": 1.7253, "grad_norm": 0.3588867485523224, "learning_rate": 0.0002, "epoch": 1.3038416763678695, "step": 1680}, {"loss": 1.6505, "grad_norm": 0.3528042733669281, "learning_rate": 0.0002, "epoch": 1.3116026387272022, "step": 1690}, {"loss": 1.6945, "grad_norm": 0.35975801944732666, "learning_rate": 0.0002, "epoch": 1.3193636010865348, "step": 1700}, {"loss": 1.6631, "grad_norm": 0.36691880226135254, "learning_rate": 0.0002, "epoch": 1.3271245634458673, "step": 1710}, {"loss": 1.7593, "grad_norm": 0.3787977695465088, "learning_rate": 0.0002, "epoch": 1.3348855258052, "step": 1720}, {"loss": 1.7697, "grad_norm": 0.36614933609962463, "learning_rate": 0.0002, "epoch": 1.3426464881645324, "step": 1730}, {"loss": 1.6487, "grad_norm": 0.3484745919704437, "learning_rate": 0.0002, "epoch": 1.350407450523865, "step": 1740}, {"loss": 1.7054, "grad_norm": 0.36905673146247864, "learning_rate": 0.0002, "epoch": 1.3581684128831975, "step": 1750}, {"loss": 1.7679, "grad_norm": 0.41564738750457764, "learning_rate": 0.0002, "epoch": 1.36592937524253, "step": 1760}, {"loss": 1.6634, "grad_norm": 0.3345205783843994, "learning_rate": 0.0002, "epoch": 1.3736903376018628, "step": 1770}, {"loss": 1.7275, "grad_norm": 0.34926071763038635, "learning_rate": 0.0002, "epoch": 1.3814512999611952, "step": 1780}, {"loss": 1.685, "grad_norm": 0.42004233598709106, "learning_rate": 0.0002, "epoch": 1.3892122623205276, "step": 1790}, {"loss": 1.666, "grad_norm": 0.3576236963272095, "learning_rate": 0.0002, "epoch": 1.3969732246798603, "step": 1800}, {"loss": 1.8516, "grad_norm": 0.3586704432964325, "learning_rate": 0.0002, "epoch": 1.404734187039193, "step": 1810}, {"loss": 1.6171, "grad_norm": 0.3943439722061157, "learning_rate": 0.0002, "epoch": 1.4124951493985254, "step": 1820}, {"loss": 1.6865, "grad_norm": 0.3484877049922943, "learning_rate": 0.0002, "epoch": 1.420256111757858, "step": 1830}, {"loss": 1.7205, "grad_norm": 0.3344518840312958, "learning_rate": 0.0002, "epoch": 1.4280170741171905, "step": 1840}, {"loss": 1.6999, "grad_norm": 0.4345698356628418, "learning_rate": 0.0002, "epoch": 1.4357780364765231, "step": 1850}, {"loss": 1.6855, "grad_norm": 0.5525162220001221, "learning_rate": 0.0002, "epoch": 1.4435389988358556, "step": 1860}, {"loss": 1.7143, "grad_norm": 0.37194496393203735, "learning_rate": 0.0002, "epoch": 1.4512999611951882, "step": 1870}, {"loss": 1.7623, "grad_norm": 0.34570157527923584, "learning_rate": 0.0002, "epoch": 1.4590609235545209, "step": 1880}, {"loss": 1.7, "grad_norm": 0.3512282073497772, "learning_rate": 0.0002, "epoch": 1.4668218859138533, "step": 1890}, {"loss": 1.7225, "grad_norm": 0.3443922996520996, "learning_rate": 0.0002, "epoch": 1.4745828482731858, "step": 1900}, {"loss": 1.7393, "grad_norm": 0.3812018036842346, "learning_rate": 0.0002, "epoch": 1.4823438106325184, "step": 1910}, {"loss": 1.7277, "grad_norm": 0.39263492822647095, "learning_rate": 0.0002, "epoch": 1.490104772991851, "step": 1920}, {"loss": 1.6829, "grad_norm": 0.3146156072616577, "learning_rate": 0.0002, "epoch": 1.4978657353511835, "step": 1930}, {"loss": 1.6881, "grad_norm": 0.3653988540172577, "learning_rate": 0.0002, "epoch": 1.505626697710516, "step": 1940}, {"loss": 1.7064, "grad_norm": 0.3966596722602844, "learning_rate": 0.0002, "epoch": 1.5133876600698488, "step": 1950}, {"loss": 1.6942, "grad_norm": 0.3441697359085083, "learning_rate": 0.0002, "epoch": 1.5211486224291813, "step": 1960}, {"loss": 1.7175, "grad_norm": 0.3328564465045929, "learning_rate": 0.0002, "epoch": 1.5289095847885137, "step": 1970}, {"loss": 1.7394, "grad_norm": 0.34068772196769714, "learning_rate": 0.0002, "epoch": 1.5366705471478463, "step": 1980}, {"loss": 1.7016, "grad_norm": 0.3559795916080475, "learning_rate": 0.0002, "epoch": 1.544431509507179, "step": 1990}, {"loss": 1.7102, "grad_norm": 0.37888768315315247, "learning_rate": 0.0002, "epoch": 1.5521924718665114, "step": 2000}, {"loss": 1.7094, "grad_norm": 0.36128363013267517, "learning_rate": 0.0002, "epoch": 1.5599534342258439, "step": 2010}, {"loss": 1.6407, "grad_norm": 0.3643714487552643, "learning_rate": 0.0002, "epoch": 1.5677143965851765, "step": 2020}, {"loss": 1.6777, "grad_norm": 0.3863612115383148, "learning_rate": 0.0002, "epoch": 1.5754753589445092, "step": 2030}, {"loss": 1.6575, "grad_norm": 0.32831457257270813, "learning_rate": 0.0002, "epoch": 1.5832363213038416, "step": 2040}, {"loss": 1.7404, "grad_norm": 0.36098113656044006, "learning_rate": 0.0002, "epoch": 1.5909972836631743, "step": 2050}, {"loss": 1.7065, "grad_norm": 1.1079334020614624, "learning_rate": 0.0002, "epoch": 1.598758246022507, "step": 2060}, {"loss": 1.6824, "grad_norm": 0.35615381598472595, "learning_rate": 0.0002, "epoch": 1.6065192083818394, "step": 2070}, {"loss": 1.7262, "grad_norm": 0.369711309671402, "learning_rate": 0.0002, "epoch": 1.6142801707411718, "step": 2080}, {"loss": 1.6995, "grad_norm": 0.390658438205719, "learning_rate": 0.0002, "epoch": 1.6220411331005045, "step": 2090}, {"loss": 1.6996, "grad_norm": 0.3422999382019043, "learning_rate": 0.0002, "epoch": 1.6298020954598371, "step": 2100}, {"loss": 1.7135, "grad_norm": 0.372475266456604, "learning_rate": 0.0002, "epoch": 1.6375630578191696, "step": 2110}, {"loss": 1.7216, "grad_norm": 0.35660576820373535, "learning_rate": 0.0002, "epoch": 1.645324020178502, "step": 2120}, {"loss": 1.6991, "grad_norm": 0.35754942893981934, "learning_rate": 0.0002, "epoch": 1.6530849825378346, "step": 2130}, {"loss": 1.6779, "grad_norm": 0.34572410583496094, "learning_rate": 0.0002, "epoch": 1.6608459448971673, "step": 2140}, {"loss": 1.6707, "grad_norm": 0.42059701681137085, "learning_rate": 0.0002, "epoch": 1.6686069072564997, "step": 2150}, {"loss": 1.6782, "grad_norm": 0.35200759768486023, "learning_rate": 0.0002, "epoch": 1.6763678696158324, "step": 2160}, {"loss": 1.6869, "grad_norm": 0.3704029321670532, "learning_rate": 0.0002, "epoch": 1.684128831975165, "step": 2170}, {"loss": 1.7192, "grad_norm": 0.40450501441955566, "learning_rate": 0.0002, "epoch": 1.6918897943344975, "step": 2180}, {"loss": 1.6228, "grad_norm": 0.362966924905777, "learning_rate": 0.0002, "epoch": 1.69965075669383, "step": 2190}, {"loss": 1.6935, "grad_norm": 0.36586204171180725, "learning_rate": 0.0002, "epoch": 1.7074117190531626, "step": 2200}, {"loss": 1.6088, "grad_norm": 0.3295372426509857, "learning_rate": 0.0002, "epoch": 1.7151726814124952, "step": 2210}, {"loss": 1.7844, "grad_norm": 0.3892575800418854, "learning_rate": 0.0002, "epoch": 1.7229336437718277, "step": 2220}, {"loss": 1.7805, "grad_norm": 0.34712135791778564, "learning_rate": 0.0002, "epoch": 1.73069460613116, "step": 2230}, {"loss": 1.7353, "grad_norm": 0.34801796078681946, "learning_rate": 0.0002, "epoch": 1.738455568490493, "step": 2240}, {"loss": 1.7009, "grad_norm": 0.3822397291660309, "learning_rate": 0.0002, "epoch": 1.7462165308498254, "step": 2250}, {"loss": 1.6546, "grad_norm": 0.38933250308036804, "learning_rate": 0.0002, "epoch": 1.7539774932091579, "step": 2260}, {"loss": 1.7245, "grad_norm": 0.3798373341560364, "learning_rate": 0.0002, "epoch": 1.7617384555684905, "step": 2270}, {"loss": 1.6508, "grad_norm": 0.35151317715644836, "learning_rate": 0.0002, "epoch": 1.7694994179278232, "step": 2280}, {"loss": 1.6894, "grad_norm": 0.44981494545936584, "learning_rate": 0.0002, "epoch": 1.7772603802871556, "step": 2290}, {"loss": 1.7271, "grad_norm": 0.3992624580860138, "learning_rate": 0.0002, "epoch": 1.785021342646488, "step": 2300}, {"loss": 1.7252, "grad_norm": 0.3772512376308441, "learning_rate": 0.0002, "epoch": 1.7927823050058207, "step": 2310}, {"loss": 1.7057, "grad_norm": 0.3511589467525482, "learning_rate": 0.0002, "epoch": 1.8005432673651534, "step": 2320}, {"loss": 1.764, "grad_norm": 0.3805285394191742, "learning_rate": 0.0002, "epoch": 1.8083042297244858, "step": 2330}, {"loss": 1.6986, "grad_norm": 0.3792071044445038, "learning_rate": 0.0002, "epoch": 1.8160651920838184, "step": 2340}, {"loss": 1.7759, "grad_norm": 0.36430829763412476, "learning_rate": 0.0002, "epoch": 1.823826154443151, "step": 2350}, {"loss": 1.6773, "grad_norm": 0.36502477526664734, "learning_rate": 0.0002, "epoch": 1.8315871168024835, "step": 2360}, {"loss": 1.8072, "grad_norm": 0.35015153884887695, "learning_rate": 0.0002, "epoch": 1.839348079161816, "step": 2370}, {"loss": 1.7734, "grad_norm": 0.3710903823375702, "learning_rate": 0.0002, "epoch": 1.8471090415211486, "step": 2380}, {"loss": 1.6737, "grad_norm": 0.3542828857898712, "learning_rate": 0.0002, "epoch": 1.8548700038804813, "step": 2390}, {"loss": 1.6783, "grad_norm": 0.35467568039894104, "learning_rate": 0.0002, "epoch": 1.8626309662398137, "step": 2400}, {"loss": 1.7773, "grad_norm": 0.3638560473918915, "learning_rate": 0.0002, "epoch": 1.8703919285991462, "step": 2410}, {"loss": 1.7019, "grad_norm": 0.3823298215866089, "learning_rate": 0.0002, "epoch": 1.8781528909584788, "step": 2420}, {"loss": 1.6935, "grad_norm": 0.3926416337490082, "learning_rate": 0.0002, "epoch": 1.8859138533178115, "step": 2430}, {"loss": 1.71, "grad_norm": 0.3608079254627228, "learning_rate": 0.0002, "epoch": 1.893674815677144, "step": 2440}, {"loss": 1.6654, "grad_norm": 0.3426613509654999, "learning_rate": 0.0002, "epoch": 1.9014357780364766, "step": 2450}, {"loss": 1.6892, "grad_norm": 0.3522338569164276, "learning_rate": 0.0002, "epoch": 1.9091967403958092, "step": 2460}, {"loss": 1.7307, "grad_norm": 0.3608049154281616, "learning_rate": 0.0002, "epoch": 1.9169577027551417, "step": 2470}, {"loss": 1.6823, "grad_norm": 0.3849755525588989, "learning_rate": 0.0002, "epoch": 1.924718665114474, "step": 2480}, {"loss": 1.7518, "grad_norm": 0.4154011011123657, "learning_rate": 0.0002, "epoch": 1.9324796274738067, "step": 2490}, {"loss": 1.7381, "grad_norm": 0.3602796792984009, "learning_rate": 0.0002, "epoch": 1.9402405898331394, "step": 2500}, {"loss": 1.7843, "grad_norm": 0.3702992796897888, "learning_rate": 0.0002, "epoch": 1.9480015521924718, "step": 2510}, {"loss": 1.6669, "grad_norm": 0.3657735288143158, "learning_rate": 0.0002, "epoch": 1.9557625145518043, "step": 2520}, {"loss": 1.5964, "grad_norm": 0.41031739115715027, "learning_rate": 0.0002, "epoch": 1.963523476911137, "step": 2530}, {"loss": 1.6745, "grad_norm": 0.34578680992126465, "learning_rate": 0.0002, "epoch": 1.9712844392704696, "step": 2540}, {"loss": 1.723, "grad_norm": 0.3361521065235138, "learning_rate": 0.0002, "epoch": 1.979045401629802, "step": 2550}, {"loss": 1.6868, "grad_norm": 0.34342363476753235, "learning_rate": 0.0002, "epoch": 1.9868063639891347, "step": 2560}, {"loss": 1.6577, "grad_norm": 0.32954007387161255, "learning_rate": 0.0002, "epoch": 1.9945673263484673, "step": 2570}]} +{"epoch": 2.9996119518820334, "step": 3865, "epoch_duration": 3652.9453053474426, "total_accumulated_duration": 10623.01929283142, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28774.0}, "peak_memory_reserved": {"GPU_0": 28774.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0855, "grad_norm": 1.0751162767410278, "learning_rate": 0.0002, "epoch": 0.007760962359332557, "step": 10}, {"loss": 2.4744, "grad_norm": 0.4697345793247223, "learning_rate": 0.0002, "epoch": 0.015521924718665115, "step": 20}, {"loss": 2.193, "grad_norm": 0.5370839238166809, "learning_rate": 0.0002, "epoch": 0.023282887077997673, "step": 30}, {"loss": 2.0599, "grad_norm": 0.46794816851615906, "learning_rate": 0.0002, "epoch": 0.03104384943733023, "step": 40}, {"loss": 1.9354, "grad_norm": 0.44624820351600647, "learning_rate": 0.0002, "epoch": 0.038804811796662786, "step": 50}, {"loss": 1.9319, "grad_norm": 0.3953201472759247, "learning_rate": 0.0002, "epoch": 0.046565774155995346, "step": 60}, {"loss": 1.9099, "grad_norm": 0.3935912549495697, "learning_rate": 0.0002, "epoch": 0.0543267365153279, "step": 70}, {"loss": 1.8795, "grad_norm": 0.4520699381828308, "learning_rate": 0.0002, "epoch": 0.06208769887466046, "step": 80}, {"loss": 1.8354, "grad_norm": 0.3801847994327545, "learning_rate": 0.0002, "epoch": 0.06984866123399301, "step": 90}, {"loss": 1.9053, "grad_norm": 0.4020165205001831, "learning_rate": 0.0002, "epoch": 0.07760962359332557, "step": 100}, {"loss": 1.8779, "grad_norm": 0.3860672116279602, "learning_rate": 0.0002, "epoch": 0.08537058595265813, "step": 110}, {"loss": 1.8731, "grad_norm": 0.3681113123893738, "learning_rate": 0.0002, "epoch": 0.09313154831199069, "step": 120}, {"loss": 1.8157, "grad_norm": 0.3594866991043091, "learning_rate": 0.0002, "epoch": 0.10089251067132324, "step": 130}, {"loss": 1.8266, "grad_norm": 0.3879193663597107, "learning_rate": 0.0002, "epoch": 0.1086534730306558, "step": 140}, {"loss": 1.8818, "grad_norm": 0.3270505666732788, "learning_rate": 0.0002, "epoch": 0.11641443538998836, "step": 150}, {"loss": 1.87, "grad_norm": 0.36824458837509155, "learning_rate": 0.0002, "epoch": 0.12417539774932092, "step": 160}, {"loss": 1.8305, "grad_norm": 0.383882075548172, "learning_rate": 0.0002, "epoch": 0.13193636010865348, "step": 170}, {"loss": 1.8584, "grad_norm": 0.3368665874004364, "learning_rate": 0.0002, "epoch": 0.13969732246798602, "step": 180}, {"loss": 1.7882, "grad_norm": 0.35961097478866577, "learning_rate": 0.0002, "epoch": 0.1474582848273186, "step": 190}, {"loss": 1.8467, "grad_norm": 0.3415963351726532, "learning_rate": 0.0002, "epoch": 0.15521924718665114, "step": 200}, {"loss": 1.8543, "grad_norm": 0.4100632071495056, "learning_rate": 0.0002, "epoch": 0.1629802095459837, "step": 210}, {"loss": 1.8226, "grad_norm": 0.3516307473182678, "learning_rate": 0.0002, "epoch": 0.17074117190531626, "step": 220}, {"loss": 1.7386, "grad_norm": 0.37919050455093384, "learning_rate": 0.0002, "epoch": 0.1785021342646488, "step": 230}, {"loss": 1.7937, "grad_norm": 0.33270683884620667, "learning_rate": 0.0002, "epoch": 0.18626309662398138, "step": 240}, {"loss": 1.7925, "grad_norm": 0.3348783254623413, "learning_rate": 0.0002, "epoch": 0.19402405898331393, "step": 250}, {"loss": 1.7774, "grad_norm": 0.3888475298881531, "learning_rate": 0.0002, "epoch": 0.20178502134264648, "step": 260}, {"loss": 1.8381, "grad_norm": 0.3554602861404419, "learning_rate": 0.0002, "epoch": 0.20954598370197905, "step": 270}, {"loss": 1.8359, "grad_norm": 0.33277708292007446, "learning_rate": 0.0002, "epoch": 0.2173069460613116, "step": 280}, {"loss": 1.7713, "grad_norm": 0.3281584680080414, "learning_rate": 0.0002, "epoch": 0.22506790842064417, "step": 290}, {"loss": 1.8181, "grad_norm": 0.3185969591140747, "learning_rate": 0.0002, "epoch": 0.23282887077997672, "step": 300}, {"loss": 1.8595, "grad_norm": 0.35335442423820496, "learning_rate": 0.0002, "epoch": 0.24058983313930926, "step": 310}, {"loss": 1.87, "grad_norm": 0.3119595944881439, "learning_rate": 0.0002, "epoch": 0.24835079549864184, "step": 320}, {"loss": 1.8357, "grad_norm": 0.36424458026885986, "learning_rate": 0.0002, "epoch": 0.2561117578579744, "step": 330}, {"loss": 1.8003, "grad_norm": 0.3618951141834259, "learning_rate": 0.0002, "epoch": 0.26387272021730696, "step": 340}, {"loss": 1.8221, "grad_norm": 0.312757670879364, "learning_rate": 0.0002, "epoch": 0.2716336825766395, "step": 350}, {"loss": 1.9031, "grad_norm": 0.326016366481781, "learning_rate": 0.0002, "epoch": 0.27939464493597205, "step": 360}, {"loss": 1.8214, "grad_norm": 0.34093883633613586, "learning_rate": 0.0002, "epoch": 0.2871556072953046, "step": 370}, {"loss": 1.7733, "grad_norm": 0.32325029373168945, "learning_rate": 0.0002, "epoch": 0.2949165696546372, "step": 380}, {"loss": 1.842, "grad_norm": 0.34105437994003296, "learning_rate": 0.0002, "epoch": 0.30267753201396974, "step": 390}, {"loss": 1.7926, "grad_norm": 0.32565295696258545, "learning_rate": 0.0002, "epoch": 0.3104384943733023, "step": 400}, {"loss": 1.8031, "grad_norm": 0.32742050290107727, "learning_rate": 0.0002, "epoch": 0.31819945673263483, "step": 410}, {"loss": 1.907, "grad_norm": 0.30233046412467957, "learning_rate": 0.0002, "epoch": 0.3259604190919674, "step": 420}, {"loss": 1.7623, "grad_norm": 0.32419222593307495, "learning_rate": 0.0002, "epoch": 0.3337213814513, "step": 430}, {"loss": 1.865, "grad_norm": 0.3653007745742798, "learning_rate": 0.0002, "epoch": 0.3414823438106325, "step": 440}, {"loss": 1.8044, "grad_norm": 0.31617099046707153, "learning_rate": 0.0002, "epoch": 0.3492433061699651, "step": 450}, {"loss": 1.7677, "grad_norm": 0.3305962085723877, "learning_rate": 0.0002, "epoch": 0.3570042685292976, "step": 460}, {"loss": 1.8155, "grad_norm": 0.3178933262825012, "learning_rate": 0.0002, "epoch": 0.36476523088863017, "step": 470}, {"loss": 1.7485, "grad_norm": 0.37163782119750977, "learning_rate": 0.0002, "epoch": 0.37252619324796277, "step": 480}, {"loss": 1.8804, "grad_norm": 0.469844788312912, "learning_rate": 0.0002, "epoch": 0.3802871556072953, "step": 490}, {"loss": 1.8343, "grad_norm": 0.3409338593482971, "learning_rate": 0.0002, "epoch": 0.38804811796662786, "step": 500}, {"loss": 1.8433, "grad_norm": 0.31943467259407043, "learning_rate": 0.0002, "epoch": 0.3958090803259604, "step": 510}, {"loss": 1.7873, "grad_norm": 0.32293614745140076, "learning_rate": 0.0002, "epoch": 0.40357004268529295, "step": 520}, {"loss": 1.8584, "grad_norm": 0.2994382977485657, "learning_rate": 0.0002, "epoch": 0.41133100504462555, "step": 530}, {"loss": 1.8153, "grad_norm": 0.3273141384124756, "learning_rate": 0.0002, "epoch": 0.4190919674039581, "step": 540}, {"loss": 1.8097, "grad_norm": 0.3020550012588501, "learning_rate": 0.0002, "epoch": 0.42685292976329064, "step": 550}, {"loss": 1.8551, "grad_norm": 0.30113112926483154, "learning_rate": 0.0002, "epoch": 0.4346138921226232, "step": 560}, {"loss": 1.8084, "grad_norm": 0.30274903774261475, "learning_rate": 0.0002, "epoch": 0.44237485448195574, "step": 570}, {"loss": 1.7673, "grad_norm": 0.3231128454208374, "learning_rate": 0.0002, "epoch": 0.45013581684128834, "step": 580}, {"loss": 1.7848, "grad_norm": 0.3255121409893036, "learning_rate": 0.0002, "epoch": 0.4578967792006209, "step": 590}, {"loss": 1.8227, "grad_norm": 0.30147507786750793, "learning_rate": 0.0002, "epoch": 0.46565774155995343, "step": 600}, {"loss": 1.7572, "grad_norm": 0.29781386256217957, "learning_rate": 0.0002, "epoch": 0.473418703919286, "step": 610}, {"loss": 1.8307, "grad_norm": 0.30914685130119324, "learning_rate": 0.0002, "epoch": 0.4811796662786185, "step": 620}, {"loss": 1.805, "grad_norm": 0.3110593855381012, "learning_rate": 0.0002, "epoch": 0.4889406286379511, "step": 630}, {"loss": 1.8228, "grad_norm": 0.3298132121562958, "learning_rate": 0.0002, "epoch": 0.49670159099728367, "step": 640}, {"loss": 1.7816, "grad_norm": 0.322122186422348, "learning_rate": 0.0002, "epoch": 0.5044625533566163, "step": 650}, {"loss": 1.8001, "grad_norm": 0.3504371643066406, "learning_rate": 0.0002, "epoch": 0.5122235157159488, "step": 660}, {"loss": 1.8682, "grad_norm": 0.3102182149887085, "learning_rate": 0.0002, "epoch": 0.5199844780752814, "step": 670}, {"loss": 1.7494, "grad_norm": 0.6113658547401428, "learning_rate": 0.0002, "epoch": 0.5277454404346139, "step": 680}, {"loss": 1.7096, "grad_norm": 0.31841862201690674, "learning_rate": 0.0002, "epoch": 0.5355064027939465, "step": 690}, {"loss": 1.7587, "grad_norm": 0.2830526530742645, "learning_rate": 0.0002, "epoch": 0.543267365153279, "step": 700}, {"loss": 1.7887, "grad_norm": 0.3048769533634186, "learning_rate": 0.0002, "epoch": 0.5510283275126115, "step": 710}, {"loss": 1.8416, "grad_norm": 0.2719033658504486, "learning_rate": 0.0002, "epoch": 0.5587892898719441, "step": 720}, {"loss": 1.786, "grad_norm": 0.3176722526550293, "learning_rate": 0.0002, "epoch": 0.5665502522312766, "step": 730}, {"loss": 1.7127, "grad_norm": 0.32491734623908997, "learning_rate": 0.0002, "epoch": 0.5743112145906092, "step": 740}, {"loss": 1.7892, "grad_norm": 0.32746851444244385, "learning_rate": 0.0002, "epoch": 0.5820721769499418, "step": 750}, {"loss": 1.7811, "grad_norm": 0.3055773973464966, "learning_rate": 0.0002, "epoch": 0.5898331393092744, "step": 760}, {"loss": 1.8597, "grad_norm": 0.30671584606170654, "learning_rate": 0.0002, "epoch": 0.5975941016686069, "step": 770}, {"loss": 1.7728, "grad_norm": 0.28770264983177185, "learning_rate": 0.0002, "epoch": 0.6053550640279395, "step": 780}, {"loss": 1.7025, "grad_norm": 0.2814285457134247, "learning_rate": 0.0002, "epoch": 0.613116026387272, "step": 790}, {"loss": 1.819, "grad_norm": 0.31554412841796875, "learning_rate": 0.0002, "epoch": 0.6208769887466046, "step": 800}, {"loss": 1.8335, "grad_norm": 0.2984226942062378, "learning_rate": 0.0002, "epoch": 0.6286379511059371, "step": 810}, {"loss": 1.7728, "grad_norm": 0.2859906554222107, "learning_rate": 0.0002, "epoch": 0.6363989134652697, "step": 820}, {"loss": 1.7408, "grad_norm": 0.2887928783893585, "learning_rate": 0.0002, "epoch": 0.6441598758246022, "step": 830}, {"loss": 1.8071, "grad_norm": 0.31287339329719543, "learning_rate": 0.0002, "epoch": 0.6519208381839348, "step": 840}, {"loss": 1.7673, "grad_norm": 0.32064181566238403, "learning_rate": 0.0002, "epoch": 0.6596818005432674, "step": 850}, {"loss": 1.7849, "grad_norm": 0.290981650352478, "learning_rate": 0.0002, "epoch": 0.6674427629026, "step": 860}, {"loss": 1.8089, "grad_norm": 0.33060121536254883, "learning_rate": 0.0002, "epoch": 0.6752037252619325, "step": 870}, {"loss": 1.7357, "grad_norm": 0.27032899856567383, "learning_rate": 0.0002, "epoch": 0.682964687621265, "step": 880}, {"loss": 1.8423, "grad_norm": 0.29031234979629517, "learning_rate": 0.0002, "epoch": 0.6907256499805976, "step": 890}, {"loss": 1.835, "grad_norm": 0.2845142185688019, "learning_rate": 0.0002, "epoch": 0.6984866123399301, "step": 900}, {"loss": 1.77, "grad_norm": 0.8638312816619873, "learning_rate": 0.0002, "epoch": 0.7062475746992627, "step": 910}, {"loss": 1.7757, "grad_norm": 0.3086668848991394, "learning_rate": 0.0002, "epoch": 0.7140085370585952, "step": 920}, {"loss": 1.7541, "grad_norm": 0.2724177837371826, "learning_rate": 0.0002, "epoch": 0.7217694994179278, "step": 930}, {"loss": 1.816, "grad_norm": 0.289559006690979, "learning_rate": 0.0002, "epoch": 0.7295304617772603, "step": 940}, {"loss": 1.7654, "grad_norm": 0.3000658452510834, "learning_rate": 0.0002, "epoch": 0.737291424136593, "step": 950}, {"loss": 1.7736, "grad_norm": 0.33544042706489563, "learning_rate": 0.0002, "epoch": 0.7450523864959255, "step": 960}, {"loss": 1.6979, "grad_norm": 0.28593236207962036, "learning_rate": 0.0002, "epoch": 0.7528133488552581, "step": 970}, {"loss": 1.8583, "grad_norm": 0.313634991645813, "learning_rate": 0.0002, "epoch": 0.7605743112145906, "step": 980}, {"loss": 1.7473, "grad_norm": 0.2949385941028595, "learning_rate": 0.0002, "epoch": 0.7683352735739232, "step": 990}, {"loss": 1.8689, "grad_norm": 0.2920108437538147, "learning_rate": 0.0002, "epoch": 0.7760962359332557, "step": 1000}, {"loss": 1.8401, "grad_norm": 0.3245100677013397, "learning_rate": 0.0002, "epoch": 0.7838571982925883, "step": 1010}, {"loss": 1.7109, "grad_norm": 0.3007619380950928, "learning_rate": 0.0002, "epoch": 0.7916181606519208, "step": 1020}, {"loss": 1.7427, "grad_norm": 0.3630852997303009, "learning_rate": 0.0002, "epoch": 0.7993791230112534, "step": 1030}, {"loss": 1.7655, "grad_norm": 0.2856379747390747, "learning_rate": 0.0002, "epoch": 0.8071400853705859, "step": 1040}, {"loss": 1.8371, "grad_norm": 0.32476478815078735, "learning_rate": 0.0002, "epoch": 0.8149010477299186, "step": 1050}, {"loss": 1.8039, "grad_norm": 0.5162565112113953, "learning_rate": 0.0002, "epoch": 0.8226620100892511, "step": 1060}, {"loss": 1.8862, "grad_norm": 0.316496342420578, "learning_rate": 0.0002, "epoch": 0.8304229724485837, "step": 1070}, {"loss": 1.8023, "grad_norm": 0.31977516412734985, "learning_rate": 0.0002, "epoch": 0.8381839348079162, "step": 1080}, {"loss": 1.8547, "grad_norm": 0.269509494304657, "learning_rate": 0.0002, "epoch": 0.8459448971672487, "step": 1090}, {"loss": 1.7811, "grad_norm": 0.31621453166007996, "learning_rate": 0.0002, "epoch": 0.8537058595265813, "step": 1100}, {"loss": 1.739, "grad_norm": 0.2946535050868988, "learning_rate": 0.0002, "epoch": 0.8614668218859138, "step": 1110}, {"loss": 1.7511, "grad_norm": 0.3088909983634949, "learning_rate": 0.0002, "epoch": 0.8692277842452464, "step": 1120}, {"loss": 1.8228, "grad_norm": 0.33033716678619385, "learning_rate": 0.0002, "epoch": 0.8769887466045789, "step": 1130}, {"loss": 1.7912, "grad_norm": 0.2954833507537842, "learning_rate": 0.0002, "epoch": 0.8847497089639115, "step": 1140}, {"loss": 1.8394, "grad_norm": 0.2950248122215271, "learning_rate": 0.0002, "epoch": 0.8925106713232441, "step": 1150}, {"loss": 1.7068, "grad_norm": 0.296661913394928, "learning_rate": 0.0002, "epoch": 0.9002716336825767, "step": 1160}, {"loss": 1.7967, "grad_norm": 0.35451310873031616, "learning_rate": 0.0002, "epoch": 0.9080325960419092, "step": 1170}, {"loss": 1.8202, "grad_norm": 0.32705947756767273, "learning_rate": 0.0002, "epoch": 0.9157935584012418, "step": 1180}, {"loss": 1.7396, "grad_norm": 0.3333960771560669, "learning_rate": 0.0002, "epoch": 0.9235545207605743, "step": 1190}, {"loss": 1.7801, "grad_norm": 0.3042232096195221, "learning_rate": 0.0002, "epoch": 0.9313154831199069, "step": 1200}, {"loss": 1.7586, "grad_norm": 0.281553715467453, "learning_rate": 0.0002, "epoch": 0.9390764454792394, "step": 1210}, {"loss": 1.7953, "grad_norm": 0.3096391558647156, "learning_rate": 0.0002, "epoch": 0.946837407838572, "step": 1220}, {"loss": 1.7401, "grad_norm": 0.2866271734237671, "learning_rate": 0.0002, "epoch": 0.9545983701979045, "step": 1230}, {"loss": 1.7211, "grad_norm": 0.28394097089767456, "learning_rate": 0.0002, "epoch": 0.962359332557237, "step": 1240}, {"loss": 1.7363, "grad_norm": 0.3249266743659973, "learning_rate": 0.0002, "epoch": 0.9701202949165697, "step": 1250}, {"loss": 1.7563, "grad_norm": 0.2896869480609894, "learning_rate": 0.0002, "epoch": 0.9778812572759022, "step": 1260}, {"loss": 1.6389, "grad_norm": 0.29224586486816406, "learning_rate": 0.0002, "epoch": 0.9856422196352348, "step": 1270}, {"loss": 1.7111, "grad_norm": 0.2820223569869995, "learning_rate": 0.0002, "epoch": 0.9934031819945673, "step": 1280}, {"eval_loss": 1.8081045150756836, "eval_runtime": 102.3056, "eval_samples_per_second": 4.956, "eval_steps_per_second": 0.626, "epoch": 0.9996119518820333, "step": 1288}, {"loss": 1.7518, "grad_norm": 0.3282551169395447, "learning_rate": 0.0002, "epoch": 1.0011641443538999, "step": 1290}, {"loss": 1.6806, "grad_norm": 0.30217495560646057, "learning_rate": 0.0002, "epoch": 1.0089251067132325, "step": 1300}, {"loss": 1.6777, "grad_norm": 0.30801767110824585, "learning_rate": 0.0002, "epoch": 1.016686069072565, "step": 1310}, {"loss": 1.7756, "grad_norm": 0.31816792488098145, "learning_rate": 0.0002, "epoch": 1.0244470314318976, "step": 1320}, {"loss": 1.6986, "grad_norm": 0.27794334292411804, "learning_rate": 0.0002, "epoch": 1.03220799379123, "step": 1330}, {"loss": 1.6931, "grad_norm": 0.3018926680088043, "learning_rate": 0.0002, "epoch": 1.0399689561505627, "step": 1340}, {"loss": 1.7033, "grad_norm": 0.3552975356578827, "learning_rate": 0.0002, "epoch": 1.0477299185098952, "step": 1350}, {"loss": 1.6782, "grad_norm": 0.32590144872665405, "learning_rate": 0.0002, "epoch": 1.0554908808692278, "step": 1360}, {"loss": 1.6479, "grad_norm": 0.3435460925102234, "learning_rate": 0.0002, "epoch": 1.0632518432285603, "step": 1370}, {"loss": 1.7451, "grad_norm": 0.35037797689437866, "learning_rate": 0.0002, "epoch": 1.071012805587893, "step": 1380}, {"loss": 1.7868, "grad_norm": 0.31398263573646545, "learning_rate": 0.0002, "epoch": 1.0787737679472253, "step": 1390}, {"loss": 1.6729, "grad_norm": 0.3134010434150696, "learning_rate": 0.0002, "epoch": 1.086534730306558, "step": 1400}, {"loss": 1.751, "grad_norm": 0.4599704444408417, "learning_rate": 0.0002, "epoch": 1.0942956926658907, "step": 1410}, {"loss": 1.6871, "grad_norm": 0.35852891206741333, "learning_rate": 0.0002, "epoch": 1.102056655025223, "step": 1420}, {"loss": 1.7083, "grad_norm": 0.35628634691238403, "learning_rate": 0.0002, "epoch": 1.1098176173845558, "step": 1430}, {"loss": 1.6166, "grad_norm": 0.3769161105155945, "learning_rate": 0.0002, "epoch": 1.1175785797438882, "step": 1440}, {"loss": 1.7344, "grad_norm": 1.3712416887283325, "learning_rate": 0.0002, "epoch": 1.1253395421032208, "step": 1450}, {"loss": 1.6542, "grad_norm": 0.38406670093536377, "learning_rate": 0.0002, "epoch": 1.1331005044625533, "step": 1460}, {"loss": 1.7104, "grad_norm": 0.3402116000652313, "learning_rate": 0.0002, "epoch": 1.140861466821886, "step": 1470}, {"loss": 1.7074, "grad_norm": 0.341189444065094, "learning_rate": 0.0002, "epoch": 1.1486224291812184, "step": 1480}, {"loss": 1.6468, "grad_norm": 0.36629995703697205, "learning_rate": 0.0002, "epoch": 1.156383391540551, "step": 1490}, {"loss": 1.6952, "grad_norm": 0.3499569296836853, "learning_rate": 0.0002, "epoch": 1.1641443538998835, "step": 1500}, {"loss": 1.6625, "grad_norm": 0.3663063943386078, "learning_rate": 0.0002, "epoch": 1.1719053162592161, "step": 1510}, {"loss": 1.7533, "grad_norm": 0.34851500391960144, "learning_rate": 0.0002, "epoch": 1.1796662786185488, "step": 1520}, {"loss": 1.6092, "grad_norm": 0.35071656107902527, "learning_rate": 0.0002, "epoch": 1.1874272409778812, "step": 1530}, {"loss": 1.7206, "grad_norm": 0.42783796787261963, "learning_rate": 0.0002, "epoch": 1.1951882033372139, "step": 1540}, {"loss": 1.7499, "grad_norm": 0.31830692291259766, "learning_rate": 0.0002, "epoch": 1.2029491656965463, "step": 1550}, {"loss": 1.7372, "grad_norm": 0.3597424626350403, "learning_rate": 0.0002, "epoch": 1.210710128055879, "step": 1560}, {"loss": 1.6386, "grad_norm": 0.35233765840530396, "learning_rate": 0.0002, "epoch": 1.2184710904152114, "step": 1570}, {"loss": 1.6766, "grad_norm": 0.35942912101745605, "learning_rate": 0.0002, "epoch": 1.226232052774544, "step": 1580}, {"loss": 1.6598, "grad_norm": 0.36159393191337585, "learning_rate": 0.0002, "epoch": 1.2339930151338767, "step": 1590}, {"loss": 1.6697, "grad_norm": 0.3328469693660736, "learning_rate": 0.0002, "epoch": 1.2417539774932091, "step": 1600}, {"loss": 1.7594, "grad_norm": 0.3089476525783539, "learning_rate": 0.0002, "epoch": 1.2495149398525418, "step": 1610}, {"loss": 1.6805, "grad_norm": 0.30947765707969666, "learning_rate": 0.0002, "epoch": 1.2572759022118742, "step": 1620}, {"loss": 1.6899, "grad_norm": 0.32154011726379395, "learning_rate": 0.0002, "epoch": 1.265036864571207, "step": 1630}, {"loss": 1.6621, "grad_norm": 0.3480297923088074, "learning_rate": 0.0002, "epoch": 1.2727978269305393, "step": 1640}, {"loss": 1.7087, "grad_norm": 0.39471694827079773, "learning_rate": 0.0002, "epoch": 1.280558789289872, "step": 1650}, {"loss": 1.7608, "grad_norm": 0.35728853940963745, "learning_rate": 0.0002, "epoch": 1.2883197516492044, "step": 1660}, {"loss": 1.7008, "grad_norm": 0.35223081707954407, "learning_rate": 0.0002, "epoch": 1.296080714008537, "step": 1670}, {"loss": 1.7253, "grad_norm": 0.3588867485523224, "learning_rate": 0.0002, "epoch": 1.3038416763678695, "step": 1680}, {"loss": 1.6505, "grad_norm": 0.3528042733669281, "learning_rate": 0.0002, "epoch": 1.3116026387272022, "step": 1690}, {"loss": 1.6945, "grad_norm": 0.35975801944732666, "learning_rate": 0.0002, "epoch": 1.3193636010865348, "step": 1700}, {"loss": 1.6631, "grad_norm": 0.36691880226135254, "learning_rate": 0.0002, "epoch": 1.3271245634458673, "step": 1710}, {"loss": 1.7593, "grad_norm": 0.3787977695465088, "learning_rate": 0.0002, "epoch": 1.3348855258052, "step": 1720}, {"loss": 1.7697, "grad_norm": 0.36614933609962463, "learning_rate": 0.0002, "epoch": 1.3426464881645324, "step": 1730}, {"loss": 1.6487, "grad_norm": 0.3484745919704437, "learning_rate": 0.0002, "epoch": 1.350407450523865, "step": 1740}, {"loss": 1.7054, "grad_norm": 0.36905673146247864, "learning_rate": 0.0002, "epoch": 1.3581684128831975, "step": 1750}, {"loss": 1.7679, "grad_norm": 0.41564738750457764, "learning_rate": 0.0002, "epoch": 1.36592937524253, "step": 1760}, {"loss": 1.6634, "grad_norm": 0.3345205783843994, "learning_rate": 0.0002, "epoch": 1.3736903376018628, "step": 1770}, {"loss": 1.7275, "grad_norm": 0.34926071763038635, "learning_rate": 0.0002, "epoch": 1.3814512999611952, "step": 1780}, {"loss": 1.685, "grad_norm": 0.42004233598709106, "learning_rate": 0.0002, "epoch": 1.3892122623205276, "step": 1790}, {"loss": 1.666, "grad_norm": 0.3576236963272095, "learning_rate": 0.0002, "epoch": 1.3969732246798603, "step": 1800}, {"loss": 1.8516, "grad_norm": 0.3586704432964325, "learning_rate": 0.0002, "epoch": 1.404734187039193, "step": 1810}, {"loss": 1.6171, "grad_norm": 0.3943439722061157, "learning_rate": 0.0002, "epoch": 1.4124951493985254, "step": 1820}, {"loss": 1.6865, "grad_norm": 0.3484877049922943, "learning_rate": 0.0002, "epoch": 1.420256111757858, "step": 1830}, {"loss": 1.7205, "grad_norm": 0.3344518840312958, "learning_rate": 0.0002, "epoch": 1.4280170741171905, "step": 1840}, {"loss": 1.6999, "grad_norm": 0.4345698356628418, "learning_rate": 0.0002, "epoch": 1.4357780364765231, "step": 1850}, {"loss": 1.6855, "grad_norm": 0.5525162220001221, "learning_rate": 0.0002, "epoch": 1.4435389988358556, "step": 1860}, {"loss": 1.7143, "grad_norm": 0.37194496393203735, "learning_rate": 0.0002, "epoch": 1.4512999611951882, "step": 1870}, {"loss": 1.7623, "grad_norm": 0.34570157527923584, "learning_rate": 0.0002, "epoch": 1.4590609235545209, "step": 1880}, {"loss": 1.7, "grad_norm": 0.3512282073497772, "learning_rate": 0.0002, "epoch": 1.4668218859138533, "step": 1890}, {"loss": 1.7225, "grad_norm": 0.3443922996520996, "learning_rate": 0.0002, "epoch": 1.4745828482731858, "step": 1900}, {"loss": 1.7393, "grad_norm": 0.3812018036842346, "learning_rate": 0.0002, "epoch": 1.4823438106325184, "step": 1910}, {"loss": 1.7277, "grad_norm": 0.39263492822647095, "learning_rate": 0.0002, "epoch": 1.490104772991851, "step": 1920}, {"loss": 1.6829, "grad_norm": 0.3146156072616577, "learning_rate": 0.0002, "epoch": 1.4978657353511835, "step": 1930}, {"loss": 1.6881, "grad_norm": 0.3653988540172577, "learning_rate": 0.0002, "epoch": 1.505626697710516, "step": 1940}, {"loss": 1.7064, "grad_norm": 0.3966596722602844, "learning_rate": 0.0002, "epoch": 1.5133876600698488, "step": 1950}, {"loss": 1.6942, "grad_norm": 0.3441697359085083, "learning_rate": 0.0002, "epoch": 1.5211486224291813, "step": 1960}, {"loss": 1.7175, "grad_norm": 0.3328564465045929, "learning_rate": 0.0002, "epoch": 1.5289095847885137, "step": 1970}, {"loss": 1.7394, "grad_norm": 0.34068772196769714, "learning_rate": 0.0002, "epoch": 1.5366705471478463, "step": 1980}, {"loss": 1.7016, "grad_norm": 0.3559795916080475, "learning_rate": 0.0002, "epoch": 1.544431509507179, "step": 1990}, {"loss": 1.7102, "grad_norm": 0.37888768315315247, "learning_rate": 0.0002, "epoch": 1.5521924718665114, "step": 2000}, {"loss": 1.7094, "grad_norm": 0.36128363013267517, "learning_rate": 0.0002, "epoch": 1.5599534342258439, "step": 2010}, {"loss": 1.6407, "grad_norm": 0.3643714487552643, "learning_rate": 0.0002, "epoch": 1.5677143965851765, "step": 2020}, {"loss": 1.6777, "grad_norm": 0.3863612115383148, "learning_rate": 0.0002, "epoch": 1.5754753589445092, "step": 2030}, {"loss": 1.6575, "grad_norm": 0.32831457257270813, "learning_rate": 0.0002, "epoch": 1.5832363213038416, "step": 2040}, {"loss": 1.7404, "grad_norm": 0.36098113656044006, "learning_rate": 0.0002, "epoch": 1.5909972836631743, "step": 2050}, {"loss": 1.7065, "grad_norm": 1.1079334020614624, "learning_rate": 0.0002, "epoch": 1.598758246022507, "step": 2060}, {"loss": 1.6824, "grad_norm": 0.35615381598472595, "learning_rate": 0.0002, "epoch": 1.6065192083818394, "step": 2070}, {"loss": 1.7262, "grad_norm": 0.369711309671402, "learning_rate": 0.0002, "epoch": 1.6142801707411718, "step": 2080}, {"loss": 1.6995, "grad_norm": 0.390658438205719, "learning_rate": 0.0002, "epoch": 1.6220411331005045, "step": 2090}, {"loss": 1.6996, "grad_norm": 0.3422999382019043, "learning_rate": 0.0002, "epoch": 1.6298020954598371, "step": 2100}, {"loss": 1.7135, "grad_norm": 0.372475266456604, "learning_rate": 0.0002, "epoch": 1.6375630578191696, "step": 2110}, {"loss": 1.7216, "grad_norm": 0.35660576820373535, "learning_rate": 0.0002, "epoch": 1.645324020178502, "step": 2120}, {"loss": 1.6991, "grad_norm": 0.35754942893981934, "learning_rate": 0.0002, "epoch": 1.6530849825378346, "step": 2130}, {"loss": 1.6779, "grad_norm": 0.34572410583496094, "learning_rate": 0.0002, "epoch": 1.6608459448971673, "step": 2140}, {"loss": 1.6707, "grad_norm": 0.42059701681137085, "learning_rate": 0.0002, "epoch": 1.6686069072564997, "step": 2150}, {"loss": 1.6782, "grad_norm": 0.35200759768486023, "learning_rate": 0.0002, "epoch": 1.6763678696158324, "step": 2160}, {"loss": 1.6869, "grad_norm": 0.3704029321670532, "learning_rate": 0.0002, "epoch": 1.684128831975165, "step": 2170}, {"loss": 1.7192, "grad_norm": 0.40450501441955566, "learning_rate": 0.0002, "epoch": 1.6918897943344975, "step": 2180}, {"loss": 1.6228, "grad_norm": 0.362966924905777, "learning_rate": 0.0002, "epoch": 1.69965075669383, "step": 2190}, {"loss": 1.6935, "grad_norm": 0.36586204171180725, "learning_rate": 0.0002, "epoch": 1.7074117190531626, "step": 2200}, {"loss": 1.6088, "grad_norm": 0.3295372426509857, "learning_rate": 0.0002, "epoch": 1.7151726814124952, "step": 2210}, {"loss": 1.7844, "grad_norm": 0.3892575800418854, "learning_rate": 0.0002, "epoch": 1.7229336437718277, "step": 2220}, {"loss": 1.7805, "grad_norm": 0.34712135791778564, "learning_rate": 0.0002, "epoch": 1.73069460613116, "step": 2230}, {"loss": 1.7353, "grad_norm": 0.34801796078681946, "learning_rate": 0.0002, "epoch": 1.738455568490493, "step": 2240}, {"loss": 1.7009, "grad_norm": 0.3822397291660309, "learning_rate": 0.0002, "epoch": 1.7462165308498254, "step": 2250}, {"loss": 1.6546, "grad_norm": 0.38933250308036804, "learning_rate": 0.0002, "epoch": 1.7539774932091579, "step": 2260}, {"loss": 1.7245, "grad_norm": 0.3798373341560364, "learning_rate": 0.0002, "epoch": 1.7617384555684905, "step": 2270}, {"loss": 1.6508, "grad_norm": 0.35151317715644836, "learning_rate": 0.0002, "epoch": 1.7694994179278232, "step": 2280}, {"loss": 1.6894, "grad_norm": 0.44981494545936584, "learning_rate": 0.0002, "epoch": 1.7772603802871556, "step": 2290}, {"loss": 1.7271, "grad_norm": 0.3992624580860138, "learning_rate": 0.0002, "epoch": 1.785021342646488, "step": 2300}, {"loss": 1.7252, "grad_norm": 0.3772512376308441, "learning_rate": 0.0002, "epoch": 1.7927823050058207, "step": 2310}, {"loss": 1.7057, "grad_norm": 0.3511589467525482, "learning_rate": 0.0002, "epoch": 1.8005432673651534, "step": 2320}, {"loss": 1.764, "grad_norm": 0.3805285394191742, "learning_rate": 0.0002, "epoch": 1.8083042297244858, "step": 2330}, {"loss": 1.6986, "grad_norm": 0.3792071044445038, "learning_rate": 0.0002, "epoch": 1.8160651920838184, "step": 2340}, {"loss": 1.7759, "grad_norm": 0.36430829763412476, "learning_rate": 0.0002, "epoch": 1.823826154443151, "step": 2350}, {"loss": 1.6773, "grad_norm": 0.36502477526664734, "learning_rate": 0.0002, "epoch": 1.8315871168024835, "step": 2360}, {"loss": 1.8072, "grad_norm": 0.35015153884887695, "learning_rate": 0.0002, "epoch": 1.839348079161816, "step": 2370}, {"loss": 1.7734, "grad_norm": 0.3710903823375702, "learning_rate": 0.0002, "epoch": 1.8471090415211486, "step": 2380}, {"loss": 1.6737, "grad_norm": 0.3542828857898712, "learning_rate": 0.0002, "epoch": 1.8548700038804813, "step": 2390}, {"loss": 1.6783, "grad_norm": 0.35467568039894104, "learning_rate": 0.0002, "epoch": 1.8626309662398137, "step": 2400}, {"loss": 1.7773, "grad_norm": 0.3638560473918915, "learning_rate": 0.0002, "epoch": 1.8703919285991462, "step": 2410}, {"loss": 1.7019, "grad_norm": 0.3823298215866089, "learning_rate": 0.0002, "epoch": 1.8781528909584788, "step": 2420}, {"loss": 1.6935, "grad_norm": 0.3926416337490082, "learning_rate": 0.0002, "epoch": 1.8859138533178115, "step": 2430}, {"loss": 1.71, "grad_norm": 0.3608079254627228, "learning_rate": 0.0002, "epoch": 1.893674815677144, "step": 2440}, {"loss": 1.6654, "grad_norm": 0.3426613509654999, "learning_rate": 0.0002, "epoch": 1.9014357780364766, "step": 2450}, {"loss": 1.6892, "grad_norm": 0.3522338569164276, "learning_rate": 0.0002, "epoch": 1.9091967403958092, "step": 2460}, {"loss": 1.7307, "grad_norm": 0.3608049154281616, "learning_rate": 0.0002, "epoch": 1.9169577027551417, "step": 2470}, {"loss": 1.6823, "grad_norm": 0.3849755525588989, "learning_rate": 0.0002, "epoch": 1.924718665114474, "step": 2480}, {"loss": 1.7518, "grad_norm": 0.4154011011123657, "learning_rate": 0.0002, "epoch": 1.9324796274738067, "step": 2490}, {"loss": 1.7381, "grad_norm": 0.3602796792984009, "learning_rate": 0.0002, "epoch": 1.9402405898331394, "step": 2500}, {"loss": 1.7843, "grad_norm": 0.3702992796897888, "learning_rate": 0.0002, "epoch": 1.9480015521924718, "step": 2510}, {"loss": 1.6669, "grad_norm": 0.3657735288143158, "learning_rate": 0.0002, "epoch": 1.9557625145518043, "step": 2520}, {"loss": 1.5964, "grad_norm": 0.41031739115715027, "learning_rate": 0.0002, "epoch": 1.963523476911137, "step": 2530}, {"loss": 1.6745, "grad_norm": 0.34578680992126465, "learning_rate": 0.0002, "epoch": 1.9712844392704696, "step": 2540}, {"loss": 1.723, "grad_norm": 0.3361521065235138, "learning_rate": 0.0002, "epoch": 1.979045401629802, "step": 2550}, {"loss": 1.6868, "grad_norm": 0.34342363476753235, "learning_rate": 0.0002, "epoch": 1.9868063639891347, "step": 2560}, {"loss": 1.6577, "grad_norm": 0.32954007387161255, "learning_rate": 0.0002, "epoch": 1.9945673263484673, "step": 2570}, {"eval_loss": 1.8068748712539673, "eval_runtime": 105.5885, "eval_samples_per_second": 4.802, "eval_steps_per_second": 0.606, "epoch": 2.0, "step": 2577}, {"loss": 1.634, "grad_norm": 0.336302250623703, "learning_rate": 0.0002, "epoch": 2.0023282887077998, "step": 2580}, {"loss": 1.612, "grad_norm": 0.3627048432826996, "learning_rate": 0.0002, "epoch": 2.010089251067132, "step": 2590}, {"loss": 1.4908, "grad_norm": 0.38406702876091003, "learning_rate": 0.0002, "epoch": 2.017850213426465, "step": 2600}, {"loss": 1.5368, "grad_norm": 0.5326781272888184, "learning_rate": 0.0002, "epoch": 2.0256111757857975, "step": 2610}, {"loss": 1.5727, "grad_norm": 0.4774554967880249, "learning_rate": 0.0002, "epoch": 2.03337213814513, "step": 2620}, {"loss": 1.5422, "grad_norm": 0.4251810312271118, "learning_rate": 0.0002, "epoch": 2.0411331005044624, "step": 2630}, {"loss": 1.5152, "grad_norm": 0.4693007171154022, "learning_rate": 0.0002, "epoch": 2.0488940628637953, "step": 2640}, {"loss": 1.6137, "grad_norm": 0.46371519565582275, "learning_rate": 0.0002, "epoch": 2.0566550252231277, "step": 2650}, {"loss": 1.6304, "grad_norm": 0.46652570366859436, "learning_rate": 0.0002, "epoch": 2.06441598758246, "step": 2660}, {"loss": 1.6022, "grad_norm": 0.45200315117836, "learning_rate": 0.0002, "epoch": 2.0721769499417926, "step": 2670}, {"loss": 1.5358, "grad_norm": 0.42905205488204956, "learning_rate": 0.0002, "epoch": 2.0799379123011255, "step": 2680}, {"loss": 1.5401, "grad_norm": 0.44509148597717285, "learning_rate": 0.0002, "epoch": 2.087698874660458, "step": 2690}, {"loss": 1.5303, "grad_norm": 0.4445319175720215, "learning_rate": 0.0002, "epoch": 2.0954598370197903, "step": 2700}, {"loss": 1.5701, "grad_norm": 0.46825504302978516, "learning_rate": 0.0002, "epoch": 2.103220799379123, "step": 2710}, {"loss": 1.5751, "grad_norm": 0.4623856842517853, "learning_rate": 0.0002, "epoch": 2.1109817617384556, "step": 2720}, {"loss": 1.5601, "grad_norm": 0.4833452105522156, "learning_rate": 0.0002, "epoch": 2.118742724097788, "step": 2730}, {"loss": 1.5997, "grad_norm": 0.4582686722278595, "learning_rate": 0.0002, "epoch": 2.1265036864571205, "step": 2740}, {"loss": 1.5801, "grad_norm": 0.47587934136390686, "learning_rate": 0.0002, "epoch": 2.1342646488164534, "step": 2750}, {"loss": 1.594, "grad_norm": 0.4602217972278595, "learning_rate": 0.0002, "epoch": 2.142025611175786, "step": 2760}, {"loss": 1.5271, "grad_norm": 0.47501352429389954, "learning_rate": 0.0002, "epoch": 2.1497865735351183, "step": 2770}, {"loss": 1.4862, "grad_norm": 0.5078499913215637, "learning_rate": 0.0002, "epoch": 2.1575475358944507, "step": 2780}, {"loss": 1.6236, "grad_norm": 0.497704416513443, "learning_rate": 0.0002, "epoch": 2.1653084982537836, "step": 2790}, {"loss": 1.5597, "grad_norm": 0.5435971617698669, "learning_rate": 0.0002, "epoch": 2.173069460613116, "step": 2800}, {"loss": 1.5926, "grad_norm": 0.5172356367111206, "learning_rate": 0.0002, "epoch": 2.1808304229724484, "step": 2810}, {"loss": 1.5202, "grad_norm": 0.44063422083854675, "learning_rate": 0.0002, "epoch": 2.1885913853317813, "step": 2820}, {"loss": 1.6041, "grad_norm": 0.5079569220542908, "learning_rate": 0.0002, "epoch": 2.1963523476911138, "step": 2830}, {"loss": 1.5915, "grad_norm": 0.45658132433891296, "learning_rate": 0.0002, "epoch": 2.204113310050446, "step": 2840}, {"loss": 1.5546, "grad_norm": 0.5103023648262024, "learning_rate": 0.0002, "epoch": 2.2118742724097786, "step": 2850}, {"loss": 1.6197, "grad_norm": 0.4882226288318634, "learning_rate": 0.0002, "epoch": 2.2196352347691115, "step": 2860}, {"loss": 1.5996, "grad_norm": 0.5087296962738037, "learning_rate": 0.0002, "epoch": 2.227396197128444, "step": 2870}, {"loss": 1.5451, "grad_norm": 0.45293712615966797, "learning_rate": 0.0002, "epoch": 2.2351571594877764, "step": 2880}, {"loss": 1.6214, "grad_norm": 0.5120379328727722, "learning_rate": 0.0002, "epoch": 2.242918121847109, "step": 2890}, {"loss": 1.5273, "grad_norm": 0.47126415371894836, "learning_rate": 0.0002, "epoch": 2.2506790842064417, "step": 2900}, {"loss": 1.612, "grad_norm": 0.44005846977233887, "learning_rate": 0.0002, "epoch": 2.258440046565774, "step": 2910}, {"loss": 1.6023, "grad_norm": 0.46476176381111145, "learning_rate": 0.0002, "epoch": 2.2662010089251066, "step": 2920}, {"loss": 1.6417, "grad_norm": 0.48051515221595764, "learning_rate": 0.0002, "epoch": 2.2739619712844394, "step": 2930}, {"loss": 1.587, "grad_norm": 0.480069637298584, "learning_rate": 0.0002, "epoch": 2.281722933643772, "step": 2940}, {"loss": 1.5747, "grad_norm": 0.5122102499008179, "learning_rate": 0.0002, "epoch": 2.2894838960031043, "step": 2950}, {"loss": 1.5183, "grad_norm": 0.48879891633987427, "learning_rate": 0.0002, "epoch": 2.2972448583624367, "step": 2960}, {"loss": 1.5483, "grad_norm": 0.4973136782646179, "learning_rate": 0.0002, "epoch": 2.3050058207217696, "step": 2970}, {"loss": 1.677, "grad_norm": 0.5522695183753967, "learning_rate": 0.0002, "epoch": 2.312766783081102, "step": 2980}, {"loss": 1.5946, "grad_norm": 0.5220217704772949, "learning_rate": 0.0002, "epoch": 2.3205277454404345, "step": 2990}, {"loss": 1.6299, "grad_norm": 0.4978662431240082, "learning_rate": 0.0002, "epoch": 2.328288707799767, "step": 3000}, {"loss": 1.5498, "grad_norm": 0.554053544998169, "learning_rate": 0.0002, "epoch": 2.3360496701591, "step": 3010}, {"loss": 1.5356, "grad_norm": 0.4703886806964874, "learning_rate": 0.0002, "epoch": 2.3438106325184322, "step": 3020}, {"loss": 1.5418, "grad_norm": 0.5074123740196228, "learning_rate": 0.0002, "epoch": 2.3515715948777647, "step": 3030}, {"loss": 1.6873, "grad_norm": 0.5088278651237488, "learning_rate": 0.0002, "epoch": 2.3593325572370976, "step": 3040}, {"loss": 1.5249, "grad_norm": 0.4752114415168762, "learning_rate": 0.0002, "epoch": 2.36709351959643, "step": 3050}, {"loss": 1.5353, "grad_norm": 0.5121659636497498, "learning_rate": 0.0002, "epoch": 2.3748544819557624, "step": 3060}, {"loss": 1.6426, "grad_norm": 0.48649218678474426, "learning_rate": 0.0002, "epoch": 2.3826154443150953, "step": 3070}, {"loss": 1.6136, "grad_norm": 0.5209488868713379, "learning_rate": 0.0002, "epoch": 2.3903764066744277, "step": 3080}, {"loss": 1.597, "grad_norm": 0.5110517740249634, "learning_rate": 0.0002, "epoch": 2.39813736903376, "step": 3090}, {"loss": 1.5773, "grad_norm": 0.5609337091445923, "learning_rate": 0.0002, "epoch": 2.4058983313930926, "step": 3100}, {"loss": 1.5438, "grad_norm": 0.5191826224327087, "learning_rate": 0.0002, "epoch": 2.4136592937524255, "step": 3110}, {"loss": 1.6347, "grad_norm": 0.4876069724559784, "learning_rate": 0.0002, "epoch": 2.421420256111758, "step": 3120}, {"loss": 1.5565, "grad_norm": 0.4713933765888214, "learning_rate": 0.0002, "epoch": 2.4291812184710904, "step": 3130}, {"loss": 1.6388, "grad_norm": 0.5102227330207825, "learning_rate": 0.0002, "epoch": 2.436942180830423, "step": 3140}, {"loss": 1.5667, "grad_norm": 0.44546666741371155, "learning_rate": 0.0002, "epoch": 2.4447031431897557, "step": 3150}, {"loss": 1.5973, "grad_norm": 0.5167558193206787, "learning_rate": 0.0002, "epoch": 2.452464105549088, "step": 3160}, {"loss": 1.5673, "grad_norm": 0.5226958990097046, "learning_rate": 0.0002, "epoch": 2.4602250679084205, "step": 3170}, {"loss": 1.5758, "grad_norm": 0.4751799702644348, "learning_rate": 0.0002, "epoch": 2.4679860302677534, "step": 3180}, {"loss": 1.6234, "grad_norm": 0.4744729697704315, "learning_rate": 0.0002, "epoch": 2.475746992627086, "step": 3190}, {"loss": 1.5661, "grad_norm": 0.5203230381011963, "learning_rate": 0.0002, "epoch": 2.4835079549864183, "step": 3200}, {"loss": 1.493, "grad_norm": 0.47209781408309937, "learning_rate": 0.0002, "epoch": 2.4912689173457507, "step": 3210}, {"loss": 1.6415, "grad_norm": 0.5241674780845642, "learning_rate": 0.0002, "epoch": 2.4990298797050836, "step": 3220}, {"loss": 1.6324, "grad_norm": 0.5152244567871094, "learning_rate": 0.0002, "epoch": 2.506790842064416, "step": 3230}, {"loss": 1.6248, "grad_norm": 0.5216741561889648, "learning_rate": 0.0002, "epoch": 2.5145518044237485, "step": 3240}, {"loss": 1.5668, "grad_norm": 0.4953259527683258, "learning_rate": 0.0002, "epoch": 2.522312766783081, "step": 3250}, {"loss": 1.666, "grad_norm": 0.5973829030990601, "learning_rate": 0.0002, "epoch": 2.530073729142414, "step": 3260}, {"loss": 1.5295, "grad_norm": 0.48804202675819397, "learning_rate": 0.0002, "epoch": 2.5378346915017462, "step": 3270}, {"loss": 1.4954, "grad_norm": 0.5334644317626953, "learning_rate": 0.0002, "epoch": 2.5455956538610787, "step": 3280}, {"loss": 1.5814, "grad_norm": 0.46873313188552856, "learning_rate": 0.0002, "epoch": 2.5533566162204115, "step": 3290}, {"loss": 1.5362, "grad_norm": 0.4282589554786682, "learning_rate": 0.0002, "epoch": 2.561117578579744, "step": 3300}, {"loss": 1.6278, "grad_norm": 0.4848293960094452, "learning_rate": 0.0002, "epoch": 2.5688785409390764, "step": 3310}, {"loss": 1.6308, "grad_norm": 0.5093745589256287, "learning_rate": 0.0002, "epoch": 2.576639503298409, "step": 3320}, {"loss": 1.6375, "grad_norm": 0.5084842443466187, "learning_rate": 0.0002, "epoch": 2.5844004656577413, "step": 3330}, {"loss": 1.6168, "grad_norm": 0.4696281850337982, "learning_rate": 0.0002, "epoch": 2.592161428017074, "step": 3340}, {"loss": 1.5359, "grad_norm": 0.5767765641212463, "learning_rate": 0.0002, "epoch": 2.5999223903764066, "step": 3350}, {"loss": 1.6097, "grad_norm": 0.47300875186920166, "learning_rate": 0.0002, "epoch": 2.607683352735739, "step": 3360}, {"loss": 1.6138, "grad_norm": 0.4809158146381378, "learning_rate": 0.0002, "epoch": 2.615444315095072, "step": 3370}, {"loss": 1.4952, "grad_norm": 0.5141063928604126, "learning_rate": 0.0002, "epoch": 2.6232052774544043, "step": 3380}, {"loss": 1.5784, "grad_norm": 0.4832935035228729, "learning_rate": 0.0002, "epoch": 2.630966239813737, "step": 3390}, {"loss": 1.5796, "grad_norm": 0.5044625401496887, "learning_rate": 0.0002, "epoch": 2.6387272021730697, "step": 3400}, {"loss": 1.6202, "grad_norm": 0.5287680625915527, "learning_rate": 0.0002, "epoch": 2.646488164532402, "step": 3410}, {"loss": 1.5423, "grad_norm": 0.5306379795074463, "learning_rate": 0.0002, "epoch": 2.6542491268917345, "step": 3420}, {"loss": 1.5264, "grad_norm": 0.5849291682243347, "learning_rate": 0.0002, "epoch": 2.662010089251067, "step": 3430}, {"loss": 1.5937, "grad_norm": 0.7951080799102783, "learning_rate": 0.0002, "epoch": 2.6697710516104, "step": 3440}, {"loss": 1.5791, "grad_norm": 0.48087653517723083, "learning_rate": 0.0002, "epoch": 2.6775320139697323, "step": 3450}, {"loss": 1.6769, "grad_norm": 0.5396431684494019, "learning_rate": 0.0002, "epoch": 2.6852929763290647, "step": 3460}, {"loss": 1.606, "grad_norm": 0.5481634736061096, "learning_rate": 0.0002, "epoch": 2.693053938688397, "step": 3470}, {"loss": 1.6436, "grad_norm": 0.5068731307983398, "learning_rate": 0.0002, "epoch": 2.70081490104773, "step": 3480}, {"loss": 1.5738, "grad_norm": 0.5759826898574829, "learning_rate": 0.0002, "epoch": 2.7085758634070625, "step": 3490}, {"loss": 1.596, "grad_norm": 0.7253932952880859, "learning_rate": 0.0002, "epoch": 2.716336825766395, "step": 3500}, {"loss": 1.5791, "grad_norm": 0.527745246887207, "learning_rate": 0.0002, "epoch": 2.724097788125728, "step": 3510}, {"loss": 1.5874, "grad_norm": 0.5279242396354675, "learning_rate": 0.0002, "epoch": 2.73185875048506, "step": 3520}, {"loss": 1.6768, "grad_norm": 0.5047839283943176, "learning_rate": 0.0002, "epoch": 2.7396197128443927, "step": 3530}, {"loss": 1.5517, "grad_norm": 0.5430883169174194, "learning_rate": 0.0002, "epoch": 2.7473806752037255, "step": 3540}, {"loss": 1.5624, "grad_norm": 0.4496723711490631, "learning_rate": 0.0002, "epoch": 2.755141637563058, "step": 3550}, {"loss": 1.5789, "grad_norm": 0.5063338875770569, "learning_rate": 0.0002, "epoch": 2.7629025999223904, "step": 3560}, {"loss": 1.52, "grad_norm": 0.4619026780128479, "learning_rate": 0.0002, "epoch": 2.770663562281723, "step": 3570}, {"loss": 1.5793, "grad_norm": 0.4753304123878479, "learning_rate": 0.0002, "epoch": 2.7784245246410553, "step": 3580}, {"loss": 1.5715, "grad_norm": 0.5422708988189697, "learning_rate": 0.0002, "epoch": 2.786185487000388, "step": 3590}, {"loss": 1.5926, "grad_norm": 0.4756578803062439, "learning_rate": 0.0002, "epoch": 2.7939464493597206, "step": 3600}, {"loss": 1.5358, "grad_norm": 0.5057567358016968, "learning_rate": 0.0002, "epoch": 2.801707411719053, "step": 3610}, {"loss": 1.6131, "grad_norm": 0.5410919785499573, "learning_rate": 0.0002, "epoch": 2.809468374078386, "step": 3620}, {"loss": 1.5573, "grad_norm": 0.4958136975765228, "learning_rate": 0.0002, "epoch": 2.8172293364377183, "step": 3630}, {"loss": 1.6324, "grad_norm": 0.454527348279953, "learning_rate": 0.0002, "epoch": 2.8249902987970508, "step": 3640}, {"loss": 1.5582, "grad_norm": 0.5092706084251404, "learning_rate": 0.0002, "epoch": 2.8327512611563836, "step": 3650}, {"loss": 1.5893, "grad_norm": 0.5314022302627563, "learning_rate": 0.0002, "epoch": 2.840512223515716, "step": 3660}, {"loss": 1.588, "grad_norm": 0.5028239488601685, "learning_rate": 0.0002, "epoch": 2.8482731858750485, "step": 3670}, {"loss": 1.5751, "grad_norm": 0.5127444863319397, "learning_rate": 0.0002, "epoch": 2.856034148234381, "step": 3680}, {"loss": 1.6018, "grad_norm": 0.5045645236968994, "learning_rate": 0.0002, "epoch": 2.8637951105937134, "step": 3690}, {"loss": 1.5788, "grad_norm": 0.5560781955718994, "learning_rate": 0.0002, "epoch": 2.8715560729530463, "step": 3700}, {"loss": 1.5988, "grad_norm": 0.5177600383758545, "learning_rate": 0.0002, "epoch": 2.8793170353123787, "step": 3710}, {"loss": 1.6009, "grad_norm": 0.45830899477005005, "learning_rate": 0.0002, "epoch": 2.887077997671711, "step": 3720}, {"loss": 1.6344, "grad_norm": 0.4828629195690155, "learning_rate": 0.0002, "epoch": 2.894838960031044, "step": 3730}, {"loss": 1.6758, "grad_norm": 0.48241183161735535, "learning_rate": 0.0002, "epoch": 2.9025999223903765, "step": 3740}, {"loss": 1.5649, "grad_norm": 0.4909592568874359, "learning_rate": 0.0002, "epoch": 2.910360884749709, "step": 3750}, {"loss": 1.4927, "grad_norm": 0.44677025079727173, "learning_rate": 0.0002, "epoch": 2.9181218471090418, "step": 3760}, {"loss": 1.5067, "grad_norm": 0.4928834140300751, "learning_rate": 0.0002, "epoch": 2.925882809468374, "step": 3770}, {"loss": 1.5843, "grad_norm": 0.5673553347587585, "learning_rate": 0.0002, "epoch": 2.9336437718277066, "step": 3780}, {"loss": 1.5566, "grad_norm": 0.548190712928772, "learning_rate": 0.0002, "epoch": 2.941404734187039, "step": 3790}, {"loss": 1.5892, "grad_norm": 0.48979803919792175, "learning_rate": 0.0002, "epoch": 2.9491656965463715, "step": 3800}, {"loss": 1.5589, "grad_norm": 0.533191978931427, "learning_rate": 0.0002, "epoch": 2.9569266589057044, "step": 3810}, {"loss": 1.584, "grad_norm": 0.5362946391105652, "learning_rate": 0.0002, "epoch": 2.964687621265037, "step": 3820}, {"loss": 1.6602, "grad_norm": 0.4724906384944916, "learning_rate": 0.0002, "epoch": 2.9724485836243693, "step": 3830}, {"loss": 1.5834, "grad_norm": 0.5468461513519287, "learning_rate": 0.0002, "epoch": 2.980209545983702, "step": 3840}, {"loss": 1.6316, "grad_norm": 0.4697108864784241, "learning_rate": 0.0002, "epoch": 2.9879705083430346, "step": 3850}, {"loss": 1.6312, "grad_norm": 0.4780906140804291, "learning_rate": 0.0002, "epoch": 2.995731470702367, "step": 3860}]} +{"epoch": 4.0, "step": 5154, "epoch_duration": 3651.1835963726044, "total_accumulated_duration": 14274.202889204025, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28774.0}, "peak_memory_reserved": {"GPU_0": 28774.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0855, "grad_norm": 1.0751162767410278, "learning_rate": 0.0002, "epoch": 0.007760962359332557, "step": 10}, {"loss": 2.4744, "grad_norm": 0.4697345793247223, "learning_rate": 0.0002, "epoch": 0.015521924718665115, "step": 20}, {"loss": 2.193, "grad_norm": 0.5370839238166809, "learning_rate": 0.0002, "epoch": 0.023282887077997673, "step": 30}, {"loss": 2.0599, "grad_norm": 0.46794816851615906, "learning_rate": 0.0002, "epoch": 0.03104384943733023, "step": 40}, {"loss": 1.9354, "grad_norm": 0.44624820351600647, "learning_rate": 0.0002, "epoch": 0.038804811796662786, "step": 50}, {"loss": 1.9319, "grad_norm": 0.3953201472759247, "learning_rate": 0.0002, "epoch": 0.046565774155995346, "step": 60}, {"loss": 1.9099, "grad_norm": 0.3935912549495697, "learning_rate": 0.0002, "epoch": 0.0543267365153279, "step": 70}, {"loss": 1.8795, "grad_norm": 0.4520699381828308, "learning_rate": 0.0002, "epoch": 0.06208769887466046, "step": 80}, {"loss": 1.8354, "grad_norm": 0.3801847994327545, "learning_rate": 0.0002, "epoch": 0.06984866123399301, "step": 90}, {"loss": 1.9053, "grad_norm": 0.4020165205001831, "learning_rate": 0.0002, "epoch": 0.07760962359332557, "step": 100}, {"loss": 1.8779, "grad_norm": 0.3860672116279602, "learning_rate": 0.0002, "epoch": 0.08537058595265813, "step": 110}, {"loss": 1.8731, "grad_norm": 0.3681113123893738, "learning_rate": 0.0002, "epoch": 0.09313154831199069, "step": 120}, {"loss": 1.8157, "grad_norm": 0.3594866991043091, "learning_rate": 0.0002, "epoch": 0.10089251067132324, "step": 130}, {"loss": 1.8266, "grad_norm": 0.3879193663597107, "learning_rate": 0.0002, "epoch": 0.1086534730306558, "step": 140}, {"loss": 1.8818, "grad_norm": 0.3270505666732788, "learning_rate": 0.0002, "epoch": 0.11641443538998836, "step": 150}, {"loss": 1.87, "grad_norm": 0.36824458837509155, "learning_rate": 0.0002, "epoch": 0.12417539774932092, "step": 160}, {"loss": 1.8305, "grad_norm": 0.383882075548172, "learning_rate": 0.0002, "epoch": 0.13193636010865348, "step": 170}, {"loss": 1.8584, "grad_norm": 0.3368665874004364, "learning_rate": 0.0002, "epoch": 0.13969732246798602, "step": 180}, {"loss": 1.7882, "grad_norm": 0.35961097478866577, "learning_rate": 0.0002, "epoch": 0.1474582848273186, "step": 190}, {"loss": 1.8467, "grad_norm": 0.3415963351726532, "learning_rate": 0.0002, "epoch": 0.15521924718665114, "step": 200}, {"loss": 1.8543, "grad_norm": 0.4100632071495056, "learning_rate": 0.0002, "epoch": 0.1629802095459837, "step": 210}, {"loss": 1.8226, "grad_norm": 0.3516307473182678, "learning_rate": 0.0002, "epoch": 0.17074117190531626, "step": 220}, {"loss": 1.7386, "grad_norm": 0.37919050455093384, "learning_rate": 0.0002, "epoch": 0.1785021342646488, "step": 230}, {"loss": 1.7937, "grad_norm": 0.33270683884620667, "learning_rate": 0.0002, "epoch": 0.18626309662398138, "step": 240}, {"loss": 1.7925, "grad_norm": 0.3348783254623413, "learning_rate": 0.0002, "epoch": 0.19402405898331393, "step": 250}, {"loss": 1.7774, "grad_norm": 0.3888475298881531, "learning_rate": 0.0002, "epoch": 0.20178502134264648, "step": 260}, {"loss": 1.8381, "grad_norm": 0.3554602861404419, "learning_rate": 0.0002, "epoch": 0.20954598370197905, "step": 270}, {"loss": 1.8359, "grad_norm": 0.33277708292007446, "learning_rate": 0.0002, "epoch": 0.2173069460613116, "step": 280}, {"loss": 1.7713, "grad_norm": 0.3281584680080414, "learning_rate": 0.0002, "epoch": 0.22506790842064417, "step": 290}, {"loss": 1.8181, "grad_norm": 0.3185969591140747, "learning_rate": 0.0002, "epoch": 0.23282887077997672, "step": 300}, {"loss": 1.8595, "grad_norm": 0.35335442423820496, "learning_rate": 0.0002, "epoch": 0.24058983313930926, "step": 310}, {"loss": 1.87, "grad_norm": 0.3119595944881439, "learning_rate": 0.0002, "epoch": 0.24835079549864184, "step": 320}, {"loss": 1.8357, "grad_norm": 0.36424458026885986, "learning_rate": 0.0002, "epoch": 0.2561117578579744, "step": 330}, {"loss": 1.8003, "grad_norm": 0.3618951141834259, "learning_rate": 0.0002, "epoch": 0.26387272021730696, "step": 340}, {"loss": 1.8221, "grad_norm": 0.312757670879364, "learning_rate": 0.0002, "epoch": 0.2716336825766395, "step": 350}, {"loss": 1.9031, "grad_norm": 0.326016366481781, "learning_rate": 0.0002, "epoch": 0.27939464493597205, "step": 360}, {"loss": 1.8214, "grad_norm": 0.34093883633613586, "learning_rate": 0.0002, "epoch": 0.2871556072953046, "step": 370}, {"loss": 1.7733, "grad_norm": 0.32325029373168945, "learning_rate": 0.0002, "epoch": 0.2949165696546372, "step": 380}, {"loss": 1.842, "grad_norm": 0.34105437994003296, "learning_rate": 0.0002, "epoch": 0.30267753201396974, "step": 390}, {"loss": 1.7926, "grad_norm": 0.32565295696258545, "learning_rate": 0.0002, "epoch": 0.3104384943733023, "step": 400}, {"loss": 1.8031, "grad_norm": 0.32742050290107727, "learning_rate": 0.0002, "epoch": 0.31819945673263483, "step": 410}, {"loss": 1.907, "grad_norm": 0.30233046412467957, "learning_rate": 0.0002, "epoch": 0.3259604190919674, "step": 420}, {"loss": 1.7623, "grad_norm": 0.32419222593307495, "learning_rate": 0.0002, "epoch": 0.3337213814513, "step": 430}, {"loss": 1.865, "grad_norm": 0.3653007745742798, "learning_rate": 0.0002, "epoch": 0.3414823438106325, "step": 440}, {"loss": 1.8044, "grad_norm": 0.31617099046707153, "learning_rate": 0.0002, "epoch": 0.3492433061699651, "step": 450}, {"loss": 1.7677, "grad_norm": 0.3305962085723877, "learning_rate": 0.0002, "epoch": 0.3570042685292976, "step": 460}, {"loss": 1.8155, "grad_norm": 0.3178933262825012, "learning_rate": 0.0002, "epoch": 0.36476523088863017, "step": 470}, {"loss": 1.7485, "grad_norm": 0.37163782119750977, "learning_rate": 0.0002, "epoch": 0.37252619324796277, "step": 480}, {"loss": 1.8804, "grad_norm": 0.469844788312912, "learning_rate": 0.0002, "epoch": 0.3802871556072953, "step": 490}, {"loss": 1.8343, "grad_norm": 0.3409338593482971, "learning_rate": 0.0002, "epoch": 0.38804811796662786, "step": 500}, {"loss": 1.8433, "grad_norm": 0.31943467259407043, "learning_rate": 0.0002, "epoch": 0.3958090803259604, "step": 510}, {"loss": 1.7873, "grad_norm": 0.32293614745140076, "learning_rate": 0.0002, "epoch": 0.40357004268529295, "step": 520}, {"loss": 1.8584, "grad_norm": 0.2994382977485657, "learning_rate": 0.0002, "epoch": 0.41133100504462555, "step": 530}, {"loss": 1.8153, "grad_norm": 0.3273141384124756, "learning_rate": 0.0002, "epoch": 0.4190919674039581, "step": 540}, {"loss": 1.8097, "grad_norm": 0.3020550012588501, "learning_rate": 0.0002, "epoch": 0.42685292976329064, "step": 550}, {"loss": 1.8551, "grad_norm": 0.30113112926483154, "learning_rate": 0.0002, "epoch": 0.4346138921226232, "step": 560}, {"loss": 1.8084, "grad_norm": 0.30274903774261475, "learning_rate": 0.0002, "epoch": 0.44237485448195574, "step": 570}, {"loss": 1.7673, "grad_norm": 0.3231128454208374, "learning_rate": 0.0002, "epoch": 0.45013581684128834, "step": 580}, {"loss": 1.7848, "grad_norm": 0.3255121409893036, "learning_rate": 0.0002, "epoch": 0.4578967792006209, "step": 590}, {"loss": 1.8227, "grad_norm": 0.30147507786750793, "learning_rate": 0.0002, "epoch": 0.46565774155995343, "step": 600}, {"loss": 1.7572, "grad_norm": 0.29781386256217957, "learning_rate": 0.0002, "epoch": 0.473418703919286, "step": 610}, {"loss": 1.8307, "grad_norm": 0.30914685130119324, "learning_rate": 0.0002, "epoch": 0.4811796662786185, "step": 620}, {"loss": 1.805, "grad_norm": 0.3110593855381012, "learning_rate": 0.0002, "epoch": 0.4889406286379511, "step": 630}, {"loss": 1.8228, "grad_norm": 0.3298132121562958, "learning_rate": 0.0002, "epoch": 0.49670159099728367, "step": 640}, {"loss": 1.7816, "grad_norm": 0.322122186422348, "learning_rate": 0.0002, "epoch": 0.5044625533566163, "step": 650}, {"loss": 1.8001, "grad_norm": 0.3504371643066406, "learning_rate": 0.0002, "epoch": 0.5122235157159488, "step": 660}, {"loss": 1.8682, "grad_norm": 0.3102182149887085, "learning_rate": 0.0002, "epoch": 0.5199844780752814, "step": 670}, {"loss": 1.7494, "grad_norm": 0.6113658547401428, "learning_rate": 0.0002, "epoch": 0.5277454404346139, "step": 680}, {"loss": 1.7096, "grad_norm": 0.31841862201690674, "learning_rate": 0.0002, "epoch": 0.5355064027939465, "step": 690}, {"loss": 1.7587, "grad_norm": 0.2830526530742645, "learning_rate": 0.0002, "epoch": 0.543267365153279, "step": 700}, {"loss": 1.7887, "grad_norm": 0.3048769533634186, "learning_rate": 0.0002, "epoch": 0.5510283275126115, "step": 710}, {"loss": 1.8416, "grad_norm": 0.2719033658504486, "learning_rate": 0.0002, "epoch": 0.5587892898719441, "step": 720}, {"loss": 1.786, "grad_norm": 0.3176722526550293, "learning_rate": 0.0002, "epoch": 0.5665502522312766, "step": 730}, {"loss": 1.7127, "grad_norm": 0.32491734623908997, "learning_rate": 0.0002, "epoch": 0.5743112145906092, "step": 740}, {"loss": 1.7892, "grad_norm": 0.32746851444244385, "learning_rate": 0.0002, "epoch": 0.5820721769499418, "step": 750}, {"loss": 1.7811, "grad_norm": 0.3055773973464966, "learning_rate": 0.0002, "epoch": 0.5898331393092744, "step": 760}, {"loss": 1.8597, "grad_norm": 0.30671584606170654, "learning_rate": 0.0002, "epoch": 0.5975941016686069, "step": 770}, {"loss": 1.7728, "grad_norm": 0.28770264983177185, "learning_rate": 0.0002, "epoch": 0.6053550640279395, "step": 780}, {"loss": 1.7025, "grad_norm": 0.2814285457134247, "learning_rate": 0.0002, "epoch": 0.613116026387272, "step": 790}, {"loss": 1.819, "grad_norm": 0.31554412841796875, "learning_rate": 0.0002, "epoch": 0.6208769887466046, "step": 800}, {"loss": 1.8335, "grad_norm": 0.2984226942062378, "learning_rate": 0.0002, "epoch": 0.6286379511059371, "step": 810}, {"loss": 1.7728, "grad_norm": 0.2859906554222107, "learning_rate": 0.0002, "epoch": 0.6363989134652697, "step": 820}, {"loss": 1.7408, "grad_norm": 0.2887928783893585, "learning_rate": 0.0002, "epoch": 0.6441598758246022, "step": 830}, {"loss": 1.8071, "grad_norm": 0.31287339329719543, "learning_rate": 0.0002, "epoch": 0.6519208381839348, "step": 840}, {"loss": 1.7673, "grad_norm": 0.32064181566238403, "learning_rate": 0.0002, "epoch": 0.6596818005432674, "step": 850}, {"loss": 1.7849, "grad_norm": 0.290981650352478, "learning_rate": 0.0002, "epoch": 0.6674427629026, "step": 860}, {"loss": 1.8089, "grad_norm": 0.33060121536254883, "learning_rate": 0.0002, "epoch": 0.6752037252619325, "step": 870}, {"loss": 1.7357, "grad_norm": 0.27032899856567383, "learning_rate": 0.0002, "epoch": 0.682964687621265, "step": 880}, {"loss": 1.8423, "grad_norm": 0.29031234979629517, "learning_rate": 0.0002, "epoch": 0.6907256499805976, "step": 890}, {"loss": 1.835, "grad_norm": 0.2845142185688019, "learning_rate": 0.0002, "epoch": 0.6984866123399301, "step": 900}, {"loss": 1.77, "grad_norm": 0.8638312816619873, "learning_rate": 0.0002, "epoch": 0.7062475746992627, "step": 910}, {"loss": 1.7757, "grad_norm": 0.3086668848991394, "learning_rate": 0.0002, "epoch": 0.7140085370585952, "step": 920}, {"loss": 1.7541, "grad_norm": 0.2724177837371826, "learning_rate": 0.0002, "epoch": 0.7217694994179278, "step": 930}, {"loss": 1.816, "grad_norm": 0.289559006690979, "learning_rate": 0.0002, "epoch": 0.7295304617772603, "step": 940}, {"loss": 1.7654, "grad_norm": 0.3000658452510834, "learning_rate": 0.0002, "epoch": 0.737291424136593, "step": 950}, {"loss": 1.7736, "grad_norm": 0.33544042706489563, "learning_rate": 0.0002, "epoch": 0.7450523864959255, "step": 960}, {"loss": 1.6979, "grad_norm": 0.28593236207962036, "learning_rate": 0.0002, "epoch": 0.7528133488552581, "step": 970}, {"loss": 1.8583, "grad_norm": 0.313634991645813, "learning_rate": 0.0002, "epoch": 0.7605743112145906, "step": 980}, {"loss": 1.7473, "grad_norm": 0.2949385941028595, "learning_rate": 0.0002, "epoch": 0.7683352735739232, "step": 990}, {"loss": 1.8689, "grad_norm": 0.2920108437538147, "learning_rate": 0.0002, "epoch": 0.7760962359332557, "step": 1000}, {"loss": 1.8401, "grad_norm": 0.3245100677013397, "learning_rate": 0.0002, "epoch": 0.7838571982925883, "step": 1010}, {"loss": 1.7109, "grad_norm": 0.3007619380950928, "learning_rate": 0.0002, "epoch": 0.7916181606519208, "step": 1020}, {"loss": 1.7427, "grad_norm": 0.3630852997303009, "learning_rate": 0.0002, "epoch": 0.7993791230112534, "step": 1030}, {"loss": 1.7655, "grad_norm": 0.2856379747390747, "learning_rate": 0.0002, "epoch": 0.8071400853705859, "step": 1040}, {"loss": 1.8371, "grad_norm": 0.32476478815078735, "learning_rate": 0.0002, "epoch": 0.8149010477299186, "step": 1050}, {"loss": 1.8039, "grad_norm": 0.5162565112113953, "learning_rate": 0.0002, "epoch": 0.8226620100892511, "step": 1060}, {"loss": 1.8862, "grad_norm": 0.316496342420578, "learning_rate": 0.0002, "epoch": 0.8304229724485837, "step": 1070}, {"loss": 1.8023, "grad_norm": 0.31977516412734985, "learning_rate": 0.0002, "epoch": 0.8381839348079162, "step": 1080}, {"loss": 1.8547, "grad_norm": 0.269509494304657, "learning_rate": 0.0002, "epoch": 0.8459448971672487, "step": 1090}, {"loss": 1.7811, "grad_norm": 0.31621453166007996, "learning_rate": 0.0002, "epoch": 0.8537058595265813, "step": 1100}, {"loss": 1.739, "grad_norm": 0.2946535050868988, "learning_rate": 0.0002, "epoch": 0.8614668218859138, "step": 1110}, {"loss": 1.7511, "grad_norm": 0.3088909983634949, "learning_rate": 0.0002, "epoch": 0.8692277842452464, "step": 1120}, {"loss": 1.8228, "grad_norm": 0.33033716678619385, "learning_rate": 0.0002, "epoch": 0.8769887466045789, "step": 1130}, {"loss": 1.7912, "grad_norm": 0.2954833507537842, "learning_rate": 0.0002, "epoch": 0.8847497089639115, "step": 1140}, {"loss": 1.8394, "grad_norm": 0.2950248122215271, "learning_rate": 0.0002, "epoch": 0.8925106713232441, "step": 1150}, {"loss": 1.7068, "grad_norm": 0.296661913394928, "learning_rate": 0.0002, "epoch": 0.9002716336825767, "step": 1160}, {"loss": 1.7967, "grad_norm": 0.35451310873031616, "learning_rate": 0.0002, "epoch": 0.9080325960419092, "step": 1170}, {"loss": 1.8202, "grad_norm": 0.32705947756767273, "learning_rate": 0.0002, "epoch": 0.9157935584012418, "step": 1180}, {"loss": 1.7396, "grad_norm": 0.3333960771560669, "learning_rate": 0.0002, "epoch": 0.9235545207605743, "step": 1190}, {"loss": 1.7801, "grad_norm": 0.3042232096195221, "learning_rate": 0.0002, "epoch": 0.9313154831199069, "step": 1200}, {"loss": 1.7586, "grad_norm": 0.281553715467453, "learning_rate": 0.0002, "epoch": 0.9390764454792394, "step": 1210}, {"loss": 1.7953, "grad_norm": 0.3096391558647156, "learning_rate": 0.0002, "epoch": 0.946837407838572, "step": 1220}, {"loss": 1.7401, "grad_norm": 0.2866271734237671, "learning_rate": 0.0002, "epoch": 0.9545983701979045, "step": 1230}, {"loss": 1.7211, "grad_norm": 0.28394097089767456, "learning_rate": 0.0002, "epoch": 0.962359332557237, "step": 1240}, {"loss": 1.7363, "grad_norm": 0.3249266743659973, "learning_rate": 0.0002, "epoch": 0.9701202949165697, "step": 1250}, {"loss": 1.7563, "grad_norm": 0.2896869480609894, "learning_rate": 0.0002, "epoch": 0.9778812572759022, "step": 1260}, {"loss": 1.6389, "grad_norm": 0.29224586486816406, "learning_rate": 0.0002, "epoch": 0.9856422196352348, "step": 1270}, {"loss": 1.7111, "grad_norm": 0.2820223569869995, "learning_rate": 0.0002, "epoch": 0.9934031819945673, "step": 1280}, {"eval_loss": 1.8081045150756836, "eval_runtime": 102.3056, "eval_samples_per_second": 4.956, "eval_steps_per_second": 0.626, "epoch": 0.9996119518820333, "step": 1288}, {"loss": 1.7518, "grad_norm": 0.3282551169395447, "learning_rate": 0.0002, "epoch": 1.0011641443538999, "step": 1290}, {"loss": 1.6806, "grad_norm": 0.30217495560646057, "learning_rate": 0.0002, "epoch": 1.0089251067132325, "step": 1300}, {"loss": 1.6777, "grad_norm": 0.30801767110824585, "learning_rate": 0.0002, "epoch": 1.016686069072565, "step": 1310}, {"loss": 1.7756, "grad_norm": 0.31816792488098145, "learning_rate": 0.0002, "epoch": 1.0244470314318976, "step": 1320}, {"loss": 1.6986, "grad_norm": 0.27794334292411804, "learning_rate": 0.0002, "epoch": 1.03220799379123, "step": 1330}, {"loss": 1.6931, "grad_norm": 0.3018926680088043, "learning_rate": 0.0002, "epoch": 1.0399689561505627, "step": 1340}, {"loss": 1.7033, "grad_norm": 0.3552975356578827, "learning_rate": 0.0002, "epoch": 1.0477299185098952, "step": 1350}, {"loss": 1.6782, "grad_norm": 0.32590144872665405, "learning_rate": 0.0002, "epoch": 1.0554908808692278, "step": 1360}, {"loss": 1.6479, "grad_norm": 0.3435460925102234, "learning_rate": 0.0002, "epoch": 1.0632518432285603, "step": 1370}, {"loss": 1.7451, "grad_norm": 0.35037797689437866, "learning_rate": 0.0002, "epoch": 1.071012805587893, "step": 1380}, {"loss": 1.7868, "grad_norm": 0.31398263573646545, "learning_rate": 0.0002, "epoch": 1.0787737679472253, "step": 1390}, {"loss": 1.6729, "grad_norm": 0.3134010434150696, "learning_rate": 0.0002, "epoch": 1.086534730306558, "step": 1400}, {"loss": 1.751, "grad_norm": 0.4599704444408417, "learning_rate": 0.0002, "epoch": 1.0942956926658907, "step": 1410}, {"loss": 1.6871, "grad_norm": 0.35852891206741333, "learning_rate": 0.0002, "epoch": 1.102056655025223, "step": 1420}, {"loss": 1.7083, "grad_norm": 0.35628634691238403, "learning_rate": 0.0002, "epoch": 1.1098176173845558, "step": 1430}, {"loss": 1.6166, "grad_norm": 0.3769161105155945, "learning_rate": 0.0002, "epoch": 1.1175785797438882, "step": 1440}, {"loss": 1.7344, "grad_norm": 1.3712416887283325, "learning_rate": 0.0002, "epoch": 1.1253395421032208, "step": 1450}, {"loss": 1.6542, "grad_norm": 0.38406670093536377, "learning_rate": 0.0002, "epoch": 1.1331005044625533, "step": 1460}, {"loss": 1.7104, "grad_norm": 0.3402116000652313, "learning_rate": 0.0002, "epoch": 1.140861466821886, "step": 1470}, {"loss": 1.7074, "grad_norm": 0.341189444065094, "learning_rate": 0.0002, "epoch": 1.1486224291812184, "step": 1480}, {"loss": 1.6468, "grad_norm": 0.36629995703697205, "learning_rate": 0.0002, "epoch": 1.156383391540551, "step": 1490}, {"loss": 1.6952, "grad_norm": 0.3499569296836853, "learning_rate": 0.0002, "epoch": 1.1641443538998835, "step": 1500}, {"loss": 1.6625, "grad_norm": 0.3663063943386078, "learning_rate": 0.0002, "epoch": 1.1719053162592161, "step": 1510}, {"loss": 1.7533, "grad_norm": 0.34851500391960144, "learning_rate": 0.0002, "epoch": 1.1796662786185488, "step": 1520}, {"loss": 1.6092, "grad_norm": 0.35071656107902527, "learning_rate": 0.0002, "epoch": 1.1874272409778812, "step": 1530}, {"loss": 1.7206, "grad_norm": 0.42783796787261963, "learning_rate": 0.0002, "epoch": 1.1951882033372139, "step": 1540}, {"loss": 1.7499, "grad_norm": 0.31830692291259766, "learning_rate": 0.0002, "epoch": 1.2029491656965463, "step": 1550}, {"loss": 1.7372, "grad_norm": 0.3597424626350403, "learning_rate": 0.0002, "epoch": 1.210710128055879, "step": 1560}, {"loss": 1.6386, "grad_norm": 0.35233765840530396, "learning_rate": 0.0002, "epoch": 1.2184710904152114, "step": 1570}, {"loss": 1.6766, "grad_norm": 0.35942912101745605, "learning_rate": 0.0002, "epoch": 1.226232052774544, "step": 1580}, {"loss": 1.6598, "grad_norm": 0.36159393191337585, "learning_rate": 0.0002, "epoch": 1.2339930151338767, "step": 1590}, {"loss": 1.6697, "grad_norm": 0.3328469693660736, "learning_rate": 0.0002, "epoch": 1.2417539774932091, "step": 1600}, {"loss": 1.7594, "grad_norm": 0.3089476525783539, "learning_rate": 0.0002, "epoch": 1.2495149398525418, "step": 1610}, {"loss": 1.6805, "grad_norm": 0.30947765707969666, "learning_rate": 0.0002, "epoch": 1.2572759022118742, "step": 1620}, {"loss": 1.6899, "grad_norm": 0.32154011726379395, "learning_rate": 0.0002, "epoch": 1.265036864571207, "step": 1630}, {"loss": 1.6621, "grad_norm": 0.3480297923088074, "learning_rate": 0.0002, "epoch": 1.2727978269305393, "step": 1640}, {"loss": 1.7087, "grad_norm": 0.39471694827079773, "learning_rate": 0.0002, "epoch": 1.280558789289872, "step": 1650}, {"loss": 1.7608, "grad_norm": 0.35728853940963745, "learning_rate": 0.0002, "epoch": 1.2883197516492044, "step": 1660}, {"loss": 1.7008, "grad_norm": 0.35223081707954407, "learning_rate": 0.0002, "epoch": 1.296080714008537, "step": 1670}, {"loss": 1.7253, "grad_norm": 0.3588867485523224, "learning_rate": 0.0002, "epoch": 1.3038416763678695, "step": 1680}, {"loss": 1.6505, "grad_norm": 0.3528042733669281, "learning_rate": 0.0002, "epoch": 1.3116026387272022, "step": 1690}, {"loss": 1.6945, "grad_norm": 0.35975801944732666, "learning_rate": 0.0002, "epoch": 1.3193636010865348, "step": 1700}, {"loss": 1.6631, "grad_norm": 0.36691880226135254, "learning_rate": 0.0002, "epoch": 1.3271245634458673, "step": 1710}, {"loss": 1.7593, "grad_norm": 0.3787977695465088, "learning_rate": 0.0002, "epoch": 1.3348855258052, "step": 1720}, {"loss": 1.7697, "grad_norm": 0.36614933609962463, "learning_rate": 0.0002, "epoch": 1.3426464881645324, "step": 1730}, {"loss": 1.6487, "grad_norm": 0.3484745919704437, "learning_rate": 0.0002, "epoch": 1.350407450523865, "step": 1740}, {"loss": 1.7054, "grad_norm": 0.36905673146247864, "learning_rate": 0.0002, "epoch": 1.3581684128831975, "step": 1750}, {"loss": 1.7679, "grad_norm": 0.41564738750457764, "learning_rate": 0.0002, "epoch": 1.36592937524253, "step": 1760}, {"loss": 1.6634, "grad_norm": 0.3345205783843994, "learning_rate": 0.0002, "epoch": 1.3736903376018628, "step": 1770}, {"loss": 1.7275, "grad_norm": 0.34926071763038635, "learning_rate": 0.0002, "epoch": 1.3814512999611952, "step": 1780}, {"loss": 1.685, "grad_norm": 0.42004233598709106, "learning_rate": 0.0002, "epoch": 1.3892122623205276, "step": 1790}, {"loss": 1.666, "grad_norm": 0.3576236963272095, "learning_rate": 0.0002, "epoch": 1.3969732246798603, "step": 1800}, {"loss": 1.8516, "grad_norm": 0.3586704432964325, "learning_rate": 0.0002, "epoch": 1.404734187039193, "step": 1810}, {"loss": 1.6171, "grad_norm": 0.3943439722061157, "learning_rate": 0.0002, "epoch": 1.4124951493985254, "step": 1820}, {"loss": 1.6865, "grad_norm": 0.3484877049922943, "learning_rate": 0.0002, "epoch": 1.420256111757858, "step": 1830}, {"loss": 1.7205, "grad_norm": 0.3344518840312958, "learning_rate": 0.0002, "epoch": 1.4280170741171905, "step": 1840}, {"loss": 1.6999, "grad_norm": 0.4345698356628418, "learning_rate": 0.0002, "epoch": 1.4357780364765231, "step": 1850}, {"loss": 1.6855, "grad_norm": 0.5525162220001221, "learning_rate": 0.0002, "epoch": 1.4435389988358556, "step": 1860}, {"loss": 1.7143, "grad_norm": 0.37194496393203735, "learning_rate": 0.0002, "epoch": 1.4512999611951882, "step": 1870}, {"loss": 1.7623, "grad_norm": 0.34570157527923584, "learning_rate": 0.0002, "epoch": 1.4590609235545209, "step": 1880}, {"loss": 1.7, "grad_norm": 0.3512282073497772, "learning_rate": 0.0002, "epoch": 1.4668218859138533, "step": 1890}, {"loss": 1.7225, "grad_norm": 0.3443922996520996, "learning_rate": 0.0002, "epoch": 1.4745828482731858, "step": 1900}, {"loss": 1.7393, "grad_norm": 0.3812018036842346, "learning_rate": 0.0002, "epoch": 1.4823438106325184, "step": 1910}, {"loss": 1.7277, "grad_norm": 0.39263492822647095, "learning_rate": 0.0002, "epoch": 1.490104772991851, "step": 1920}, {"loss": 1.6829, "grad_norm": 0.3146156072616577, "learning_rate": 0.0002, "epoch": 1.4978657353511835, "step": 1930}, {"loss": 1.6881, "grad_norm": 0.3653988540172577, "learning_rate": 0.0002, "epoch": 1.505626697710516, "step": 1940}, {"loss": 1.7064, "grad_norm": 0.3966596722602844, "learning_rate": 0.0002, "epoch": 1.5133876600698488, "step": 1950}, {"loss": 1.6942, "grad_norm": 0.3441697359085083, "learning_rate": 0.0002, "epoch": 1.5211486224291813, "step": 1960}, {"loss": 1.7175, "grad_norm": 0.3328564465045929, "learning_rate": 0.0002, "epoch": 1.5289095847885137, "step": 1970}, {"loss": 1.7394, "grad_norm": 0.34068772196769714, "learning_rate": 0.0002, "epoch": 1.5366705471478463, "step": 1980}, {"loss": 1.7016, "grad_norm": 0.3559795916080475, "learning_rate": 0.0002, "epoch": 1.544431509507179, "step": 1990}, {"loss": 1.7102, "grad_norm": 0.37888768315315247, "learning_rate": 0.0002, "epoch": 1.5521924718665114, "step": 2000}, {"loss": 1.7094, "grad_norm": 0.36128363013267517, "learning_rate": 0.0002, "epoch": 1.5599534342258439, "step": 2010}, {"loss": 1.6407, "grad_norm": 0.3643714487552643, "learning_rate": 0.0002, "epoch": 1.5677143965851765, "step": 2020}, {"loss": 1.6777, "grad_norm": 0.3863612115383148, "learning_rate": 0.0002, "epoch": 1.5754753589445092, "step": 2030}, {"loss": 1.6575, "grad_norm": 0.32831457257270813, "learning_rate": 0.0002, "epoch": 1.5832363213038416, "step": 2040}, {"loss": 1.7404, "grad_norm": 0.36098113656044006, "learning_rate": 0.0002, "epoch": 1.5909972836631743, "step": 2050}, {"loss": 1.7065, "grad_norm": 1.1079334020614624, "learning_rate": 0.0002, "epoch": 1.598758246022507, "step": 2060}, {"loss": 1.6824, "grad_norm": 0.35615381598472595, "learning_rate": 0.0002, "epoch": 1.6065192083818394, "step": 2070}, {"loss": 1.7262, "grad_norm": 0.369711309671402, "learning_rate": 0.0002, "epoch": 1.6142801707411718, "step": 2080}, {"loss": 1.6995, "grad_norm": 0.390658438205719, "learning_rate": 0.0002, "epoch": 1.6220411331005045, "step": 2090}, {"loss": 1.6996, "grad_norm": 0.3422999382019043, "learning_rate": 0.0002, "epoch": 1.6298020954598371, "step": 2100}, {"loss": 1.7135, "grad_norm": 0.372475266456604, "learning_rate": 0.0002, "epoch": 1.6375630578191696, "step": 2110}, {"loss": 1.7216, "grad_norm": 0.35660576820373535, "learning_rate": 0.0002, "epoch": 1.645324020178502, "step": 2120}, {"loss": 1.6991, "grad_norm": 0.35754942893981934, "learning_rate": 0.0002, "epoch": 1.6530849825378346, "step": 2130}, {"loss": 1.6779, "grad_norm": 0.34572410583496094, "learning_rate": 0.0002, "epoch": 1.6608459448971673, "step": 2140}, {"loss": 1.6707, "grad_norm": 0.42059701681137085, "learning_rate": 0.0002, "epoch": 1.6686069072564997, "step": 2150}, {"loss": 1.6782, "grad_norm": 0.35200759768486023, "learning_rate": 0.0002, "epoch": 1.6763678696158324, "step": 2160}, {"loss": 1.6869, "grad_norm": 0.3704029321670532, "learning_rate": 0.0002, "epoch": 1.684128831975165, "step": 2170}, {"loss": 1.7192, "grad_norm": 0.40450501441955566, "learning_rate": 0.0002, "epoch": 1.6918897943344975, "step": 2180}, {"loss": 1.6228, "grad_norm": 0.362966924905777, "learning_rate": 0.0002, "epoch": 1.69965075669383, "step": 2190}, {"loss": 1.6935, "grad_norm": 0.36586204171180725, "learning_rate": 0.0002, "epoch": 1.7074117190531626, "step": 2200}, {"loss": 1.6088, "grad_norm": 0.3295372426509857, "learning_rate": 0.0002, "epoch": 1.7151726814124952, "step": 2210}, {"loss": 1.7844, "grad_norm": 0.3892575800418854, "learning_rate": 0.0002, "epoch": 1.7229336437718277, "step": 2220}, {"loss": 1.7805, "grad_norm": 0.34712135791778564, "learning_rate": 0.0002, "epoch": 1.73069460613116, "step": 2230}, {"loss": 1.7353, "grad_norm": 0.34801796078681946, "learning_rate": 0.0002, "epoch": 1.738455568490493, "step": 2240}, {"loss": 1.7009, "grad_norm": 0.3822397291660309, "learning_rate": 0.0002, "epoch": 1.7462165308498254, "step": 2250}, {"loss": 1.6546, "grad_norm": 0.38933250308036804, "learning_rate": 0.0002, "epoch": 1.7539774932091579, "step": 2260}, {"loss": 1.7245, "grad_norm": 0.3798373341560364, "learning_rate": 0.0002, "epoch": 1.7617384555684905, "step": 2270}, {"loss": 1.6508, "grad_norm": 0.35151317715644836, "learning_rate": 0.0002, "epoch": 1.7694994179278232, "step": 2280}, {"loss": 1.6894, "grad_norm": 0.44981494545936584, "learning_rate": 0.0002, "epoch": 1.7772603802871556, "step": 2290}, {"loss": 1.7271, "grad_norm": 0.3992624580860138, "learning_rate": 0.0002, "epoch": 1.785021342646488, "step": 2300}, {"loss": 1.7252, "grad_norm": 0.3772512376308441, "learning_rate": 0.0002, "epoch": 1.7927823050058207, "step": 2310}, {"loss": 1.7057, "grad_norm": 0.3511589467525482, "learning_rate": 0.0002, "epoch": 1.8005432673651534, "step": 2320}, {"loss": 1.764, "grad_norm": 0.3805285394191742, "learning_rate": 0.0002, "epoch": 1.8083042297244858, "step": 2330}, {"loss": 1.6986, "grad_norm": 0.3792071044445038, "learning_rate": 0.0002, "epoch": 1.8160651920838184, "step": 2340}, {"loss": 1.7759, "grad_norm": 0.36430829763412476, "learning_rate": 0.0002, "epoch": 1.823826154443151, "step": 2350}, {"loss": 1.6773, "grad_norm": 0.36502477526664734, "learning_rate": 0.0002, "epoch": 1.8315871168024835, "step": 2360}, {"loss": 1.8072, "grad_norm": 0.35015153884887695, "learning_rate": 0.0002, "epoch": 1.839348079161816, "step": 2370}, {"loss": 1.7734, "grad_norm": 0.3710903823375702, "learning_rate": 0.0002, "epoch": 1.8471090415211486, "step": 2380}, {"loss": 1.6737, "grad_norm": 0.3542828857898712, "learning_rate": 0.0002, "epoch": 1.8548700038804813, "step": 2390}, {"loss": 1.6783, "grad_norm": 0.35467568039894104, "learning_rate": 0.0002, "epoch": 1.8626309662398137, "step": 2400}, {"loss": 1.7773, "grad_norm": 0.3638560473918915, "learning_rate": 0.0002, "epoch": 1.8703919285991462, "step": 2410}, {"loss": 1.7019, "grad_norm": 0.3823298215866089, "learning_rate": 0.0002, "epoch": 1.8781528909584788, "step": 2420}, {"loss": 1.6935, "grad_norm": 0.3926416337490082, "learning_rate": 0.0002, "epoch": 1.8859138533178115, "step": 2430}, {"loss": 1.71, "grad_norm": 0.3608079254627228, "learning_rate": 0.0002, "epoch": 1.893674815677144, "step": 2440}, {"loss": 1.6654, "grad_norm": 0.3426613509654999, "learning_rate": 0.0002, "epoch": 1.9014357780364766, "step": 2450}, {"loss": 1.6892, "grad_norm": 0.3522338569164276, "learning_rate": 0.0002, "epoch": 1.9091967403958092, "step": 2460}, {"loss": 1.7307, "grad_norm": 0.3608049154281616, "learning_rate": 0.0002, "epoch": 1.9169577027551417, "step": 2470}, {"loss": 1.6823, "grad_norm": 0.3849755525588989, "learning_rate": 0.0002, "epoch": 1.924718665114474, "step": 2480}, {"loss": 1.7518, "grad_norm": 0.4154011011123657, "learning_rate": 0.0002, "epoch": 1.9324796274738067, "step": 2490}, {"loss": 1.7381, "grad_norm": 0.3602796792984009, "learning_rate": 0.0002, "epoch": 1.9402405898331394, "step": 2500}, {"loss": 1.7843, "grad_norm": 0.3702992796897888, "learning_rate": 0.0002, "epoch": 1.9480015521924718, "step": 2510}, {"loss": 1.6669, "grad_norm": 0.3657735288143158, "learning_rate": 0.0002, "epoch": 1.9557625145518043, "step": 2520}, {"loss": 1.5964, "grad_norm": 0.41031739115715027, "learning_rate": 0.0002, "epoch": 1.963523476911137, "step": 2530}, {"loss": 1.6745, "grad_norm": 0.34578680992126465, "learning_rate": 0.0002, "epoch": 1.9712844392704696, "step": 2540}, {"loss": 1.723, "grad_norm": 0.3361521065235138, "learning_rate": 0.0002, "epoch": 1.979045401629802, "step": 2550}, {"loss": 1.6868, "grad_norm": 0.34342363476753235, "learning_rate": 0.0002, "epoch": 1.9868063639891347, "step": 2560}, {"loss": 1.6577, "grad_norm": 0.32954007387161255, "learning_rate": 0.0002, "epoch": 1.9945673263484673, "step": 2570}, {"eval_loss": 1.8068748712539673, "eval_runtime": 105.5885, "eval_samples_per_second": 4.802, "eval_steps_per_second": 0.606, "epoch": 2.0, "step": 2577}, {"loss": 1.634, "grad_norm": 0.336302250623703, "learning_rate": 0.0002, "epoch": 2.0023282887077998, "step": 2580}, {"loss": 1.612, "grad_norm": 0.3627048432826996, "learning_rate": 0.0002, "epoch": 2.010089251067132, "step": 2590}, {"loss": 1.4908, "grad_norm": 0.38406702876091003, "learning_rate": 0.0002, "epoch": 2.017850213426465, "step": 2600}, {"loss": 1.5368, "grad_norm": 0.5326781272888184, "learning_rate": 0.0002, "epoch": 2.0256111757857975, "step": 2610}, {"loss": 1.5727, "grad_norm": 0.4774554967880249, "learning_rate": 0.0002, "epoch": 2.03337213814513, "step": 2620}, {"loss": 1.5422, "grad_norm": 0.4251810312271118, "learning_rate": 0.0002, "epoch": 2.0411331005044624, "step": 2630}, {"loss": 1.5152, "grad_norm": 0.4693007171154022, "learning_rate": 0.0002, "epoch": 2.0488940628637953, "step": 2640}, {"loss": 1.6137, "grad_norm": 0.46371519565582275, "learning_rate": 0.0002, "epoch": 2.0566550252231277, "step": 2650}, {"loss": 1.6304, "grad_norm": 0.46652570366859436, "learning_rate": 0.0002, "epoch": 2.06441598758246, "step": 2660}, {"loss": 1.6022, "grad_norm": 0.45200315117836, "learning_rate": 0.0002, "epoch": 2.0721769499417926, "step": 2670}, {"loss": 1.5358, "grad_norm": 0.42905205488204956, "learning_rate": 0.0002, "epoch": 2.0799379123011255, "step": 2680}, {"loss": 1.5401, "grad_norm": 0.44509148597717285, "learning_rate": 0.0002, "epoch": 2.087698874660458, "step": 2690}, {"loss": 1.5303, "grad_norm": 0.4445319175720215, "learning_rate": 0.0002, "epoch": 2.0954598370197903, "step": 2700}, {"loss": 1.5701, "grad_norm": 0.46825504302978516, "learning_rate": 0.0002, "epoch": 2.103220799379123, "step": 2710}, {"loss": 1.5751, "grad_norm": 0.4623856842517853, "learning_rate": 0.0002, "epoch": 2.1109817617384556, "step": 2720}, {"loss": 1.5601, "grad_norm": 0.4833452105522156, "learning_rate": 0.0002, "epoch": 2.118742724097788, "step": 2730}, {"loss": 1.5997, "grad_norm": 0.4582686722278595, "learning_rate": 0.0002, "epoch": 2.1265036864571205, "step": 2740}, {"loss": 1.5801, "grad_norm": 0.47587934136390686, "learning_rate": 0.0002, "epoch": 2.1342646488164534, "step": 2750}, {"loss": 1.594, "grad_norm": 0.4602217972278595, "learning_rate": 0.0002, "epoch": 2.142025611175786, "step": 2760}, {"loss": 1.5271, "grad_norm": 0.47501352429389954, "learning_rate": 0.0002, "epoch": 2.1497865735351183, "step": 2770}, {"loss": 1.4862, "grad_norm": 0.5078499913215637, "learning_rate": 0.0002, "epoch": 2.1575475358944507, "step": 2780}, {"loss": 1.6236, "grad_norm": 0.497704416513443, "learning_rate": 0.0002, "epoch": 2.1653084982537836, "step": 2790}, {"loss": 1.5597, "grad_norm": 0.5435971617698669, "learning_rate": 0.0002, "epoch": 2.173069460613116, "step": 2800}, {"loss": 1.5926, "grad_norm": 0.5172356367111206, "learning_rate": 0.0002, "epoch": 2.1808304229724484, "step": 2810}, {"loss": 1.5202, "grad_norm": 0.44063422083854675, "learning_rate": 0.0002, "epoch": 2.1885913853317813, "step": 2820}, {"loss": 1.6041, "grad_norm": 0.5079569220542908, "learning_rate": 0.0002, "epoch": 2.1963523476911138, "step": 2830}, {"loss": 1.5915, "grad_norm": 0.45658132433891296, "learning_rate": 0.0002, "epoch": 2.204113310050446, "step": 2840}, {"loss": 1.5546, "grad_norm": 0.5103023648262024, "learning_rate": 0.0002, "epoch": 2.2118742724097786, "step": 2850}, {"loss": 1.6197, "grad_norm": 0.4882226288318634, "learning_rate": 0.0002, "epoch": 2.2196352347691115, "step": 2860}, {"loss": 1.5996, "grad_norm": 0.5087296962738037, "learning_rate": 0.0002, "epoch": 2.227396197128444, "step": 2870}, {"loss": 1.5451, "grad_norm": 0.45293712615966797, "learning_rate": 0.0002, "epoch": 2.2351571594877764, "step": 2880}, {"loss": 1.6214, "grad_norm": 0.5120379328727722, "learning_rate": 0.0002, "epoch": 2.242918121847109, "step": 2890}, {"loss": 1.5273, "grad_norm": 0.47126415371894836, "learning_rate": 0.0002, "epoch": 2.2506790842064417, "step": 2900}, {"loss": 1.612, "grad_norm": 0.44005846977233887, "learning_rate": 0.0002, "epoch": 2.258440046565774, "step": 2910}, {"loss": 1.6023, "grad_norm": 0.46476176381111145, "learning_rate": 0.0002, "epoch": 2.2662010089251066, "step": 2920}, {"loss": 1.6417, "grad_norm": 0.48051515221595764, "learning_rate": 0.0002, "epoch": 2.2739619712844394, "step": 2930}, {"loss": 1.587, "grad_norm": 0.480069637298584, "learning_rate": 0.0002, "epoch": 2.281722933643772, "step": 2940}, {"loss": 1.5747, "grad_norm": 0.5122102499008179, "learning_rate": 0.0002, "epoch": 2.2894838960031043, "step": 2950}, {"loss": 1.5183, "grad_norm": 0.48879891633987427, "learning_rate": 0.0002, "epoch": 2.2972448583624367, "step": 2960}, {"loss": 1.5483, "grad_norm": 0.4973136782646179, "learning_rate": 0.0002, "epoch": 2.3050058207217696, "step": 2970}, {"loss": 1.677, "grad_norm": 0.5522695183753967, "learning_rate": 0.0002, "epoch": 2.312766783081102, "step": 2980}, {"loss": 1.5946, "grad_norm": 0.5220217704772949, "learning_rate": 0.0002, "epoch": 2.3205277454404345, "step": 2990}, {"loss": 1.6299, "grad_norm": 0.4978662431240082, "learning_rate": 0.0002, "epoch": 2.328288707799767, "step": 3000}, {"loss": 1.5498, "grad_norm": 0.554053544998169, "learning_rate": 0.0002, "epoch": 2.3360496701591, "step": 3010}, {"loss": 1.5356, "grad_norm": 0.4703886806964874, "learning_rate": 0.0002, "epoch": 2.3438106325184322, "step": 3020}, {"loss": 1.5418, "grad_norm": 0.5074123740196228, "learning_rate": 0.0002, "epoch": 2.3515715948777647, "step": 3030}, {"loss": 1.6873, "grad_norm": 0.5088278651237488, "learning_rate": 0.0002, "epoch": 2.3593325572370976, "step": 3040}, {"loss": 1.5249, "grad_norm": 0.4752114415168762, "learning_rate": 0.0002, "epoch": 2.36709351959643, "step": 3050}, {"loss": 1.5353, "grad_norm": 0.5121659636497498, "learning_rate": 0.0002, "epoch": 2.3748544819557624, "step": 3060}, {"loss": 1.6426, "grad_norm": 0.48649218678474426, "learning_rate": 0.0002, "epoch": 2.3826154443150953, "step": 3070}, {"loss": 1.6136, "grad_norm": 0.5209488868713379, "learning_rate": 0.0002, "epoch": 2.3903764066744277, "step": 3080}, {"loss": 1.597, "grad_norm": 0.5110517740249634, "learning_rate": 0.0002, "epoch": 2.39813736903376, "step": 3090}, {"loss": 1.5773, "grad_norm": 0.5609337091445923, "learning_rate": 0.0002, "epoch": 2.4058983313930926, "step": 3100}, {"loss": 1.5438, "grad_norm": 0.5191826224327087, "learning_rate": 0.0002, "epoch": 2.4136592937524255, "step": 3110}, {"loss": 1.6347, "grad_norm": 0.4876069724559784, "learning_rate": 0.0002, "epoch": 2.421420256111758, "step": 3120}, {"loss": 1.5565, "grad_norm": 0.4713933765888214, "learning_rate": 0.0002, "epoch": 2.4291812184710904, "step": 3130}, {"loss": 1.6388, "grad_norm": 0.5102227330207825, "learning_rate": 0.0002, "epoch": 2.436942180830423, "step": 3140}, {"loss": 1.5667, "grad_norm": 0.44546666741371155, "learning_rate": 0.0002, "epoch": 2.4447031431897557, "step": 3150}, {"loss": 1.5973, "grad_norm": 0.5167558193206787, "learning_rate": 0.0002, "epoch": 2.452464105549088, "step": 3160}, {"loss": 1.5673, "grad_norm": 0.5226958990097046, "learning_rate": 0.0002, "epoch": 2.4602250679084205, "step": 3170}, {"loss": 1.5758, "grad_norm": 0.4751799702644348, "learning_rate": 0.0002, "epoch": 2.4679860302677534, "step": 3180}, {"loss": 1.6234, "grad_norm": 0.4744729697704315, "learning_rate": 0.0002, "epoch": 2.475746992627086, "step": 3190}, {"loss": 1.5661, "grad_norm": 0.5203230381011963, "learning_rate": 0.0002, "epoch": 2.4835079549864183, "step": 3200}, {"loss": 1.493, "grad_norm": 0.47209781408309937, "learning_rate": 0.0002, "epoch": 2.4912689173457507, "step": 3210}, {"loss": 1.6415, "grad_norm": 0.5241674780845642, "learning_rate": 0.0002, "epoch": 2.4990298797050836, "step": 3220}, {"loss": 1.6324, "grad_norm": 0.5152244567871094, "learning_rate": 0.0002, "epoch": 2.506790842064416, "step": 3230}, {"loss": 1.6248, "grad_norm": 0.5216741561889648, "learning_rate": 0.0002, "epoch": 2.5145518044237485, "step": 3240}, {"loss": 1.5668, "grad_norm": 0.4953259527683258, "learning_rate": 0.0002, "epoch": 2.522312766783081, "step": 3250}, {"loss": 1.666, "grad_norm": 0.5973829030990601, "learning_rate": 0.0002, "epoch": 2.530073729142414, "step": 3260}, {"loss": 1.5295, "grad_norm": 0.48804202675819397, "learning_rate": 0.0002, "epoch": 2.5378346915017462, "step": 3270}, {"loss": 1.4954, "grad_norm": 0.5334644317626953, "learning_rate": 0.0002, "epoch": 2.5455956538610787, "step": 3280}, {"loss": 1.5814, "grad_norm": 0.46873313188552856, "learning_rate": 0.0002, "epoch": 2.5533566162204115, "step": 3290}, {"loss": 1.5362, "grad_norm": 0.4282589554786682, "learning_rate": 0.0002, "epoch": 2.561117578579744, "step": 3300}, {"loss": 1.6278, "grad_norm": 0.4848293960094452, "learning_rate": 0.0002, "epoch": 2.5688785409390764, "step": 3310}, {"loss": 1.6308, "grad_norm": 0.5093745589256287, "learning_rate": 0.0002, "epoch": 2.576639503298409, "step": 3320}, {"loss": 1.6375, "grad_norm": 0.5084842443466187, "learning_rate": 0.0002, "epoch": 2.5844004656577413, "step": 3330}, {"loss": 1.6168, "grad_norm": 0.4696281850337982, "learning_rate": 0.0002, "epoch": 2.592161428017074, "step": 3340}, {"loss": 1.5359, "grad_norm": 0.5767765641212463, "learning_rate": 0.0002, "epoch": 2.5999223903764066, "step": 3350}, {"loss": 1.6097, "grad_norm": 0.47300875186920166, "learning_rate": 0.0002, "epoch": 2.607683352735739, "step": 3360}, {"loss": 1.6138, "grad_norm": 0.4809158146381378, "learning_rate": 0.0002, "epoch": 2.615444315095072, "step": 3370}, {"loss": 1.4952, "grad_norm": 0.5141063928604126, "learning_rate": 0.0002, "epoch": 2.6232052774544043, "step": 3380}, {"loss": 1.5784, "grad_norm": 0.4832935035228729, "learning_rate": 0.0002, "epoch": 2.630966239813737, "step": 3390}, {"loss": 1.5796, "grad_norm": 0.5044625401496887, "learning_rate": 0.0002, "epoch": 2.6387272021730697, "step": 3400}, {"loss": 1.6202, "grad_norm": 0.5287680625915527, "learning_rate": 0.0002, "epoch": 2.646488164532402, "step": 3410}, {"loss": 1.5423, "grad_norm": 0.5306379795074463, "learning_rate": 0.0002, "epoch": 2.6542491268917345, "step": 3420}, {"loss": 1.5264, "grad_norm": 0.5849291682243347, "learning_rate": 0.0002, "epoch": 2.662010089251067, "step": 3430}, {"loss": 1.5937, "grad_norm": 0.7951080799102783, "learning_rate": 0.0002, "epoch": 2.6697710516104, "step": 3440}, {"loss": 1.5791, "grad_norm": 0.48087653517723083, "learning_rate": 0.0002, "epoch": 2.6775320139697323, "step": 3450}, {"loss": 1.6769, "grad_norm": 0.5396431684494019, "learning_rate": 0.0002, "epoch": 2.6852929763290647, "step": 3460}, {"loss": 1.606, "grad_norm": 0.5481634736061096, "learning_rate": 0.0002, "epoch": 2.693053938688397, "step": 3470}, {"loss": 1.6436, "grad_norm": 0.5068731307983398, "learning_rate": 0.0002, "epoch": 2.70081490104773, "step": 3480}, {"loss": 1.5738, "grad_norm": 0.5759826898574829, "learning_rate": 0.0002, "epoch": 2.7085758634070625, "step": 3490}, {"loss": 1.596, "grad_norm": 0.7253932952880859, "learning_rate": 0.0002, "epoch": 2.716336825766395, "step": 3500}, {"loss": 1.5791, "grad_norm": 0.527745246887207, "learning_rate": 0.0002, "epoch": 2.724097788125728, "step": 3510}, {"loss": 1.5874, "grad_norm": 0.5279242396354675, "learning_rate": 0.0002, "epoch": 2.73185875048506, "step": 3520}, {"loss": 1.6768, "grad_norm": 0.5047839283943176, "learning_rate": 0.0002, "epoch": 2.7396197128443927, "step": 3530}, {"loss": 1.5517, "grad_norm": 0.5430883169174194, "learning_rate": 0.0002, "epoch": 2.7473806752037255, "step": 3540}, {"loss": 1.5624, "grad_norm": 0.4496723711490631, "learning_rate": 0.0002, "epoch": 2.755141637563058, "step": 3550}, {"loss": 1.5789, "grad_norm": 0.5063338875770569, "learning_rate": 0.0002, "epoch": 2.7629025999223904, "step": 3560}, {"loss": 1.52, "grad_norm": 0.4619026780128479, "learning_rate": 0.0002, "epoch": 2.770663562281723, "step": 3570}, {"loss": 1.5793, "grad_norm": 0.4753304123878479, "learning_rate": 0.0002, "epoch": 2.7784245246410553, "step": 3580}, {"loss": 1.5715, "grad_norm": 0.5422708988189697, "learning_rate": 0.0002, "epoch": 2.786185487000388, "step": 3590}, {"loss": 1.5926, "grad_norm": 0.4756578803062439, "learning_rate": 0.0002, "epoch": 2.7939464493597206, "step": 3600}, {"loss": 1.5358, "grad_norm": 0.5057567358016968, "learning_rate": 0.0002, "epoch": 2.801707411719053, "step": 3610}, {"loss": 1.6131, "grad_norm": 0.5410919785499573, "learning_rate": 0.0002, "epoch": 2.809468374078386, "step": 3620}, {"loss": 1.5573, "grad_norm": 0.4958136975765228, "learning_rate": 0.0002, "epoch": 2.8172293364377183, "step": 3630}, {"loss": 1.6324, "grad_norm": 0.454527348279953, "learning_rate": 0.0002, "epoch": 2.8249902987970508, "step": 3640}, {"loss": 1.5582, "grad_norm": 0.5092706084251404, "learning_rate": 0.0002, "epoch": 2.8327512611563836, "step": 3650}, {"loss": 1.5893, "grad_norm": 0.5314022302627563, "learning_rate": 0.0002, "epoch": 2.840512223515716, "step": 3660}, {"loss": 1.588, "grad_norm": 0.5028239488601685, "learning_rate": 0.0002, "epoch": 2.8482731858750485, "step": 3670}, {"loss": 1.5751, "grad_norm": 0.5127444863319397, "learning_rate": 0.0002, "epoch": 2.856034148234381, "step": 3680}, {"loss": 1.6018, "grad_norm": 0.5045645236968994, "learning_rate": 0.0002, "epoch": 2.8637951105937134, "step": 3690}, {"loss": 1.5788, "grad_norm": 0.5560781955718994, "learning_rate": 0.0002, "epoch": 2.8715560729530463, "step": 3700}, {"loss": 1.5988, "grad_norm": 0.5177600383758545, "learning_rate": 0.0002, "epoch": 2.8793170353123787, "step": 3710}, {"loss": 1.6009, "grad_norm": 0.45830899477005005, "learning_rate": 0.0002, "epoch": 2.887077997671711, "step": 3720}, {"loss": 1.6344, "grad_norm": 0.4828629195690155, "learning_rate": 0.0002, "epoch": 2.894838960031044, "step": 3730}, {"loss": 1.6758, "grad_norm": 0.48241183161735535, "learning_rate": 0.0002, "epoch": 2.9025999223903765, "step": 3740}, {"loss": 1.5649, "grad_norm": 0.4909592568874359, "learning_rate": 0.0002, "epoch": 2.910360884749709, "step": 3750}, {"loss": 1.4927, "grad_norm": 0.44677025079727173, "learning_rate": 0.0002, "epoch": 2.9181218471090418, "step": 3760}, {"loss": 1.5067, "grad_norm": 0.4928834140300751, "learning_rate": 0.0002, "epoch": 2.925882809468374, "step": 3770}, {"loss": 1.5843, "grad_norm": 0.5673553347587585, "learning_rate": 0.0002, "epoch": 2.9336437718277066, "step": 3780}, {"loss": 1.5566, "grad_norm": 0.548190712928772, "learning_rate": 0.0002, "epoch": 2.941404734187039, "step": 3790}, {"loss": 1.5892, "grad_norm": 0.48979803919792175, "learning_rate": 0.0002, "epoch": 2.9491656965463715, "step": 3800}, {"loss": 1.5589, "grad_norm": 0.533191978931427, "learning_rate": 0.0002, "epoch": 2.9569266589057044, "step": 3810}, {"loss": 1.584, "grad_norm": 0.5362946391105652, "learning_rate": 0.0002, "epoch": 2.964687621265037, "step": 3820}, {"loss": 1.6602, "grad_norm": 0.4724906384944916, "learning_rate": 0.0002, "epoch": 2.9724485836243693, "step": 3830}, {"loss": 1.5834, "grad_norm": 0.5468461513519287, "learning_rate": 0.0002, "epoch": 2.980209545983702, "step": 3840}, {"loss": 1.6316, "grad_norm": 0.4697108864784241, "learning_rate": 0.0002, "epoch": 2.9879705083430346, "step": 3850}, {"loss": 1.6312, "grad_norm": 0.4780906140804291, "learning_rate": 0.0002, "epoch": 2.995731470702367, "step": 3860}, {"eval_loss": 1.8472607135772705, "eval_runtime": 106.5541, "eval_samples_per_second": 4.758, "eval_steps_per_second": 0.601, "epoch": 2.9996119518820334, "step": 3865}, {"loss": 1.4983, "grad_norm": 0.5645653605461121, "learning_rate": 0.0002, "epoch": 3.0034924330616994, "step": 3870}, {"loss": 1.4334, "grad_norm": 0.6457151174545288, "learning_rate": 0.0002, "epoch": 3.0112533954210323, "step": 3880}, {"loss": 1.3899, "grad_norm": 0.583838164806366, "learning_rate": 0.0002, "epoch": 3.0190143577803648, "step": 3890}, {"loss": 1.3258, "grad_norm": 0.6819260120391846, "learning_rate": 0.0002, "epoch": 3.026775320139697, "step": 3900}, {"loss": 1.3458, "grad_norm": 0.6692903637886047, "learning_rate": 0.0002, "epoch": 3.03453628249903, "step": 3910}, {"loss": 1.4356, "grad_norm": 0.6101024746894836, "learning_rate": 0.0002, "epoch": 3.0422972448583625, "step": 3920}, {"loss": 1.394, "grad_norm": 0.7014093399047852, "learning_rate": 0.0002, "epoch": 3.050058207217695, "step": 3930}, {"loss": 1.3885, "grad_norm": 0.7380381226539612, "learning_rate": 0.0002, "epoch": 3.0578191695770274, "step": 3940}, {"loss": 1.4206, "grad_norm": 0.6607900857925415, "learning_rate": 0.0002, "epoch": 3.0655801319363603, "step": 3950}, {"loss": 1.4293, "grad_norm": 0.735263466835022, "learning_rate": 0.0002, "epoch": 3.0733410942956927, "step": 3960}, {"loss": 1.3966, "grad_norm": 0.6788513660430908, "learning_rate": 0.0002, "epoch": 3.081102056655025, "step": 3970}, {"loss": 1.3435, "grad_norm": 0.6347652673721313, "learning_rate": 0.0002, "epoch": 3.088863019014358, "step": 3980}, {"loss": 1.4518, "grad_norm": 0.7056642770767212, "learning_rate": 0.0002, "epoch": 3.0966239813736904, "step": 3990}, {"loss": 1.4474, "grad_norm": 0.6387075185775757, "learning_rate": 0.0002, "epoch": 3.104384943733023, "step": 4000}, {"loss": 1.3833, "grad_norm": 0.6701116561889648, "learning_rate": 0.0002, "epoch": 3.1121459060923553, "step": 4010}, {"loss": 1.404, "grad_norm": 0.7558449506759644, "learning_rate": 0.0002, "epoch": 3.119906868451688, "step": 4020}, {"loss": 1.3294, "grad_norm": 0.6612881422042847, "learning_rate": 0.0002, "epoch": 3.1276678308110206, "step": 4030}, {"loss": 1.439, "grad_norm": 0.7474587559700012, "learning_rate": 0.0002, "epoch": 3.135428793170353, "step": 4040}, {"loss": 1.4616, "grad_norm": 0.7292373776435852, "learning_rate": 0.0002, "epoch": 3.1431897555296855, "step": 4050}, {"loss": 1.3908, "grad_norm": 0.7432886958122253, "learning_rate": 0.0002, "epoch": 3.1509507178890184, "step": 4060}, {"loss": 1.4214, "grad_norm": 0.6366098523139954, "learning_rate": 0.0002, "epoch": 3.158711680248351, "step": 4070}, {"loss": 1.5044, "grad_norm": 0.6837611794471741, "learning_rate": 0.0002, "epoch": 3.1664726426076832, "step": 4080}, {"loss": 1.4332, "grad_norm": 0.7194393277168274, "learning_rate": 0.0002, "epoch": 3.174233604967016, "step": 4090}, {"loss": 1.3628, "grad_norm": 0.6963607668876648, "learning_rate": 0.0002, "epoch": 3.1819945673263486, "step": 4100}, {"loss": 1.4127, "grad_norm": 0.6404902935028076, "learning_rate": 0.0002, "epoch": 3.189755529685681, "step": 4110}, {"loss": 1.4394, "grad_norm": 0.7172070741653442, "learning_rate": 0.0002, "epoch": 3.1975164920450134, "step": 4120}, {"loss": 1.4658, "grad_norm": 0.6577759385108948, "learning_rate": 0.0002, "epoch": 3.2052774544043463, "step": 4130}, {"loss": 1.4019, "grad_norm": 0.6658480167388916, "learning_rate": 0.0002, "epoch": 3.2130384167636787, "step": 4140}, {"loss": 1.4348, "grad_norm": 0.6771699786186218, "learning_rate": 0.0002, "epoch": 3.220799379123011, "step": 4150}, {"loss": 1.4736, "grad_norm": 0.699035108089447, "learning_rate": 0.0002, "epoch": 3.2285603414823436, "step": 4160}, {"loss": 1.4096, "grad_norm": 0.7218514680862427, "learning_rate": 0.0002, "epoch": 3.2363213038416765, "step": 4170}, {"loss": 1.3637, "grad_norm": 0.6270631551742554, "learning_rate": 0.0002, "epoch": 3.244082266201009, "step": 4180}, {"loss": 1.4076, "grad_norm": 0.6828921437263489, "learning_rate": 0.0002, "epoch": 3.2518432285603414, "step": 4190}, {"loss": 1.4663, "grad_norm": 0.6005498170852661, "learning_rate": 0.0002, "epoch": 3.2596041909196742, "step": 4200}, {"loss": 1.4798, "grad_norm": 0.6974790692329407, "learning_rate": 0.0002, "epoch": 3.2673651532790067, "step": 4210}, {"loss": 1.5012, "grad_norm": 0.7269543409347534, "learning_rate": 0.0002, "epoch": 3.275126115638339, "step": 4220}, {"loss": 1.3848, "grad_norm": 0.6728787422180176, "learning_rate": 0.0002, "epoch": 3.2828870779976715, "step": 4230}, {"loss": 1.4112, "grad_norm": 0.676972508430481, "learning_rate": 0.0002, "epoch": 3.2906480403570044, "step": 4240}, {"loss": 1.4206, "grad_norm": 0.748309314250946, "learning_rate": 0.0002, "epoch": 3.298409002716337, "step": 4250}, {"loss": 1.4973, "grad_norm": 0.6976589560508728, "learning_rate": 0.0002, "epoch": 3.3061699650756693, "step": 4260}, {"loss": 1.3967, "grad_norm": 0.649780809879303, "learning_rate": 0.0002, "epoch": 3.3139309274350017, "step": 4270}, {"loss": 1.327, "grad_norm": 0.6529902815818787, "learning_rate": 0.0002, "epoch": 3.3216918897943346, "step": 4280}, {"loss": 1.4888, "grad_norm": 0.9273163676261902, "learning_rate": 0.0002, "epoch": 3.329452852153667, "step": 4290}, {"loss": 1.4859, "grad_norm": 0.717024028301239, "learning_rate": 0.0002, "epoch": 3.3372138145129995, "step": 4300}, {"loss": 1.4441, "grad_norm": 0.7914950251579285, "learning_rate": 0.0002, "epoch": 3.3449747768723324, "step": 4310}, {"loss": 1.432, "grad_norm": 0.7133203148841858, "learning_rate": 0.0002, "epoch": 3.352735739231665, "step": 4320}, {"loss": 1.4662, "grad_norm": 0.7409568428993225, "learning_rate": 0.0002, "epoch": 3.3604967015909972, "step": 4330}, {"loss": 1.3992, "grad_norm": 0.6993981003761292, "learning_rate": 0.0002, "epoch": 3.3682576639503297, "step": 4340}, {"loss": 1.4261, "grad_norm": 0.7114535570144653, "learning_rate": 0.0002, "epoch": 3.3760186263096625, "step": 4350}, {"loss": 1.4227, "grad_norm": 0.6790860295295715, "learning_rate": 0.0002, "epoch": 3.383779588668995, "step": 4360}, {"loss": 1.4128, "grad_norm": 0.6507849097251892, "learning_rate": 0.0002, "epoch": 3.3915405510283274, "step": 4370}, {"loss": 1.4559, "grad_norm": 0.5967804193496704, "learning_rate": 0.0002, "epoch": 3.39930151338766, "step": 4380}, {"loss": 1.3687, "grad_norm": 0.6625847816467285, "learning_rate": 0.0002, "epoch": 3.4070624757469927, "step": 4390}, {"loss": 1.4193, "grad_norm": 0.6736508011817932, "learning_rate": 0.0002, "epoch": 3.414823438106325, "step": 4400}, {"loss": 1.4363, "grad_norm": 0.7870860695838928, "learning_rate": 0.0002, "epoch": 3.4225844004656576, "step": 4410}, {"loss": 1.4114, "grad_norm": 0.7205295562744141, "learning_rate": 0.0002, "epoch": 3.4303453628249905, "step": 4420}, {"loss": 1.4131, "grad_norm": 0.6634634137153625, "learning_rate": 0.0002, "epoch": 3.438106325184323, "step": 4430}, {"loss": 1.4683, "grad_norm": 0.7562733292579651, "learning_rate": 0.0002, "epoch": 3.4458672875436553, "step": 4440}, {"loss": 1.3486, "grad_norm": 0.6585879921913147, "learning_rate": 0.0002, "epoch": 3.453628249902988, "step": 4450}, {"loss": 1.4283, "grad_norm": 0.6896792054176331, "learning_rate": 0.0002, "epoch": 3.4613892122623207, "step": 4460}, {"loss": 1.4208, "grad_norm": 0.6520342230796814, "learning_rate": 0.0002, "epoch": 3.469150174621653, "step": 4470}, {"loss": 1.3423, "grad_norm": 0.6760806441307068, "learning_rate": 0.0002, "epoch": 3.4769111369809855, "step": 4480}, {"loss": 1.4398, "grad_norm": 0.7539774179458618, "learning_rate": 0.0002, "epoch": 3.484672099340318, "step": 4490}, {"loss": 1.4534, "grad_norm": 0.7409411668777466, "learning_rate": 0.0002, "epoch": 3.492433061699651, "step": 4500}, {"loss": 1.4069, "grad_norm": 0.6876253485679626, "learning_rate": 0.0002, "epoch": 3.5001940240589833, "step": 4510}, {"loss": 1.4228, "grad_norm": 0.7028461694717407, "learning_rate": 0.0002, "epoch": 3.5079549864183157, "step": 4520}, {"loss": 1.4723, "grad_norm": 0.8056529760360718, "learning_rate": 0.0002, "epoch": 3.5157159487776486, "step": 4530}, {"loss": 1.4148, "grad_norm": 0.711338996887207, "learning_rate": 0.0002, "epoch": 3.523476911136981, "step": 4540}, {"loss": 1.5247, "grad_norm": 0.7343552708625793, "learning_rate": 0.0002, "epoch": 3.5312378734963135, "step": 4550}, {"loss": 1.4308, "grad_norm": 0.745479941368103, "learning_rate": 0.0002, "epoch": 3.5389988358556463, "step": 4560}, {"loss": 1.4229, "grad_norm": 0.7582294940948486, "learning_rate": 0.0002, "epoch": 3.5467597982149788, "step": 4570}, {"loss": 1.4127, "grad_norm": 0.6717444658279419, "learning_rate": 0.0002, "epoch": 3.554520760574311, "step": 4580}, {"loss": 1.4368, "grad_norm": 0.7417883276939392, "learning_rate": 0.0002, "epoch": 3.5622817229336436, "step": 4590}, {"loss": 1.4176, "grad_norm": 0.6385737061500549, "learning_rate": 0.0002, "epoch": 3.570042685292976, "step": 4600}, {"loss": 1.3981, "grad_norm": 0.716704249382019, "learning_rate": 0.0002, "epoch": 3.577803647652309, "step": 4610}, {"loss": 1.3889, "grad_norm": 0.6948980093002319, "learning_rate": 0.0002, "epoch": 3.5855646100116414, "step": 4620}, {"loss": 1.5177, "grad_norm": 0.6961140036582947, "learning_rate": 0.0002, "epoch": 3.593325572370974, "step": 4630}, {"loss": 1.4508, "grad_norm": 0.7493122220039368, "learning_rate": 0.0002, "epoch": 3.6010865347303067, "step": 4640}, {"loss": 1.3987, "grad_norm": 0.7431658506393433, "learning_rate": 0.0002, "epoch": 3.608847497089639, "step": 4650}, {"loss": 1.4551, "grad_norm": 0.8353387713432312, "learning_rate": 0.0002, "epoch": 3.6166084594489716, "step": 4660}, {"loss": 1.4533, "grad_norm": 0.7095612287521362, "learning_rate": 0.0002, "epoch": 3.6243694218083045, "step": 4670}, {"loss": 1.4003, "grad_norm": 0.776620090007782, "learning_rate": 0.0002, "epoch": 3.632130384167637, "step": 4680}, {"loss": 1.4361, "grad_norm": 0.7198925018310547, "learning_rate": 0.0002, "epoch": 3.6398913465269693, "step": 4690}, {"loss": 1.4543, "grad_norm": 0.8238834738731384, "learning_rate": 0.0002, "epoch": 3.6476523088863018, "step": 4700}, {"loss": 1.3958, "grad_norm": 0.6804245710372925, "learning_rate": 0.0002, "epoch": 3.655413271245634, "step": 4710}, {"loss": 1.4158, "grad_norm": 0.8444845676422119, "learning_rate": 0.0002, "epoch": 3.663174233604967, "step": 4720}, {"loss": 1.3825, "grad_norm": 0.743797779083252, "learning_rate": 0.0002, "epoch": 3.6709351959642995, "step": 4730}, {"loss": 1.4213, "grad_norm": 0.8994188904762268, "learning_rate": 0.0002, "epoch": 3.678696158323632, "step": 4740}, {"loss": 1.4281, "grad_norm": 0.75416100025177, "learning_rate": 0.0002, "epoch": 3.686457120682965, "step": 4750}, {"loss": 1.4154, "grad_norm": 0.6499266028404236, "learning_rate": 0.0002, "epoch": 3.6942180830422973, "step": 4760}, {"loss": 1.4005, "grad_norm": 0.7246791124343872, "learning_rate": 0.0002, "epoch": 3.7019790454016297, "step": 4770}, {"loss": 1.426, "grad_norm": 0.7831124067306519, "learning_rate": 0.0002, "epoch": 3.7097400077609626, "step": 4780}, {"loss": 1.3933, "grad_norm": 0.7130028009414673, "learning_rate": 0.0002, "epoch": 3.717500970120295, "step": 4790}, {"loss": 1.4632, "grad_norm": 0.7501602172851562, "learning_rate": 0.0002, "epoch": 3.7252619324796274, "step": 4800}, {"loss": 1.4985, "grad_norm": 0.6980932950973511, "learning_rate": 0.0002, "epoch": 3.73302289483896, "step": 4810}, {"loss": 1.4517, "grad_norm": 0.8050530552864075, "learning_rate": 0.0002, "epoch": 3.7407838571982923, "step": 4820}, {"loss": 1.4703, "grad_norm": 0.6385579705238342, "learning_rate": 0.0002, "epoch": 3.748544819557625, "step": 4830}, {"loss": 1.5281, "grad_norm": 0.6664714813232422, "learning_rate": 0.0002, "epoch": 3.7563057819169576, "step": 4840}, {"loss": 1.4443, "grad_norm": 0.7125676274299622, "learning_rate": 0.0002, "epoch": 3.76406674427629, "step": 4850}, {"loss": 1.3958, "grad_norm": 0.7231866717338562, "learning_rate": 0.0002, "epoch": 3.771827706635623, "step": 4860}, {"loss": 1.4446, "grad_norm": 0.6917183995246887, "learning_rate": 0.0002, "epoch": 3.7795886689949554, "step": 4870}, {"loss": 1.4369, "grad_norm": 0.665037989616394, "learning_rate": 0.0002, "epoch": 3.787349631354288, "step": 4880}, {"loss": 1.4193, "grad_norm": 0.5837726593017578, "learning_rate": 0.0002, "epoch": 3.7951105937136207, "step": 4890}, {"loss": 1.4176, "grad_norm": 0.6366701722145081, "learning_rate": 0.0002, "epoch": 3.802871556072953, "step": 4900}, {"loss": 1.46, "grad_norm": 0.7082223892211914, "learning_rate": 0.0002, "epoch": 3.8106325184322856, "step": 4910}, {"loss": 1.5139, "grad_norm": 0.8101672530174255, "learning_rate": 0.0002, "epoch": 3.818393480791618, "step": 4920}, {"loss": 1.3659, "grad_norm": 0.7516148090362549, "learning_rate": 0.0002, "epoch": 3.826154443150951, "step": 4930}, {"loss": 1.3909, "grad_norm": 0.7928489446640015, "learning_rate": 0.0002, "epoch": 3.8339154055102833, "step": 4940}, {"loss": 1.4255, "grad_norm": 0.6892234683036804, "learning_rate": 0.0002, "epoch": 3.8416763678696157, "step": 4950}, {"loss": 1.5024, "grad_norm": 0.6381304264068604, "learning_rate": 0.0002, "epoch": 3.849437330228948, "step": 4960}, {"loss": 1.4873, "grad_norm": 0.8068831562995911, "learning_rate": 0.0002, "epoch": 3.857198292588281, "step": 4970}, {"loss": 1.45, "grad_norm": 0.7289869785308838, "learning_rate": 0.0002, "epoch": 3.8649592549476135, "step": 4980}, {"loss": 1.398, "grad_norm": 0.7278549075126648, "learning_rate": 0.0002, "epoch": 3.872720217306946, "step": 4990}, {"loss": 1.4442, "grad_norm": 0.7324236631393433, "learning_rate": 0.0002, "epoch": 3.880481179666279, "step": 5000}, {"loss": 1.4511, "grad_norm": 0.6759871244430542, "learning_rate": 0.0002, "epoch": 3.8882421420256112, "step": 5010}, {"loss": 1.4705, "grad_norm": 0.8159207701683044, "learning_rate": 0.0002, "epoch": 3.8960031043849437, "step": 5020}, {"loss": 1.4685, "grad_norm": 0.6536211967468262, "learning_rate": 0.0002, "epoch": 3.9037640667442766, "step": 5030}, {"loss": 1.4335, "grad_norm": 0.6827932000160217, "learning_rate": 0.0002, "epoch": 3.911525029103609, "step": 5040}, {"loss": 1.433, "grad_norm": 0.6688340306282043, "learning_rate": 0.0002, "epoch": 3.9192859914629414, "step": 5050}, {"loss": 1.4099, "grad_norm": 0.6385695934295654, "learning_rate": 0.0002, "epoch": 3.927046953822274, "step": 5060}, {"loss": 1.4767, "grad_norm": 0.6975107192993164, "learning_rate": 0.0002, "epoch": 3.9348079161816063, "step": 5070}, {"loss": 1.4893, "grad_norm": 0.6684112548828125, "learning_rate": 0.0002, "epoch": 3.942568878540939, "step": 5080}, {"loss": 1.4732, "grad_norm": 0.8349628448486328, "learning_rate": 0.0002, "epoch": 3.9503298409002716, "step": 5090}, {"loss": 1.5131, "grad_norm": 0.7146425843238831, "learning_rate": 0.0002, "epoch": 3.958090803259604, "step": 5100}, {"loss": 1.4149, "grad_norm": 0.6555036902427673, "learning_rate": 0.0002, "epoch": 3.965851765618937, "step": 5110}, {"loss": 1.4274, "grad_norm": 0.7037415504455566, "learning_rate": 0.0002, "epoch": 3.9736127279782694, "step": 5120}, {"loss": 1.4292, "grad_norm": 0.7235575914382935, "learning_rate": 0.0002, "epoch": 3.981373690337602, "step": 5130}, {"loss": 1.4455, "grad_norm": 0.7092325687408447, "learning_rate": 0.0002, "epoch": 3.9891346526969347, "step": 5140}, {"loss": 1.4512, "grad_norm": 0.7490319609642029, "learning_rate": 0.0002, "epoch": 3.996895615056267, "step": 5150}]} +{"epoch": 4.9996119518820334, "step": 6442, "epoch_duration": 3756.667355775833, "total_accumulated_duration": 18030.87024497986, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28774.0}, "peak_memory_reserved": {"GPU_0": 28774.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0855, "grad_norm": 1.0751162767410278, "learning_rate": 0.0002, "epoch": 0.007760962359332557, "step": 10}, {"loss": 2.4744, "grad_norm": 0.4697345793247223, "learning_rate": 0.0002, "epoch": 0.015521924718665115, "step": 20}, {"loss": 2.193, "grad_norm": 0.5370839238166809, "learning_rate": 0.0002, "epoch": 0.023282887077997673, "step": 30}, {"loss": 2.0599, "grad_norm": 0.46794816851615906, "learning_rate": 0.0002, "epoch": 0.03104384943733023, "step": 40}, {"loss": 1.9354, "grad_norm": 0.44624820351600647, "learning_rate": 0.0002, "epoch": 0.038804811796662786, "step": 50}, {"loss": 1.9319, "grad_norm": 0.3953201472759247, "learning_rate": 0.0002, "epoch": 0.046565774155995346, "step": 60}, {"loss": 1.9099, "grad_norm": 0.3935912549495697, "learning_rate": 0.0002, "epoch": 0.0543267365153279, "step": 70}, {"loss": 1.8795, "grad_norm": 0.4520699381828308, "learning_rate": 0.0002, "epoch": 0.06208769887466046, "step": 80}, {"loss": 1.8354, "grad_norm": 0.3801847994327545, "learning_rate": 0.0002, "epoch": 0.06984866123399301, "step": 90}, {"loss": 1.9053, "grad_norm": 0.4020165205001831, "learning_rate": 0.0002, "epoch": 0.07760962359332557, "step": 100}, {"loss": 1.8779, "grad_norm": 0.3860672116279602, "learning_rate": 0.0002, "epoch": 0.08537058595265813, "step": 110}, {"loss": 1.8731, "grad_norm": 0.3681113123893738, "learning_rate": 0.0002, "epoch": 0.09313154831199069, "step": 120}, {"loss": 1.8157, "grad_norm": 0.3594866991043091, "learning_rate": 0.0002, "epoch": 0.10089251067132324, "step": 130}, {"loss": 1.8266, "grad_norm": 0.3879193663597107, "learning_rate": 0.0002, "epoch": 0.1086534730306558, "step": 140}, {"loss": 1.8818, "grad_norm": 0.3270505666732788, "learning_rate": 0.0002, "epoch": 0.11641443538998836, "step": 150}, {"loss": 1.87, "grad_norm": 0.36824458837509155, "learning_rate": 0.0002, "epoch": 0.12417539774932092, "step": 160}, {"loss": 1.8305, "grad_norm": 0.383882075548172, "learning_rate": 0.0002, "epoch": 0.13193636010865348, "step": 170}, {"loss": 1.8584, "grad_norm": 0.3368665874004364, "learning_rate": 0.0002, "epoch": 0.13969732246798602, "step": 180}, {"loss": 1.7882, "grad_norm": 0.35961097478866577, "learning_rate": 0.0002, "epoch": 0.1474582848273186, "step": 190}, {"loss": 1.8467, "grad_norm": 0.3415963351726532, "learning_rate": 0.0002, "epoch": 0.15521924718665114, "step": 200}, {"loss": 1.8543, "grad_norm": 0.4100632071495056, "learning_rate": 0.0002, "epoch": 0.1629802095459837, "step": 210}, {"loss": 1.8226, "grad_norm": 0.3516307473182678, "learning_rate": 0.0002, "epoch": 0.17074117190531626, "step": 220}, {"loss": 1.7386, "grad_norm": 0.37919050455093384, "learning_rate": 0.0002, "epoch": 0.1785021342646488, "step": 230}, {"loss": 1.7937, "grad_norm": 0.33270683884620667, "learning_rate": 0.0002, "epoch": 0.18626309662398138, "step": 240}, {"loss": 1.7925, "grad_norm": 0.3348783254623413, "learning_rate": 0.0002, "epoch": 0.19402405898331393, "step": 250}, {"loss": 1.7774, "grad_norm": 0.3888475298881531, "learning_rate": 0.0002, "epoch": 0.20178502134264648, "step": 260}, {"loss": 1.8381, "grad_norm": 0.3554602861404419, "learning_rate": 0.0002, "epoch": 0.20954598370197905, "step": 270}, {"loss": 1.8359, "grad_norm": 0.33277708292007446, "learning_rate": 0.0002, "epoch": 0.2173069460613116, "step": 280}, {"loss": 1.7713, "grad_norm": 0.3281584680080414, "learning_rate": 0.0002, "epoch": 0.22506790842064417, "step": 290}, {"loss": 1.8181, "grad_norm": 0.3185969591140747, "learning_rate": 0.0002, "epoch": 0.23282887077997672, "step": 300}, {"loss": 1.8595, "grad_norm": 0.35335442423820496, "learning_rate": 0.0002, "epoch": 0.24058983313930926, "step": 310}, {"loss": 1.87, "grad_norm": 0.3119595944881439, "learning_rate": 0.0002, "epoch": 0.24835079549864184, "step": 320}, {"loss": 1.8357, "grad_norm": 0.36424458026885986, "learning_rate": 0.0002, "epoch": 0.2561117578579744, "step": 330}, {"loss": 1.8003, "grad_norm": 0.3618951141834259, "learning_rate": 0.0002, "epoch": 0.26387272021730696, "step": 340}, {"loss": 1.8221, "grad_norm": 0.312757670879364, "learning_rate": 0.0002, "epoch": 0.2716336825766395, "step": 350}, {"loss": 1.9031, "grad_norm": 0.326016366481781, "learning_rate": 0.0002, "epoch": 0.27939464493597205, "step": 360}, {"loss": 1.8214, "grad_norm": 0.34093883633613586, "learning_rate": 0.0002, "epoch": 0.2871556072953046, "step": 370}, {"loss": 1.7733, "grad_norm": 0.32325029373168945, "learning_rate": 0.0002, "epoch": 0.2949165696546372, "step": 380}, {"loss": 1.842, "grad_norm": 0.34105437994003296, "learning_rate": 0.0002, "epoch": 0.30267753201396974, "step": 390}, {"loss": 1.7926, "grad_norm": 0.32565295696258545, "learning_rate": 0.0002, "epoch": 0.3104384943733023, "step": 400}, {"loss": 1.8031, "grad_norm": 0.32742050290107727, "learning_rate": 0.0002, "epoch": 0.31819945673263483, "step": 410}, {"loss": 1.907, "grad_norm": 0.30233046412467957, "learning_rate": 0.0002, "epoch": 0.3259604190919674, "step": 420}, {"loss": 1.7623, "grad_norm": 0.32419222593307495, "learning_rate": 0.0002, "epoch": 0.3337213814513, "step": 430}, {"loss": 1.865, "grad_norm": 0.3653007745742798, "learning_rate": 0.0002, "epoch": 0.3414823438106325, "step": 440}, {"loss": 1.8044, "grad_norm": 0.31617099046707153, "learning_rate": 0.0002, "epoch": 0.3492433061699651, "step": 450}, {"loss": 1.7677, "grad_norm": 0.3305962085723877, "learning_rate": 0.0002, "epoch": 0.3570042685292976, "step": 460}, {"loss": 1.8155, "grad_norm": 0.3178933262825012, "learning_rate": 0.0002, "epoch": 0.36476523088863017, "step": 470}, {"loss": 1.7485, "grad_norm": 0.37163782119750977, "learning_rate": 0.0002, "epoch": 0.37252619324796277, "step": 480}, {"loss": 1.8804, "grad_norm": 0.469844788312912, "learning_rate": 0.0002, "epoch": 0.3802871556072953, "step": 490}, {"loss": 1.8343, "grad_norm": 0.3409338593482971, "learning_rate": 0.0002, "epoch": 0.38804811796662786, "step": 500}, {"loss": 1.8433, "grad_norm": 0.31943467259407043, "learning_rate": 0.0002, "epoch": 0.3958090803259604, "step": 510}, {"loss": 1.7873, "grad_norm": 0.32293614745140076, "learning_rate": 0.0002, "epoch": 0.40357004268529295, "step": 520}, {"loss": 1.8584, "grad_norm": 0.2994382977485657, "learning_rate": 0.0002, "epoch": 0.41133100504462555, "step": 530}, {"loss": 1.8153, "grad_norm": 0.3273141384124756, "learning_rate": 0.0002, "epoch": 0.4190919674039581, "step": 540}, {"loss": 1.8097, "grad_norm": 0.3020550012588501, "learning_rate": 0.0002, "epoch": 0.42685292976329064, "step": 550}, {"loss": 1.8551, "grad_norm": 0.30113112926483154, "learning_rate": 0.0002, "epoch": 0.4346138921226232, "step": 560}, {"loss": 1.8084, "grad_norm": 0.30274903774261475, "learning_rate": 0.0002, "epoch": 0.44237485448195574, "step": 570}, {"loss": 1.7673, "grad_norm": 0.3231128454208374, "learning_rate": 0.0002, "epoch": 0.45013581684128834, "step": 580}, {"loss": 1.7848, "grad_norm": 0.3255121409893036, "learning_rate": 0.0002, "epoch": 0.4578967792006209, "step": 590}, {"loss": 1.8227, "grad_norm": 0.30147507786750793, "learning_rate": 0.0002, "epoch": 0.46565774155995343, "step": 600}, {"loss": 1.7572, "grad_norm": 0.29781386256217957, "learning_rate": 0.0002, "epoch": 0.473418703919286, "step": 610}, {"loss": 1.8307, "grad_norm": 0.30914685130119324, "learning_rate": 0.0002, "epoch": 0.4811796662786185, "step": 620}, {"loss": 1.805, "grad_norm": 0.3110593855381012, "learning_rate": 0.0002, "epoch": 0.4889406286379511, "step": 630}, {"loss": 1.8228, "grad_norm": 0.3298132121562958, "learning_rate": 0.0002, "epoch": 0.49670159099728367, "step": 640}, {"loss": 1.7816, "grad_norm": 0.322122186422348, "learning_rate": 0.0002, "epoch": 0.5044625533566163, "step": 650}, {"loss": 1.8001, "grad_norm": 0.3504371643066406, "learning_rate": 0.0002, "epoch": 0.5122235157159488, "step": 660}, {"loss": 1.8682, "grad_norm": 0.3102182149887085, "learning_rate": 0.0002, "epoch": 0.5199844780752814, "step": 670}, {"loss": 1.7494, "grad_norm": 0.6113658547401428, "learning_rate": 0.0002, "epoch": 0.5277454404346139, "step": 680}, {"loss": 1.7096, "grad_norm": 0.31841862201690674, "learning_rate": 0.0002, "epoch": 0.5355064027939465, "step": 690}, {"loss": 1.7587, "grad_norm": 0.2830526530742645, "learning_rate": 0.0002, "epoch": 0.543267365153279, "step": 700}, {"loss": 1.7887, "grad_norm": 0.3048769533634186, "learning_rate": 0.0002, "epoch": 0.5510283275126115, "step": 710}, {"loss": 1.8416, "grad_norm": 0.2719033658504486, "learning_rate": 0.0002, "epoch": 0.5587892898719441, "step": 720}, {"loss": 1.786, "grad_norm": 0.3176722526550293, "learning_rate": 0.0002, "epoch": 0.5665502522312766, "step": 730}, {"loss": 1.7127, "grad_norm": 0.32491734623908997, "learning_rate": 0.0002, "epoch": 0.5743112145906092, "step": 740}, {"loss": 1.7892, "grad_norm": 0.32746851444244385, "learning_rate": 0.0002, "epoch": 0.5820721769499418, "step": 750}, {"loss": 1.7811, "grad_norm": 0.3055773973464966, "learning_rate": 0.0002, "epoch": 0.5898331393092744, "step": 760}, {"loss": 1.8597, "grad_norm": 0.30671584606170654, "learning_rate": 0.0002, "epoch": 0.5975941016686069, "step": 770}, {"loss": 1.7728, "grad_norm": 0.28770264983177185, "learning_rate": 0.0002, "epoch": 0.6053550640279395, "step": 780}, {"loss": 1.7025, "grad_norm": 0.2814285457134247, "learning_rate": 0.0002, "epoch": 0.613116026387272, "step": 790}, {"loss": 1.819, "grad_norm": 0.31554412841796875, "learning_rate": 0.0002, "epoch": 0.6208769887466046, "step": 800}, {"loss": 1.8335, "grad_norm": 0.2984226942062378, "learning_rate": 0.0002, "epoch": 0.6286379511059371, "step": 810}, {"loss": 1.7728, "grad_norm": 0.2859906554222107, "learning_rate": 0.0002, "epoch": 0.6363989134652697, "step": 820}, {"loss": 1.7408, "grad_norm": 0.2887928783893585, "learning_rate": 0.0002, "epoch": 0.6441598758246022, "step": 830}, {"loss": 1.8071, "grad_norm": 0.31287339329719543, "learning_rate": 0.0002, "epoch": 0.6519208381839348, "step": 840}, {"loss": 1.7673, "grad_norm": 0.32064181566238403, "learning_rate": 0.0002, "epoch": 0.6596818005432674, "step": 850}, {"loss": 1.7849, "grad_norm": 0.290981650352478, "learning_rate": 0.0002, "epoch": 0.6674427629026, "step": 860}, {"loss": 1.8089, "grad_norm": 0.33060121536254883, "learning_rate": 0.0002, "epoch": 0.6752037252619325, "step": 870}, {"loss": 1.7357, "grad_norm": 0.27032899856567383, "learning_rate": 0.0002, "epoch": 0.682964687621265, "step": 880}, {"loss": 1.8423, "grad_norm": 0.29031234979629517, "learning_rate": 0.0002, "epoch": 0.6907256499805976, "step": 890}, {"loss": 1.835, "grad_norm": 0.2845142185688019, "learning_rate": 0.0002, "epoch": 0.6984866123399301, "step": 900}, {"loss": 1.77, "grad_norm": 0.8638312816619873, "learning_rate": 0.0002, "epoch": 0.7062475746992627, "step": 910}, {"loss": 1.7757, "grad_norm": 0.3086668848991394, "learning_rate": 0.0002, "epoch": 0.7140085370585952, "step": 920}, {"loss": 1.7541, "grad_norm": 0.2724177837371826, "learning_rate": 0.0002, "epoch": 0.7217694994179278, "step": 930}, {"loss": 1.816, "grad_norm": 0.289559006690979, "learning_rate": 0.0002, "epoch": 0.7295304617772603, "step": 940}, {"loss": 1.7654, "grad_norm": 0.3000658452510834, "learning_rate": 0.0002, "epoch": 0.737291424136593, "step": 950}, {"loss": 1.7736, "grad_norm": 0.33544042706489563, "learning_rate": 0.0002, "epoch": 0.7450523864959255, "step": 960}, {"loss": 1.6979, "grad_norm": 0.28593236207962036, "learning_rate": 0.0002, "epoch": 0.7528133488552581, "step": 970}, {"loss": 1.8583, "grad_norm": 0.313634991645813, "learning_rate": 0.0002, "epoch": 0.7605743112145906, "step": 980}, {"loss": 1.7473, "grad_norm": 0.2949385941028595, "learning_rate": 0.0002, "epoch": 0.7683352735739232, "step": 990}, {"loss": 1.8689, "grad_norm": 0.2920108437538147, "learning_rate": 0.0002, "epoch": 0.7760962359332557, "step": 1000}, {"loss": 1.8401, "grad_norm": 0.3245100677013397, "learning_rate": 0.0002, "epoch": 0.7838571982925883, "step": 1010}, {"loss": 1.7109, "grad_norm": 0.3007619380950928, "learning_rate": 0.0002, "epoch": 0.7916181606519208, "step": 1020}, {"loss": 1.7427, "grad_norm": 0.3630852997303009, "learning_rate": 0.0002, "epoch": 0.7993791230112534, "step": 1030}, {"loss": 1.7655, "grad_norm": 0.2856379747390747, "learning_rate": 0.0002, "epoch": 0.8071400853705859, "step": 1040}, {"loss": 1.8371, "grad_norm": 0.32476478815078735, "learning_rate": 0.0002, "epoch": 0.8149010477299186, "step": 1050}, {"loss": 1.8039, "grad_norm": 0.5162565112113953, "learning_rate": 0.0002, "epoch": 0.8226620100892511, "step": 1060}, {"loss": 1.8862, "grad_norm": 0.316496342420578, "learning_rate": 0.0002, "epoch": 0.8304229724485837, "step": 1070}, {"loss": 1.8023, "grad_norm": 0.31977516412734985, "learning_rate": 0.0002, "epoch": 0.8381839348079162, "step": 1080}, {"loss": 1.8547, "grad_norm": 0.269509494304657, "learning_rate": 0.0002, "epoch": 0.8459448971672487, "step": 1090}, {"loss": 1.7811, "grad_norm": 0.31621453166007996, "learning_rate": 0.0002, "epoch": 0.8537058595265813, "step": 1100}, {"loss": 1.739, "grad_norm": 0.2946535050868988, "learning_rate": 0.0002, "epoch": 0.8614668218859138, "step": 1110}, {"loss": 1.7511, "grad_norm": 0.3088909983634949, "learning_rate": 0.0002, "epoch": 0.8692277842452464, "step": 1120}, {"loss": 1.8228, "grad_norm": 0.33033716678619385, "learning_rate": 0.0002, "epoch": 0.8769887466045789, "step": 1130}, {"loss": 1.7912, "grad_norm": 0.2954833507537842, "learning_rate": 0.0002, "epoch": 0.8847497089639115, "step": 1140}, {"loss": 1.8394, "grad_norm": 0.2950248122215271, "learning_rate": 0.0002, "epoch": 0.8925106713232441, "step": 1150}, {"loss": 1.7068, "grad_norm": 0.296661913394928, "learning_rate": 0.0002, "epoch": 0.9002716336825767, "step": 1160}, {"loss": 1.7967, "grad_norm": 0.35451310873031616, "learning_rate": 0.0002, "epoch": 0.9080325960419092, "step": 1170}, {"loss": 1.8202, "grad_norm": 0.32705947756767273, "learning_rate": 0.0002, "epoch": 0.9157935584012418, "step": 1180}, {"loss": 1.7396, "grad_norm": 0.3333960771560669, "learning_rate": 0.0002, "epoch": 0.9235545207605743, "step": 1190}, {"loss": 1.7801, "grad_norm": 0.3042232096195221, "learning_rate": 0.0002, "epoch": 0.9313154831199069, "step": 1200}, {"loss": 1.7586, "grad_norm": 0.281553715467453, "learning_rate": 0.0002, "epoch": 0.9390764454792394, "step": 1210}, {"loss": 1.7953, "grad_norm": 0.3096391558647156, "learning_rate": 0.0002, "epoch": 0.946837407838572, "step": 1220}, {"loss": 1.7401, "grad_norm": 0.2866271734237671, "learning_rate": 0.0002, "epoch": 0.9545983701979045, "step": 1230}, {"loss": 1.7211, "grad_norm": 0.28394097089767456, "learning_rate": 0.0002, "epoch": 0.962359332557237, "step": 1240}, {"loss": 1.7363, "grad_norm": 0.3249266743659973, "learning_rate": 0.0002, "epoch": 0.9701202949165697, "step": 1250}, {"loss": 1.7563, "grad_norm": 0.2896869480609894, "learning_rate": 0.0002, "epoch": 0.9778812572759022, "step": 1260}, {"loss": 1.6389, "grad_norm": 0.29224586486816406, "learning_rate": 0.0002, "epoch": 0.9856422196352348, "step": 1270}, {"loss": 1.7111, "grad_norm": 0.2820223569869995, "learning_rate": 0.0002, "epoch": 0.9934031819945673, "step": 1280}, {"eval_loss": 1.8081045150756836, "eval_runtime": 102.3056, "eval_samples_per_second": 4.956, "eval_steps_per_second": 0.626, "epoch": 0.9996119518820333, "step": 1288}, {"loss": 1.7518, "grad_norm": 0.3282551169395447, "learning_rate": 0.0002, "epoch": 1.0011641443538999, "step": 1290}, {"loss": 1.6806, "grad_norm": 0.30217495560646057, "learning_rate": 0.0002, "epoch": 1.0089251067132325, "step": 1300}, {"loss": 1.6777, "grad_norm": 0.30801767110824585, "learning_rate": 0.0002, "epoch": 1.016686069072565, "step": 1310}, {"loss": 1.7756, "grad_norm": 0.31816792488098145, "learning_rate": 0.0002, "epoch": 1.0244470314318976, "step": 1320}, {"loss": 1.6986, "grad_norm": 0.27794334292411804, "learning_rate": 0.0002, "epoch": 1.03220799379123, "step": 1330}, {"loss": 1.6931, "grad_norm": 0.3018926680088043, "learning_rate": 0.0002, "epoch": 1.0399689561505627, "step": 1340}, {"loss": 1.7033, "grad_norm": 0.3552975356578827, "learning_rate": 0.0002, "epoch": 1.0477299185098952, "step": 1350}, {"loss": 1.6782, "grad_norm": 0.32590144872665405, "learning_rate": 0.0002, "epoch": 1.0554908808692278, "step": 1360}, {"loss": 1.6479, "grad_norm": 0.3435460925102234, "learning_rate": 0.0002, "epoch": 1.0632518432285603, "step": 1370}, {"loss": 1.7451, "grad_norm": 0.35037797689437866, "learning_rate": 0.0002, "epoch": 1.071012805587893, "step": 1380}, {"loss": 1.7868, "grad_norm": 0.31398263573646545, "learning_rate": 0.0002, "epoch": 1.0787737679472253, "step": 1390}, {"loss": 1.6729, "grad_norm": 0.3134010434150696, "learning_rate": 0.0002, "epoch": 1.086534730306558, "step": 1400}, {"loss": 1.751, "grad_norm": 0.4599704444408417, "learning_rate": 0.0002, "epoch": 1.0942956926658907, "step": 1410}, {"loss": 1.6871, "grad_norm": 0.35852891206741333, "learning_rate": 0.0002, "epoch": 1.102056655025223, "step": 1420}, {"loss": 1.7083, "grad_norm": 0.35628634691238403, "learning_rate": 0.0002, "epoch": 1.1098176173845558, "step": 1430}, {"loss": 1.6166, "grad_norm": 0.3769161105155945, "learning_rate": 0.0002, "epoch": 1.1175785797438882, "step": 1440}, {"loss": 1.7344, "grad_norm": 1.3712416887283325, "learning_rate": 0.0002, "epoch": 1.1253395421032208, "step": 1450}, {"loss": 1.6542, "grad_norm": 0.38406670093536377, "learning_rate": 0.0002, "epoch": 1.1331005044625533, "step": 1460}, {"loss": 1.7104, "grad_norm": 0.3402116000652313, "learning_rate": 0.0002, "epoch": 1.140861466821886, "step": 1470}, {"loss": 1.7074, "grad_norm": 0.341189444065094, "learning_rate": 0.0002, "epoch": 1.1486224291812184, "step": 1480}, {"loss": 1.6468, "grad_norm": 0.36629995703697205, "learning_rate": 0.0002, "epoch": 1.156383391540551, "step": 1490}, {"loss": 1.6952, "grad_norm": 0.3499569296836853, "learning_rate": 0.0002, "epoch": 1.1641443538998835, "step": 1500}, {"loss": 1.6625, "grad_norm": 0.3663063943386078, "learning_rate": 0.0002, "epoch": 1.1719053162592161, "step": 1510}, {"loss": 1.7533, "grad_norm": 0.34851500391960144, "learning_rate": 0.0002, "epoch": 1.1796662786185488, "step": 1520}, {"loss": 1.6092, "grad_norm": 0.35071656107902527, "learning_rate": 0.0002, "epoch": 1.1874272409778812, "step": 1530}, {"loss": 1.7206, "grad_norm": 0.42783796787261963, "learning_rate": 0.0002, "epoch": 1.1951882033372139, "step": 1540}, {"loss": 1.7499, "grad_norm": 0.31830692291259766, "learning_rate": 0.0002, "epoch": 1.2029491656965463, "step": 1550}, {"loss": 1.7372, "grad_norm": 0.3597424626350403, "learning_rate": 0.0002, "epoch": 1.210710128055879, "step": 1560}, {"loss": 1.6386, "grad_norm": 0.35233765840530396, "learning_rate": 0.0002, "epoch": 1.2184710904152114, "step": 1570}, {"loss": 1.6766, "grad_norm": 0.35942912101745605, "learning_rate": 0.0002, "epoch": 1.226232052774544, "step": 1580}, {"loss": 1.6598, "grad_norm": 0.36159393191337585, "learning_rate": 0.0002, "epoch": 1.2339930151338767, "step": 1590}, {"loss": 1.6697, "grad_norm": 0.3328469693660736, "learning_rate": 0.0002, "epoch": 1.2417539774932091, "step": 1600}, {"loss": 1.7594, "grad_norm": 0.3089476525783539, "learning_rate": 0.0002, "epoch": 1.2495149398525418, "step": 1610}, {"loss": 1.6805, "grad_norm": 0.30947765707969666, "learning_rate": 0.0002, "epoch": 1.2572759022118742, "step": 1620}, {"loss": 1.6899, "grad_norm": 0.32154011726379395, "learning_rate": 0.0002, "epoch": 1.265036864571207, "step": 1630}, {"loss": 1.6621, "grad_norm": 0.3480297923088074, "learning_rate": 0.0002, "epoch": 1.2727978269305393, "step": 1640}, {"loss": 1.7087, "grad_norm": 0.39471694827079773, "learning_rate": 0.0002, "epoch": 1.280558789289872, "step": 1650}, {"loss": 1.7608, "grad_norm": 0.35728853940963745, "learning_rate": 0.0002, "epoch": 1.2883197516492044, "step": 1660}, {"loss": 1.7008, "grad_norm": 0.35223081707954407, "learning_rate": 0.0002, "epoch": 1.296080714008537, "step": 1670}, {"loss": 1.7253, "grad_norm": 0.3588867485523224, "learning_rate": 0.0002, "epoch": 1.3038416763678695, "step": 1680}, {"loss": 1.6505, "grad_norm": 0.3528042733669281, "learning_rate": 0.0002, "epoch": 1.3116026387272022, "step": 1690}, {"loss": 1.6945, "grad_norm": 0.35975801944732666, "learning_rate": 0.0002, "epoch": 1.3193636010865348, "step": 1700}, {"loss": 1.6631, "grad_norm": 0.36691880226135254, "learning_rate": 0.0002, "epoch": 1.3271245634458673, "step": 1710}, {"loss": 1.7593, "grad_norm": 0.3787977695465088, "learning_rate": 0.0002, "epoch": 1.3348855258052, "step": 1720}, {"loss": 1.7697, "grad_norm": 0.36614933609962463, "learning_rate": 0.0002, "epoch": 1.3426464881645324, "step": 1730}, {"loss": 1.6487, "grad_norm": 0.3484745919704437, "learning_rate": 0.0002, "epoch": 1.350407450523865, "step": 1740}, {"loss": 1.7054, "grad_norm": 0.36905673146247864, "learning_rate": 0.0002, "epoch": 1.3581684128831975, "step": 1750}, {"loss": 1.7679, "grad_norm": 0.41564738750457764, "learning_rate": 0.0002, "epoch": 1.36592937524253, "step": 1760}, {"loss": 1.6634, "grad_norm": 0.3345205783843994, "learning_rate": 0.0002, "epoch": 1.3736903376018628, "step": 1770}, {"loss": 1.7275, "grad_norm": 0.34926071763038635, "learning_rate": 0.0002, "epoch": 1.3814512999611952, "step": 1780}, {"loss": 1.685, "grad_norm": 0.42004233598709106, "learning_rate": 0.0002, "epoch": 1.3892122623205276, "step": 1790}, {"loss": 1.666, "grad_norm": 0.3576236963272095, "learning_rate": 0.0002, "epoch": 1.3969732246798603, "step": 1800}, {"loss": 1.8516, "grad_norm": 0.3586704432964325, "learning_rate": 0.0002, "epoch": 1.404734187039193, "step": 1810}, {"loss": 1.6171, "grad_norm": 0.3943439722061157, "learning_rate": 0.0002, "epoch": 1.4124951493985254, "step": 1820}, {"loss": 1.6865, "grad_norm": 0.3484877049922943, "learning_rate": 0.0002, "epoch": 1.420256111757858, "step": 1830}, {"loss": 1.7205, "grad_norm": 0.3344518840312958, "learning_rate": 0.0002, "epoch": 1.4280170741171905, "step": 1840}, {"loss": 1.6999, "grad_norm": 0.4345698356628418, "learning_rate": 0.0002, "epoch": 1.4357780364765231, "step": 1850}, {"loss": 1.6855, "grad_norm": 0.5525162220001221, "learning_rate": 0.0002, "epoch": 1.4435389988358556, "step": 1860}, {"loss": 1.7143, "grad_norm": 0.37194496393203735, "learning_rate": 0.0002, "epoch": 1.4512999611951882, "step": 1870}, {"loss": 1.7623, "grad_norm": 0.34570157527923584, "learning_rate": 0.0002, "epoch": 1.4590609235545209, "step": 1880}, {"loss": 1.7, "grad_norm": 0.3512282073497772, "learning_rate": 0.0002, "epoch": 1.4668218859138533, "step": 1890}, {"loss": 1.7225, "grad_norm": 0.3443922996520996, "learning_rate": 0.0002, "epoch": 1.4745828482731858, "step": 1900}, {"loss": 1.7393, "grad_norm": 0.3812018036842346, "learning_rate": 0.0002, "epoch": 1.4823438106325184, "step": 1910}, {"loss": 1.7277, "grad_norm": 0.39263492822647095, "learning_rate": 0.0002, "epoch": 1.490104772991851, "step": 1920}, {"loss": 1.6829, "grad_norm": 0.3146156072616577, "learning_rate": 0.0002, "epoch": 1.4978657353511835, "step": 1930}, {"loss": 1.6881, "grad_norm": 0.3653988540172577, "learning_rate": 0.0002, "epoch": 1.505626697710516, "step": 1940}, {"loss": 1.7064, "grad_norm": 0.3966596722602844, "learning_rate": 0.0002, "epoch": 1.5133876600698488, "step": 1950}, {"loss": 1.6942, "grad_norm": 0.3441697359085083, "learning_rate": 0.0002, "epoch": 1.5211486224291813, "step": 1960}, {"loss": 1.7175, "grad_norm": 0.3328564465045929, "learning_rate": 0.0002, "epoch": 1.5289095847885137, "step": 1970}, {"loss": 1.7394, "grad_norm": 0.34068772196769714, "learning_rate": 0.0002, "epoch": 1.5366705471478463, "step": 1980}, {"loss": 1.7016, "grad_norm": 0.3559795916080475, "learning_rate": 0.0002, "epoch": 1.544431509507179, "step": 1990}, {"loss": 1.7102, "grad_norm": 0.37888768315315247, "learning_rate": 0.0002, "epoch": 1.5521924718665114, "step": 2000}, {"loss": 1.7094, "grad_norm": 0.36128363013267517, "learning_rate": 0.0002, "epoch": 1.5599534342258439, "step": 2010}, {"loss": 1.6407, "grad_norm": 0.3643714487552643, "learning_rate": 0.0002, "epoch": 1.5677143965851765, "step": 2020}, {"loss": 1.6777, "grad_norm": 0.3863612115383148, "learning_rate": 0.0002, "epoch": 1.5754753589445092, "step": 2030}, {"loss": 1.6575, "grad_norm": 0.32831457257270813, "learning_rate": 0.0002, "epoch": 1.5832363213038416, "step": 2040}, {"loss": 1.7404, "grad_norm": 0.36098113656044006, "learning_rate": 0.0002, "epoch": 1.5909972836631743, "step": 2050}, {"loss": 1.7065, "grad_norm": 1.1079334020614624, "learning_rate": 0.0002, "epoch": 1.598758246022507, "step": 2060}, {"loss": 1.6824, "grad_norm": 0.35615381598472595, "learning_rate": 0.0002, "epoch": 1.6065192083818394, "step": 2070}, {"loss": 1.7262, "grad_norm": 0.369711309671402, "learning_rate": 0.0002, "epoch": 1.6142801707411718, "step": 2080}, {"loss": 1.6995, "grad_norm": 0.390658438205719, "learning_rate": 0.0002, "epoch": 1.6220411331005045, "step": 2090}, {"loss": 1.6996, "grad_norm": 0.3422999382019043, "learning_rate": 0.0002, "epoch": 1.6298020954598371, "step": 2100}, {"loss": 1.7135, "grad_norm": 0.372475266456604, "learning_rate": 0.0002, "epoch": 1.6375630578191696, "step": 2110}, {"loss": 1.7216, "grad_norm": 0.35660576820373535, "learning_rate": 0.0002, "epoch": 1.645324020178502, "step": 2120}, {"loss": 1.6991, "grad_norm": 0.35754942893981934, "learning_rate": 0.0002, "epoch": 1.6530849825378346, "step": 2130}, {"loss": 1.6779, "grad_norm": 0.34572410583496094, "learning_rate": 0.0002, "epoch": 1.6608459448971673, "step": 2140}, {"loss": 1.6707, "grad_norm": 0.42059701681137085, "learning_rate": 0.0002, "epoch": 1.6686069072564997, "step": 2150}, {"loss": 1.6782, "grad_norm": 0.35200759768486023, "learning_rate": 0.0002, "epoch": 1.6763678696158324, "step": 2160}, {"loss": 1.6869, "grad_norm": 0.3704029321670532, "learning_rate": 0.0002, "epoch": 1.684128831975165, "step": 2170}, {"loss": 1.7192, "grad_norm": 0.40450501441955566, "learning_rate": 0.0002, "epoch": 1.6918897943344975, "step": 2180}, {"loss": 1.6228, "grad_norm": 0.362966924905777, "learning_rate": 0.0002, "epoch": 1.69965075669383, "step": 2190}, {"loss": 1.6935, "grad_norm": 0.36586204171180725, "learning_rate": 0.0002, "epoch": 1.7074117190531626, "step": 2200}, {"loss": 1.6088, "grad_norm": 0.3295372426509857, "learning_rate": 0.0002, "epoch": 1.7151726814124952, "step": 2210}, {"loss": 1.7844, "grad_norm": 0.3892575800418854, "learning_rate": 0.0002, "epoch": 1.7229336437718277, "step": 2220}, {"loss": 1.7805, "grad_norm": 0.34712135791778564, "learning_rate": 0.0002, "epoch": 1.73069460613116, "step": 2230}, {"loss": 1.7353, "grad_norm": 0.34801796078681946, "learning_rate": 0.0002, "epoch": 1.738455568490493, "step": 2240}, {"loss": 1.7009, "grad_norm": 0.3822397291660309, "learning_rate": 0.0002, "epoch": 1.7462165308498254, "step": 2250}, {"loss": 1.6546, "grad_norm": 0.38933250308036804, "learning_rate": 0.0002, "epoch": 1.7539774932091579, "step": 2260}, {"loss": 1.7245, "grad_norm": 0.3798373341560364, "learning_rate": 0.0002, "epoch": 1.7617384555684905, "step": 2270}, {"loss": 1.6508, "grad_norm": 0.35151317715644836, "learning_rate": 0.0002, "epoch": 1.7694994179278232, "step": 2280}, {"loss": 1.6894, "grad_norm": 0.44981494545936584, "learning_rate": 0.0002, "epoch": 1.7772603802871556, "step": 2290}, {"loss": 1.7271, "grad_norm": 0.3992624580860138, "learning_rate": 0.0002, "epoch": 1.785021342646488, "step": 2300}, {"loss": 1.7252, "grad_norm": 0.3772512376308441, "learning_rate": 0.0002, "epoch": 1.7927823050058207, "step": 2310}, {"loss": 1.7057, "grad_norm": 0.3511589467525482, "learning_rate": 0.0002, "epoch": 1.8005432673651534, "step": 2320}, {"loss": 1.764, "grad_norm": 0.3805285394191742, "learning_rate": 0.0002, "epoch": 1.8083042297244858, "step": 2330}, {"loss": 1.6986, "grad_norm": 0.3792071044445038, "learning_rate": 0.0002, "epoch": 1.8160651920838184, "step": 2340}, {"loss": 1.7759, "grad_norm": 0.36430829763412476, "learning_rate": 0.0002, "epoch": 1.823826154443151, "step": 2350}, {"loss": 1.6773, "grad_norm": 0.36502477526664734, "learning_rate": 0.0002, "epoch": 1.8315871168024835, "step": 2360}, {"loss": 1.8072, "grad_norm": 0.35015153884887695, "learning_rate": 0.0002, "epoch": 1.839348079161816, "step": 2370}, {"loss": 1.7734, "grad_norm": 0.3710903823375702, "learning_rate": 0.0002, "epoch": 1.8471090415211486, "step": 2380}, {"loss": 1.6737, "grad_norm": 0.3542828857898712, "learning_rate": 0.0002, "epoch": 1.8548700038804813, "step": 2390}, {"loss": 1.6783, "grad_norm": 0.35467568039894104, "learning_rate": 0.0002, "epoch": 1.8626309662398137, "step": 2400}, {"loss": 1.7773, "grad_norm": 0.3638560473918915, "learning_rate": 0.0002, "epoch": 1.8703919285991462, "step": 2410}, {"loss": 1.7019, "grad_norm": 0.3823298215866089, "learning_rate": 0.0002, "epoch": 1.8781528909584788, "step": 2420}, {"loss": 1.6935, "grad_norm": 0.3926416337490082, "learning_rate": 0.0002, "epoch": 1.8859138533178115, "step": 2430}, {"loss": 1.71, "grad_norm": 0.3608079254627228, "learning_rate": 0.0002, "epoch": 1.893674815677144, "step": 2440}, {"loss": 1.6654, "grad_norm": 0.3426613509654999, "learning_rate": 0.0002, "epoch": 1.9014357780364766, "step": 2450}, {"loss": 1.6892, "grad_norm": 0.3522338569164276, "learning_rate": 0.0002, "epoch": 1.9091967403958092, "step": 2460}, {"loss": 1.7307, "grad_norm": 0.3608049154281616, "learning_rate": 0.0002, "epoch": 1.9169577027551417, "step": 2470}, {"loss": 1.6823, "grad_norm": 0.3849755525588989, "learning_rate": 0.0002, "epoch": 1.924718665114474, "step": 2480}, {"loss": 1.7518, "grad_norm": 0.4154011011123657, "learning_rate": 0.0002, "epoch": 1.9324796274738067, "step": 2490}, {"loss": 1.7381, "grad_norm": 0.3602796792984009, "learning_rate": 0.0002, "epoch": 1.9402405898331394, "step": 2500}, {"loss": 1.7843, "grad_norm": 0.3702992796897888, "learning_rate": 0.0002, "epoch": 1.9480015521924718, "step": 2510}, {"loss": 1.6669, "grad_norm": 0.3657735288143158, "learning_rate": 0.0002, "epoch": 1.9557625145518043, "step": 2520}, {"loss": 1.5964, "grad_norm": 0.41031739115715027, "learning_rate": 0.0002, "epoch": 1.963523476911137, "step": 2530}, {"loss": 1.6745, "grad_norm": 0.34578680992126465, "learning_rate": 0.0002, "epoch": 1.9712844392704696, "step": 2540}, {"loss": 1.723, "grad_norm": 0.3361521065235138, "learning_rate": 0.0002, "epoch": 1.979045401629802, "step": 2550}, {"loss": 1.6868, "grad_norm": 0.34342363476753235, "learning_rate": 0.0002, "epoch": 1.9868063639891347, "step": 2560}, {"loss": 1.6577, "grad_norm": 0.32954007387161255, "learning_rate": 0.0002, "epoch": 1.9945673263484673, "step": 2570}, {"eval_loss": 1.8068748712539673, "eval_runtime": 105.5885, "eval_samples_per_second": 4.802, "eval_steps_per_second": 0.606, "epoch": 2.0, "step": 2577}, {"loss": 1.634, "grad_norm": 0.336302250623703, "learning_rate": 0.0002, "epoch": 2.0023282887077998, "step": 2580}, {"loss": 1.612, "grad_norm": 0.3627048432826996, "learning_rate": 0.0002, "epoch": 2.010089251067132, "step": 2590}, {"loss": 1.4908, "grad_norm": 0.38406702876091003, "learning_rate": 0.0002, "epoch": 2.017850213426465, "step": 2600}, {"loss": 1.5368, "grad_norm": 0.5326781272888184, "learning_rate": 0.0002, "epoch": 2.0256111757857975, "step": 2610}, {"loss": 1.5727, "grad_norm": 0.4774554967880249, "learning_rate": 0.0002, "epoch": 2.03337213814513, "step": 2620}, {"loss": 1.5422, "grad_norm": 0.4251810312271118, "learning_rate": 0.0002, "epoch": 2.0411331005044624, "step": 2630}, {"loss": 1.5152, "grad_norm": 0.4693007171154022, "learning_rate": 0.0002, "epoch": 2.0488940628637953, "step": 2640}, {"loss": 1.6137, "grad_norm": 0.46371519565582275, "learning_rate": 0.0002, "epoch": 2.0566550252231277, "step": 2650}, {"loss": 1.6304, "grad_norm": 0.46652570366859436, "learning_rate": 0.0002, "epoch": 2.06441598758246, "step": 2660}, {"loss": 1.6022, "grad_norm": 0.45200315117836, "learning_rate": 0.0002, "epoch": 2.0721769499417926, "step": 2670}, {"loss": 1.5358, "grad_norm": 0.42905205488204956, "learning_rate": 0.0002, "epoch": 2.0799379123011255, "step": 2680}, {"loss": 1.5401, "grad_norm": 0.44509148597717285, "learning_rate": 0.0002, "epoch": 2.087698874660458, "step": 2690}, {"loss": 1.5303, "grad_norm": 0.4445319175720215, "learning_rate": 0.0002, "epoch": 2.0954598370197903, "step": 2700}, {"loss": 1.5701, "grad_norm": 0.46825504302978516, "learning_rate": 0.0002, "epoch": 2.103220799379123, "step": 2710}, {"loss": 1.5751, "grad_norm": 0.4623856842517853, "learning_rate": 0.0002, "epoch": 2.1109817617384556, "step": 2720}, {"loss": 1.5601, "grad_norm": 0.4833452105522156, "learning_rate": 0.0002, "epoch": 2.118742724097788, "step": 2730}, {"loss": 1.5997, "grad_norm": 0.4582686722278595, "learning_rate": 0.0002, "epoch": 2.1265036864571205, "step": 2740}, {"loss": 1.5801, "grad_norm": 0.47587934136390686, "learning_rate": 0.0002, "epoch": 2.1342646488164534, "step": 2750}, {"loss": 1.594, "grad_norm": 0.4602217972278595, "learning_rate": 0.0002, "epoch": 2.142025611175786, "step": 2760}, {"loss": 1.5271, "grad_norm": 0.47501352429389954, "learning_rate": 0.0002, "epoch": 2.1497865735351183, "step": 2770}, {"loss": 1.4862, "grad_norm": 0.5078499913215637, "learning_rate": 0.0002, "epoch": 2.1575475358944507, "step": 2780}, {"loss": 1.6236, "grad_norm": 0.497704416513443, "learning_rate": 0.0002, "epoch": 2.1653084982537836, "step": 2790}, {"loss": 1.5597, "grad_norm": 0.5435971617698669, "learning_rate": 0.0002, "epoch": 2.173069460613116, "step": 2800}, {"loss": 1.5926, "grad_norm": 0.5172356367111206, "learning_rate": 0.0002, "epoch": 2.1808304229724484, "step": 2810}, {"loss": 1.5202, "grad_norm": 0.44063422083854675, "learning_rate": 0.0002, "epoch": 2.1885913853317813, "step": 2820}, {"loss": 1.6041, "grad_norm": 0.5079569220542908, "learning_rate": 0.0002, "epoch": 2.1963523476911138, "step": 2830}, {"loss": 1.5915, "grad_norm": 0.45658132433891296, "learning_rate": 0.0002, "epoch": 2.204113310050446, "step": 2840}, {"loss": 1.5546, "grad_norm": 0.5103023648262024, "learning_rate": 0.0002, "epoch": 2.2118742724097786, "step": 2850}, {"loss": 1.6197, "grad_norm": 0.4882226288318634, "learning_rate": 0.0002, "epoch": 2.2196352347691115, "step": 2860}, {"loss": 1.5996, "grad_norm": 0.5087296962738037, "learning_rate": 0.0002, "epoch": 2.227396197128444, "step": 2870}, {"loss": 1.5451, "grad_norm": 0.45293712615966797, "learning_rate": 0.0002, "epoch": 2.2351571594877764, "step": 2880}, {"loss": 1.6214, "grad_norm": 0.5120379328727722, "learning_rate": 0.0002, "epoch": 2.242918121847109, "step": 2890}, {"loss": 1.5273, "grad_norm": 0.47126415371894836, "learning_rate": 0.0002, "epoch": 2.2506790842064417, "step": 2900}, {"loss": 1.612, "grad_norm": 0.44005846977233887, "learning_rate": 0.0002, "epoch": 2.258440046565774, "step": 2910}, {"loss": 1.6023, "grad_norm": 0.46476176381111145, "learning_rate": 0.0002, "epoch": 2.2662010089251066, "step": 2920}, {"loss": 1.6417, "grad_norm": 0.48051515221595764, "learning_rate": 0.0002, "epoch": 2.2739619712844394, "step": 2930}, {"loss": 1.587, "grad_norm": 0.480069637298584, "learning_rate": 0.0002, "epoch": 2.281722933643772, "step": 2940}, {"loss": 1.5747, "grad_norm": 0.5122102499008179, "learning_rate": 0.0002, "epoch": 2.2894838960031043, "step": 2950}, {"loss": 1.5183, "grad_norm": 0.48879891633987427, "learning_rate": 0.0002, "epoch": 2.2972448583624367, "step": 2960}, {"loss": 1.5483, "grad_norm": 0.4973136782646179, "learning_rate": 0.0002, "epoch": 2.3050058207217696, "step": 2970}, {"loss": 1.677, "grad_norm": 0.5522695183753967, "learning_rate": 0.0002, "epoch": 2.312766783081102, "step": 2980}, {"loss": 1.5946, "grad_norm": 0.5220217704772949, "learning_rate": 0.0002, "epoch": 2.3205277454404345, "step": 2990}, {"loss": 1.6299, "grad_norm": 0.4978662431240082, "learning_rate": 0.0002, "epoch": 2.328288707799767, "step": 3000}, {"loss": 1.5498, "grad_norm": 0.554053544998169, "learning_rate": 0.0002, "epoch": 2.3360496701591, "step": 3010}, {"loss": 1.5356, "grad_norm": 0.4703886806964874, "learning_rate": 0.0002, "epoch": 2.3438106325184322, "step": 3020}, {"loss": 1.5418, "grad_norm": 0.5074123740196228, "learning_rate": 0.0002, "epoch": 2.3515715948777647, "step": 3030}, {"loss": 1.6873, "grad_norm": 0.5088278651237488, "learning_rate": 0.0002, "epoch": 2.3593325572370976, "step": 3040}, {"loss": 1.5249, "grad_norm": 0.4752114415168762, "learning_rate": 0.0002, "epoch": 2.36709351959643, "step": 3050}, {"loss": 1.5353, "grad_norm": 0.5121659636497498, "learning_rate": 0.0002, "epoch": 2.3748544819557624, "step": 3060}, {"loss": 1.6426, "grad_norm": 0.48649218678474426, "learning_rate": 0.0002, "epoch": 2.3826154443150953, "step": 3070}, {"loss": 1.6136, "grad_norm": 0.5209488868713379, "learning_rate": 0.0002, "epoch": 2.3903764066744277, "step": 3080}, {"loss": 1.597, "grad_norm": 0.5110517740249634, "learning_rate": 0.0002, "epoch": 2.39813736903376, "step": 3090}, {"loss": 1.5773, "grad_norm": 0.5609337091445923, "learning_rate": 0.0002, "epoch": 2.4058983313930926, "step": 3100}, {"loss": 1.5438, "grad_norm": 0.5191826224327087, "learning_rate": 0.0002, "epoch": 2.4136592937524255, "step": 3110}, {"loss": 1.6347, "grad_norm": 0.4876069724559784, "learning_rate": 0.0002, "epoch": 2.421420256111758, "step": 3120}, {"loss": 1.5565, "grad_norm": 0.4713933765888214, "learning_rate": 0.0002, "epoch": 2.4291812184710904, "step": 3130}, {"loss": 1.6388, "grad_norm": 0.5102227330207825, "learning_rate": 0.0002, "epoch": 2.436942180830423, "step": 3140}, {"loss": 1.5667, "grad_norm": 0.44546666741371155, "learning_rate": 0.0002, "epoch": 2.4447031431897557, "step": 3150}, {"loss": 1.5973, "grad_norm": 0.5167558193206787, "learning_rate": 0.0002, "epoch": 2.452464105549088, "step": 3160}, {"loss": 1.5673, "grad_norm": 0.5226958990097046, "learning_rate": 0.0002, "epoch": 2.4602250679084205, "step": 3170}, {"loss": 1.5758, "grad_norm": 0.4751799702644348, "learning_rate": 0.0002, "epoch": 2.4679860302677534, "step": 3180}, {"loss": 1.6234, "grad_norm": 0.4744729697704315, "learning_rate": 0.0002, "epoch": 2.475746992627086, "step": 3190}, {"loss": 1.5661, "grad_norm": 0.5203230381011963, "learning_rate": 0.0002, "epoch": 2.4835079549864183, "step": 3200}, {"loss": 1.493, "grad_norm": 0.47209781408309937, "learning_rate": 0.0002, "epoch": 2.4912689173457507, "step": 3210}, {"loss": 1.6415, "grad_norm": 0.5241674780845642, "learning_rate": 0.0002, "epoch": 2.4990298797050836, "step": 3220}, {"loss": 1.6324, "grad_norm": 0.5152244567871094, "learning_rate": 0.0002, "epoch": 2.506790842064416, "step": 3230}, {"loss": 1.6248, "grad_norm": 0.5216741561889648, "learning_rate": 0.0002, "epoch": 2.5145518044237485, "step": 3240}, {"loss": 1.5668, "grad_norm": 0.4953259527683258, "learning_rate": 0.0002, "epoch": 2.522312766783081, "step": 3250}, {"loss": 1.666, "grad_norm": 0.5973829030990601, "learning_rate": 0.0002, "epoch": 2.530073729142414, "step": 3260}, {"loss": 1.5295, "grad_norm": 0.48804202675819397, "learning_rate": 0.0002, "epoch": 2.5378346915017462, "step": 3270}, {"loss": 1.4954, "grad_norm": 0.5334644317626953, "learning_rate": 0.0002, "epoch": 2.5455956538610787, "step": 3280}, {"loss": 1.5814, "grad_norm": 0.46873313188552856, "learning_rate": 0.0002, "epoch": 2.5533566162204115, "step": 3290}, {"loss": 1.5362, "grad_norm": 0.4282589554786682, "learning_rate": 0.0002, "epoch": 2.561117578579744, "step": 3300}, {"loss": 1.6278, "grad_norm": 0.4848293960094452, "learning_rate": 0.0002, "epoch": 2.5688785409390764, "step": 3310}, {"loss": 1.6308, "grad_norm": 0.5093745589256287, "learning_rate": 0.0002, "epoch": 2.576639503298409, "step": 3320}, {"loss": 1.6375, "grad_norm": 0.5084842443466187, "learning_rate": 0.0002, "epoch": 2.5844004656577413, "step": 3330}, {"loss": 1.6168, "grad_norm": 0.4696281850337982, "learning_rate": 0.0002, "epoch": 2.592161428017074, "step": 3340}, {"loss": 1.5359, "grad_norm": 0.5767765641212463, "learning_rate": 0.0002, "epoch": 2.5999223903764066, "step": 3350}, {"loss": 1.6097, "grad_norm": 0.47300875186920166, "learning_rate": 0.0002, "epoch": 2.607683352735739, "step": 3360}, {"loss": 1.6138, "grad_norm": 0.4809158146381378, "learning_rate": 0.0002, "epoch": 2.615444315095072, "step": 3370}, {"loss": 1.4952, "grad_norm": 0.5141063928604126, "learning_rate": 0.0002, "epoch": 2.6232052774544043, "step": 3380}, {"loss": 1.5784, "grad_norm": 0.4832935035228729, "learning_rate": 0.0002, "epoch": 2.630966239813737, "step": 3390}, {"loss": 1.5796, "grad_norm": 0.5044625401496887, "learning_rate": 0.0002, "epoch": 2.6387272021730697, "step": 3400}, {"loss": 1.6202, "grad_norm": 0.5287680625915527, "learning_rate": 0.0002, "epoch": 2.646488164532402, "step": 3410}, {"loss": 1.5423, "grad_norm": 0.5306379795074463, "learning_rate": 0.0002, "epoch": 2.6542491268917345, "step": 3420}, {"loss": 1.5264, "grad_norm": 0.5849291682243347, "learning_rate": 0.0002, "epoch": 2.662010089251067, "step": 3430}, {"loss": 1.5937, "grad_norm": 0.7951080799102783, "learning_rate": 0.0002, "epoch": 2.6697710516104, "step": 3440}, {"loss": 1.5791, "grad_norm": 0.48087653517723083, "learning_rate": 0.0002, "epoch": 2.6775320139697323, "step": 3450}, {"loss": 1.6769, "grad_norm": 0.5396431684494019, "learning_rate": 0.0002, "epoch": 2.6852929763290647, "step": 3460}, {"loss": 1.606, "grad_norm": 0.5481634736061096, "learning_rate": 0.0002, "epoch": 2.693053938688397, "step": 3470}, {"loss": 1.6436, "grad_norm": 0.5068731307983398, "learning_rate": 0.0002, "epoch": 2.70081490104773, "step": 3480}, {"loss": 1.5738, "grad_norm": 0.5759826898574829, "learning_rate": 0.0002, "epoch": 2.7085758634070625, "step": 3490}, {"loss": 1.596, "grad_norm": 0.7253932952880859, "learning_rate": 0.0002, "epoch": 2.716336825766395, "step": 3500}, {"loss": 1.5791, "grad_norm": 0.527745246887207, "learning_rate": 0.0002, "epoch": 2.724097788125728, "step": 3510}, {"loss": 1.5874, "grad_norm": 0.5279242396354675, "learning_rate": 0.0002, "epoch": 2.73185875048506, "step": 3520}, {"loss": 1.6768, "grad_norm": 0.5047839283943176, "learning_rate": 0.0002, "epoch": 2.7396197128443927, "step": 3530}, {"loss": 1.5517, "grad_norm": 0.5430883169174194, "learning_rate": 0.0002, "epoch": 2.7473806752037255, "step": 3540}, {"loss": 1.5624, "grad_norm": 0.4496723711490631, "learning_rate": 0.0002, "epoch": 2.755141637563058, "step": 3550}, {"loss": 1.5789, "grad_norm": 0.5063338875770569, "learning_rate": 0.0002, "epoch": 2.7629025999223904, "step": 3560}, {"loss": 1.52, "grad_norm": 0.4619026780128479, "learning_rate": 0.0002, "epoch": 2.770663562281723, "step": 3570}, {"loss": 1.5793, "grad_norm": 0.4753304123878479, "learning_rate": 0.0002, "epoch": 2.7784245246410553, "step": 3580}, {"loss": 1.5715, "grad_norm": 0.5422708988189697, "learning_rate": 0.0002, "epoch": 2.786185487000388, "step": 3590}, {"loss": 1.5926, "grad_norm": 0.4756578803062439, "learning_rate": 0.0002, "epoch": 2.7939464493597206, "step": 3600}, {"loss": 1.5358, "grad_norm": 0.5057567358016968, "learning_rate": 0.0002, "epoch": 2.801707411719053, "step": 3610}, {"loss": 1.6131, "grad_norm": 0.5410919785499573, "learning_rate": 0.0002, "epoch": 2.809468374078386, "step": 3620}, {"loss": 1.5573, "grad_norm": 0.4958136975765228, "learning_rate": 0.0002, "epoch": 2.8172293364377183, "step": 3630}, {"loss": 1.6324, "grad_norm": 0.454527348279953, "learning_rate": 0.0002, "epoch": 2.8249902987970508, "step": 3640}, {"loss": 1.5582, "grad_norm": 0.5092706084251404, "learning_rate": 0.0002, "epoch": 2.8327512611563836, "step": 3650}, {"loss": 1.5893, "grad_norm": 0.5314022302627563, "learning_rate": 0.0002, "epoch": 2.840512223515716, "step": 3660}, {"loss": 1.588, "grad_norm": 0.5028239488601685, "learning_rate": 0.0002, "epoch": 2.8482731858750485, "step": 3670}, {"loss": 1.5751, "grad_norm": 0.5127444863319397, "learning_rate": 0.0002, "epoch": 2.856034148234381, "step": 3680}, {"loss": 1.6018, "grad_norm": 0.5045645236968994, "learning_rate": 0.0002, "epoch": 2.8637951105937134, "step": 3690}, {"loss": 1.5788, "grad_norm": 0.5560781955718994, "learning_rate": 0.0002, "epoch": 2.8715560729530463, "step": 3700}, {"loss": 1.5988, "grad_norm": 0.5177600383758545, "learning_rate": 0.0002, "epoch": 2.8793170353123787, "step": 3710}, {"loss": 1.6009, "grad_norm": 0.45830899477005005, "learning_rate": 0.0002, "epoch": 2.887077997671711, "step": 3720}, {"loss": 1.6344, "grad_norm": 0.4828629195690155, "learning_rate": 0.0002, "epoch": 2.894838960031044, "step": 3730}, {"loss": 1.6758, "grad_norm": 0.48241183161735535, "learning_rate": 0.0002, "epoch": 2.9025999223903765, "step": 3740}, {"loss": 1.5649, "grad_norm": 0.4909592568874359, "learning_rate": 0.0002, "epoch": 2.910360884749709, "step": 3750}, {"loss": 1.4927, "grad_norm": 0.44677025079727173, "learning_rate": 0.0002, "epoch": 2.9181218471090418, "step": 3760}, {"loss": 1.5067, "grad_norm": 0.4928834140300751, "learning_rate": 0.0002, "epoch": 2.925882809468374, "step": 3770}, {"loss": 1.5843, "grad_norm": 0.5673553347587585, "learning_rate": 0.0002, "epoch": 2.9336437718277066, "step": 3780}, {"loss": 1.5566, "grad_norm": 0.548190712928772, "learning_rate": 0.0002, "epoch": 2.941404734187039, "step": 3790}, {"loss": 1.5892, "grad_norm": 0.48979803919792175, "learning_rate": 0.0002, "epoch": 2.9491656965463715, "step": 3800}, {"loss": 1.5589, "grad_norm": 0.533191978931427, "learning_rate": 0.0002, "epoch": 2.9569266589057044, "step": 3810}, {"loss": 1.584, "grad_norm": 0.5362946391105652, "learning_rate": 0.0002, "epoch": 2.964687621265037, "step": 3820}, {"loss": 1.6602, "grad_norm": 0.4724906384944916, "learning_rate": 0.0002, "epoch": 2.9724485836243693, "step": 3830}, {"loss": 1.5834, "grad_norm": 0.5468461513519287, "learning_rate": 0.0002, "epoch": 2.980209545983702, "step": 3840}, {"loss": 1.6316, "grad_norm": 0.4697108864784241, "learning_rate": 0.0002, "epoch": 2.9879705083430346, "step": 3850}, {"loss": 1.6312, "grad_norm": 0.4780906140804291, "learning_rate": 0.0002, "epoch": 2.995731470702367, "step": 3860}, {"eval_loss": 1.8472607135772705, "eval_runtime": 106.5541, "eval_samples_per_second": 4.758, "eval_steps_per_second": 0.601, "epoch": 2.9996119518820334, "step": 3865}, {"loss": 1.4983, "grad_norm": 0.5645653605461121, "learning_rate": 0.0002, "epoch": 3.0034924330616994, "step": 3870}, {"loss": 1.4334, "grad_norm": 0.6457151174545288, "learning_rate": 0.0002, "epoch": 3.0112533954210323, "step": 3880}, {"loss": 1.3899, "grad_norm": 0.583838164806366, "learning_rate": 0.0002, "epoch": 3.0190143577803648, "step": 3890}, {"loss": 1.3258, "grad_norm": 0.6819260120391846, "learning_rate": 0.0002, "epoch": 3.026775320139697, "step": 3900}, {"loss": 1.3458, "grad_norm": 0.6692903637886047, "learning_rate": 0.0002, "epoch": 3.03453628249903, "step": 3910}, {"loss": 1.4356, "grad_norm": 0.6101024746894836, "learning_rate": 0.0002, "epoch": 3.0422972448583625, "step": 3920}, {"loss": 1.394, "grad_norm": 0.7014093399047852, "learning_rate": 0.0002, "epoch": 3.050058207217695, "step": 3930}, {"loss": 1.3885, "grad_norm": 0.7380381226539612, "learning_rate": 0.0002, "epoch": 3.0578191695770274, "step": 3940}, {"loss": 1.4206, "grad_norm": 0.6607900857925415, "learning_rate": 0.0002, "epoch": 3.0655801319363603, "step": 3950}, {"loss": 1.4293, "grad_norm": 0.735263466835022, "learning_rate": 0.0002, "epoch": 3.0733410942956927, "step": 3960}, {"loss": 1.3966, "grad_norm": 0.6788513660430908, "learning_rate": 0.0002, "epoch": 3.081102056655025, "step": 3970}, {"loss": 1.3435, "grad_norm": 0.6347652673721313, "learning_rate": 0.0002, "epoch": 3.088863019014358, "step": 3980}, {"loss": 1.4518, "grad_norm": 0.7056642770767212, "learning_rate": 0.0002, "epoch": 3.0966239813736904, "step": 3990}, {"loss": 1.4474, "grad_norm": 0.6387075185775757, "learning_rate": 0.0002, "epoch": 3.104384943733023, "step": 4000}, {"loss": 1.3833, "grad_norm": 0.6701116561889648, "learning_rate": 0.0002, "epoch": 3.1121459060923553, "step": 4010}, {"loss": 1.404, "grad_norm": 0.7558449506759644, "learning_rate": 0.0002, "epoch": 3.119906868451688, "step": 4020}, {"loss": 1.3294, "grad_norm": 0.6612881422042847, "learning_rate": 0.0002, "epoch": 3.1276678308110206, "step": 4030}, {"loss": 1.439, "grad_norm": 0.7474587559700012, "learning_rate": 0.0002, "epoch": 3.135428793170353, "step": 4040}, {"loss": 1.4616, "grad_norm": 0.7292373776435852, "learning_rate": 0.0002, "epoch": 3.1431897555296855, "step": 4050}, {"loss": 1.3908, "grad_norm": 0.7432886958122253, "learning_rate": 0.0002, "epoch": 3.1509507178890184, "step": 4060}, {"loss": 1.4214, "grad_norm": 0.6366098523139954, "learning_rate": 0.0002, "epoch": 3.158711680248351, "step": 4070}, {"loss": 1.5044, "grad_norm": 0.6837611794471741, "learning_rate": 0.0002, "epoch": 3.1664726426076832, "step": 4080}, {"loss": 1.4332, "grad_norm": 0.7194393277168274, "learning_rate": 0.0002, "epoch": 3.174233604967016, "step": 4090}, {"loss": 1.3628, "grad_norm": 0.6963607668876648, "learning_rate": 0.0002, "epoch": 3.1819945673263486, "step": 4100}, {"loss": 1.4127, "grad_norm": 0.6404902935028076, "learning_rate": 0.0002, "epoch": 3.189755529685681, "step": 4110}, {"loss": 1.4394, "grad_norm": 0.7172070741653442, "learning_rate": 0.0002, "epoch": 3.1975164920450134, "step": 4120}, {"loss": 1.4658, "grad_norm": 0.6577759385108948, "learning_rate": 0.0002, "epoch": 3.2052774544043463, "step": 4130}, {"loss": 1.4019, "grad_norm": 0.6658480167388916, "learning_rate": 0.0002, "epoch": 3.2130384167636787, "step": 4140}, {"loss": 1.4348, "grad_norm": 0.6771699786186218, "learning_rate": 0.0002, "epoch": 3.220799379123011, "step": 4150}, {"loss": 1.4736, "grad_norm": 0.699035108089447, "learning_rate": 0.0002, "epoch": 3.2285603414823436, "step": 4160}, {"loss": 1.4096, "grad_norm": 0.7218514680862427, "learning_rate": 0.0002, "epoch": 3.2363213038416765, "step": 4170}, {"loss": 1.3637, "grad_norm": 0.6270631551742554, "learning_rate": 0.0002, "epoch": 3.244082266201009, "step": 4180}, {"loss": 1.4076, "grad_norm": 0.6828921437263489, "learning_rate": 0.0002, "epoch": 3.2518432285603414, "step": 4190}, {"loss": 1.4663, "grad_norm": 0.6005498170852661, "learning_rate": 0.0002, "epoch": 3.2596041909196742, "step": 4200}, {"loss": 1.4798, "grad_norm": 0.6974790692329407, "learning_rate": 0.0002, "epoch": 3.2673651532790067, "step": 4210}, {"loss": 1.5012, "grad_norm": 0.7269543409347534, "learning_rate": 0.0002, "epoch": 3.275126115638339, "step": 4220}, {"loss": 1.3848, "grad_norm": 0.6728787422180176, "learning_rate": 0.0002, "epoch": 3.2828870779976715, "step": 4230}, {"loss": 1.4112, "grad_norm": 0.676972508430481, "learning_rate": 0.0002, "epoch": 3.2906480403570044, "step": 4240}, {"loss": 1.4206, "grad_norm": 0.748309314250946, "learning_rate": 0.0002, "epoch": 3.298409002716337, "step": 4250}, {"loss": 1.4973, "grad_norm": 0.6976589560508728, "learning_rate": 0.0002, "epoch": 3.3061699650756693, "step": 4260}, {"loss": 1.3967, "grad_norm": 0.649780809879303, "learning_rate": 0.0002, "epoch": 3.3139309274350017, "step": 4270}, {"loss": 1.327, "grad_norm": 0.6529902815818787, "learning_rate": 0.0002, "epoch": 3.3216918897943346, "step": 4280}, {"loss": 1.4888, "grad_norm": 0.9273163676261902, "learning_rate": 0.0002, "epoch": 3.329452852153667, "step": 4290}, {"loss": 1.4859, "grad_norm": 0.717024028301239, "learning_rate": 0.0002, "epoch": 3.3372138145129995, "step": 4300}, {"loss": 1.4441, "grad_norm": 0.7914950251579285, "learning_rate": 0.0002, "epoch": 3.3449747768723324, "step": 4310}, {"loss": 1.432, "grad_norm": 0.7133203148841858, "learning_rate": 0.0002, "epoch": 3.352735739231665, "step": 4320}, {"loss": 1.4662, "grad_norm": 0.7409568428993225, "learning_rate": 0.0002, "epoch": 3.3604967015909972, "step": 4330}, {"loss": 1.3992, "grad_norm": 0.6993981003761292, "learning_rate": 0.0002, "epoch": 3.3682576639503297, "step": 4340}, {"loss": 1.4261, "grad_norm": 0.7114535570144653, "learning_rate": 0.0002, "epoch": 3.3760186263096625, "step": 4350}, {"loss": 1.4227, "grad_norm": 0.6790860295295715, "learning_rate": 0.0002, "epoch": 3.383779588668995, "step": 4360}, {"loss": 1.4128, "grad_norm": 0.6507849097251892, "learning_rate": 0.0002, "epoch": 3.3915405510283274, "step": 4370}, {"loss": 1.4559, "grad_norm": 0.5967804193496704, "learning_rate": 0.0002, "epoch": 3.39930151338766, "step": 4380}, {"loss": 1.3687, "grad_norm": 0.6625847816467285, "learning_rate": 0.0002, "epoch": 3.4070624757469927, "step": 4390}, {"loss": 1.4193, "grad_norm": 0.6736508011817932, "learning_rate": 0.0002, "epoch": 3.414823438106325, "step": 4400}, {"loss": 1.4363, "grad_norm": 0.7870860695838928, "learning_rate": 0.0002, "epoch": 3.4225844004656576, "step": 4410}, {"loss": 1.4114, "grad_norm": 0.7205295562744141, "learning_rate": 0.0002, "epoch": 3.4303453628249905, "step": 4420}, {"loss": 1.4131, "grad_norm": 0.6634634137153625, "learning_rate": 0.0002, "epoch": 3.438106325184323, "step": 4430}, {"loss": 1.4683, "grad_norm": 0.7562733292579651, "learning_rate": 0.0002, "epoch": 3.4458672875436553, "step": 4440}, {"loss": 1.3486, "grad_norm": 0.6585879921913147, "learning_rate": 0.0002, "epoch": 3.453628249902988, "step": 4450}, {"loss": 1.4283, "grad_norm": 0.6896792054176331, "learning_rate": 0.0002, "epoch": 3.4613892122623207, "step": 4460}, {"loss": 1.4208, "grad_norm": 0.6520342230796814, "learning_rate": 0.0002, "epoch": 3.469150174621653, "step": 4470}, {"loss": 1.3423, "grad_norm": 0.6760806441307068, "learning_rate": 0.0002, "epoch": 3.4769111369809855, "step": 4480}, {"loss": 1.4398, "grad_norm": 0.7539774179458618, "learning_rate": 0.0002, "epoch": 3.484672099340318, "step": 4490}, {"loss": 1.4534, "grad_norm": 0.7409411668777466, "learning_rate": 0.0002, "epoch": 3.492433061699651, "step": 4500}, {"loss": 1.4069, "grad_norm": 0.6876253485679626, "learning_rate": 0.0002, "epoch": 3.5001940240589833, "step": 4510}, {"loss": 1.4228, "grad_norm": 0.7028461694717407, "learning_rate": 0.0002, "epoch": 3.5079549864183157, "step": 4520}, {"loss": 1.4723, "grad_norm": 0.8056529760360718, "learning_rate": 0.0002, "epoch": 3.5157159487776486, "step": 4530}, {"loss": 1.4148, "grad_norm": 0.711338996887207, "learning_rate": 0.0002, "epoch": 3.523476911136981, "step": 4540}, {"loss": 1.5247, "grad_norm": 0.7343552708625793, "learning_rate": 0.0002, "epoch": 3.5312378734963135, "step": 4550}, {"loss": 1.4308, "grad_norm": 0.745479941368103, "learning_rate": 0.0002, "epoch": 3.5389988358556463, "step": 4560}, {"loss": 1.4229, "grad_norm": 0.7582294940948486, "learning_rate": 0.0002, "epoch": 3.5467597982149788, "step": 4570}, {"loss": 1.4127, "grad_norm": 0.6717444658279419, "learning_rate": 0.0002, "epoch": 3.554520760574311, "step": 4580}, {"loss": 1.4368, "grad_norm": 0.7417883276939392, "learning_rate": 0.0002, "epoch": 3.5622817229336436, "step": 4590}, {"loss": 1.4176, "grad_norm": 0.6385737061500549, "learning_rate": 0.0002, "epoch": 3.570042685292976, "step": 4600}, {"loss": 1.3981, "grad_norm": 0.716704249382019, "learning_rate": 0.0002, "epoch": 3.577803647652309, "step": 4610}, {"loss": 1.3889, "grad_norm": 0.6948980093002319, "learning_rate": 0.0002, "epoch": 3.5855646100116414, "step": 4620}, {"loss": 1.5177, "grad_norm": 0.6961140036582947, "learning_rate": 0.0002, "epoch": 3.593325572370974, "step": 4630}, {"loss": 1.4508, "grad_norm": 0.7493122220039368, "learning_rate": 0.0002, "epoch": 3.6010865347303067, "step": 4640}, {"loss": 1.3987, "grad_norm": 0.7431658506393433, "learning_rate": 0.0002, "epoch": 3.608847497089639, "step": 4650}, {"loss": 1.4551, "grad_norm": 0.8353387713432312, "learning_rate": 0.0002, "epoch": 3.6166084594489716, "step": 4660}, {"loss": 1.4533, "grad_norm": 0.7095612287521362, "learning_rate": 0.0002, "epoch": 3.6243694218083045, "step": 4670}, {"loss": 1.4003, "grad_norm": 0.776620090007782, "learning_rate": 0.0002, "epoch": 3.632130384167637, "step": 4680}, {"loss": 1.4361, "grad_norm": 0.7198925018310547, "learning_rate": 0.0002, "epoch": 3.6398913465269693, "step": 4690}, {"loss": 1.4543, "grad_norm": 0.8238834738731384, "learning_rate": 0.0002, "epoch": 3.6476523088863018, "step": 4700}, {"loss": 1.3958, "grad_norm": 0.6804245710372925, "learning_rate": 0.0002, "epoch": 3.655413271245634, "step": 4710}, {"loss": 1.4158, "grad_norm": 0.8444845676422119, "learning_rate": 0.0002, "epoch": 3.663174233604967, "step": 4720}, {"loss": 1.3825, "grad_norm": 0.743797779083252, "learning_rate": 0.0002, "epoch": 3.6709351959642995, "step": 4730}, {"loss": 1.4213, "grad_norm": 0.8994188904762268, "learning_rate": 0.0002, "epoch": 3.678696158323632, "step": 4740}, {"loss": 1.4281, "grad_norm": 0.75416100025177, "learning_rate": 0.0002, "epoch": 3.686457120682965, "step": 4750}, {"loss": 1.4154, "grad_norm": 0.6499266028404236, "learning_rate": 0.0002, "epoch": 3.6942180830422973, "step": 4760}, {"loss": 1.4005, "grad_norm": 0.7246791124343872, "learning_rate": 0.0002, "epoch": 3.7019790454016297, "step": 4770}, {"loss": 1.426, "grad_norm": 0.7831124067306519, "learning_rate": 0.0002, "epoch": 3.7097400077609626, "step": 4780}, {"loss": 1.3933, "grad_norm": 0.7130028009414673, "learning_rate": 0.0002, "epoch": 3.717500970120295, "step": 4790}, {"loss": 1.4632, "grad_norm": 0.7501602172851562, "learning_rate": 0.0002, "epoch": 3.7252619324796274, "step": 4800}, {"loss": 1.4985, "grad_norm": 0.6980932950973511, "learning_rate": 0.0002, "epoch": 3.73302289483896, "step": 4810}, {"loss": 1.4517, "grad_norm": 0.8050530552864075, "learning_rate": 0.0002, "epoch": 3.7407838571982923, "step": 4820}, {"loss": 1.4703, "grad_norm": 0.6385579705238342, "learning_rate": 0.0002, "epoch": 3.748544819557625, "step": 4830}, {"loss": 1.5281, "grad_norm": 0.6664714813232422, "learning_rate": 0.0002, "epoch": 3.7563057819169576, "step": 4840}, {"loss": 1.4443, "grad_norm": 0.7125676274299622, "learning_rate": 0.0002, "epoch": 3.76406674427629, "step": 4850}, {"loss": 1.3958, "grad_norm": 0.7231866717338562, "learning_rate": 0.0002, "epoch": 3.771827706635623, "step": 4860}, {"loss": 1.4446, "grad_norm": 0.6917183995246887, "learning_rate": 0.0002, "epoch": 3.7795886689949554, "step": 4870}, {"loss": 1.4369, "grad_norm": 0.665037989616394, "learning_rate": 0.0002, "epoch": 3.787349631354288, "step": 4880}, {"loss": 1.4193, "grad_norm": 0.5837726593017578, "learning_rate": 0.0002, "epoch": 3.7951105937136207, "step": 4890}, {"loss": 1.4176, "grad_norm": 0.6366701722145081, "learning_rate": 0.0002, "epoch": 3.802871556072953, "step": 4900}, {"loss": 1.46, "grad_norm": 0.7082223892211914, "learning_rate": 0.0002, "epoch": 3.8106325184322856, "step": 4910}, {"loss": 1.5139, "grad_norm": 0.8101672530174255, "learning_rate": 0.0002, "epoch": 3.818393480791618, "step": 4920}, {"loss": 1.3659, "grad_norm": 0.7516148090362549, "learning_rate": 0.0002, "epoch": 3.826154443150951, "step": 4930}, {"loss": 1.3909, "grad_norm": 0.7928489446640015, "learning_rate": 0.0002, "epoch": 3.8339154055102833, "step": 4940}, {"loss": 1.4255, "grad_norm": 0.6892234683036804, "learning_rate": 0.0002, "epoch": 3.8416763678696157, "step": 4950}, {"loss": 1.5024, "grad_norm": 0.6381304264068604, "learning_rate": 0.0002, "epoch": 3.849437330228948, "step": 4960}, {"loss": 1.4873, "grad_norm": 0.8068831562995911, "learning_rate": 0.0002, "epoch": 3.857198292588281, "step": 4970}, {"loss": 1.45, "grad_norm": 0.7289869785308838, "learning_rate": 0.0002, "epoch": 3.8649592549476135, "step": 4980}, {"loss": 1.398, "grad_norm": 0.7278549075126648, "learning_rate": 0.0002, "epoch": 3.872720217306946, "step": 4990}, {"loss": 1.4442, "grad_norm": 0.7324236631393433, "learning_rate": 0.0002, "epoch": 3.880481179666279, "step": 5000}, {"loss": 1.4511, "grad_norm": 0.6759871244430542, "learning_rate": 0.0002, "epoch": 3.8882421420256112, "step": 5010}, {"loss": 1.4705, "grad_norm": 0.8159207701683044, "learning_rate": 0.0002, "epoch": 3.8960031043849437, "step": 5020}, {"loss": 1.4685, "grad_norm": 0.6536211967468262, "learning_rate": 0.0002, "epoch": 3.9037640667442766, "step": 5030}, {"loss": 1.4335, "grad_norm": 0.6827932000160217, "learning_rate": 0.0002, "epoch": 3.911525029103609, "step": 5040}, {"loss": 1.433, "grad_norm": 0.6688340306282043, "learning_rate": 0.0002, "epoch": 3.9192859914629414, "step": 5050}, {"loss": 1.4099, "grad_norm": 0.6385695934295654, "learning_rate": 0.0002, "epoch": 3.927046953822274, "step": 5060}, {"loss": 1.4767, "grad_norm": 0.6975107192993164, "learning_rate": 0.0002, "epoch": 3.9348079161816063, "step": 5070}, {"loss": 1.4893, "grad_norm": 0.6684112548828125, "learning_rate": 0.0002, "epoch": 3.942568878540939, "step": 5080}, {"loss": 1.4732, "grad_norm": 0.8349628448486328, "learning_rate": 0.0002, "epoch": 3.9503298409002716, "step": 5090}, {"loss": 1.5131, "grad_norm": 0.7146425843238831, "learning_rate": 0.0002, "epoch": 3.958090803259604, "step": 5100}, {"loss": 1.4149, "grad_norm": 0.6555036902427673, "learning_rate": 0.0002, "epoch": 3.965851765618937, "step": 5110}, {"loss": 1.4274, "grad_norm": 0.7037415504455566, "learning_rate": 0.0002, "epoch": 3.9736127279782694, "step": 5120}, {"loss": 1.4292, "grad_norm": 0.7235575914382935, "learning_rate": 0.0002, "epoch": 3.981373690337602, "step": 5130}, {"loss": 1.4455, "grad_norm": 0.7092325687408447, "learning_rate": 0.0002, "epoch": 3.9891346526969347, "step": 5140}, {"loss": 1.4512, "grad_norm": 0.7490319609642029, "learning_rate": 0.0002, "epoch": 3.996895615056267, "step": 5150}, {"eval_loss": 1.9131355285644531, "eval_runtime": 105.5778, "eval_samples_per_second": 4.802, "eval_steps_per_second": 0.606, "epoch": 4.0, "step": 5154}, {"loss": 1.2643, "grad_norm": 0.7075854539871216, "learning_rate": 0.0002, "epoch": 4.0046565774155995, "step": 5160}, {"loss": 1.209, "grad_norm": 0.9466007351875305, "learning_rate": 0.0002, "epoch": 4.012417539774932, "step": 5170}, {"loss": 1.2567, "grad_norm": 1.0297044515609741, "learning_rate": 0.0002, "epoch": 4.020178502134264, "step": 5180}, {"loss": 1.1796, "grad_norm": 0.7765059471130371, "learning_rate": 0.0002, "epoch": 4.027939464493597, "step": 5190}, {"loss": 1.2356, "grad_norm": 0.995760977268219, "learning_rate": 0.0002, "epoch": 4.03570042685293, "step": 5200}, {"loss": 1.1792, "grad_norm": 0.8663829565048218, "learning_rate": 0.0002, "epoch": 4.043461389212262, "step": 5210}, {"loss": 1.2471, "grad_norm": 1.0660825967788696, "learning_rate": 0.0002, "epoch": 4.051222351571595, "step": 5220}, {"loss": 1.1676, "grad_norm": 0.9858174920082092, "learning_rate": 0.0002, "epoch": 4.058983313930927, "step": 5230}, {"loss": 1.2448, "grad_norm": 0.8911338448524475, "learning_rate": 0.0002, "epoch": 4.06674427629026, "step": 5240}, {"loss": 1.1858, "grad_norm": 1.0848394632339478, "learning_rate": 0.0002, "epoch": 4.074505238649593, "step": 5250}, {"loss": 1.1684, "grad_norm": 1.0849905014038086, "learning_rate": 0.0002, "epoch": 4.082266201008925, "step": 5260}, {"loss": 1.2007, "grad_norm": 1.0497841835021973, "learning_rate": 0.0002, "epoch": 4.090027163368258, "step": 5270}, {"loss": 1.2552, "grad_norm": 0.8943053483963013, "learning_rate": 0.0002, "epoch": 4.0977881257275905, "step": 5280}, {"loss": 1.1923, "grad_norm": 0.8432527184486389, "learning_rate": 0.0002, "epoch": 4.1055490880869225, "step": 5290}, {"loss": 1.1634, "grad_norm": 0.9690414667129517, "learning_rate": 0.0002, "epoch": 4.113310050446255, "step": 5300}, {"loss": 1.3019, "grad_norm": 0.7790773510932922, "learning_rate": 0.0002, "epoch": 4.121071012805588, "step": 5310}, {"loss": 1.1806, "grad_norm": 0.9289211630821228, "learning_rate": 0.0002, "epoch": 4.12883197516492, "step": 5320}, {"loss": 1.1458, "grad_norm": 1.0785125494003296, "learning_rate": 0.0002, "epoch": 4.136592937524253, "step": 5330}, {"loss": 1.2086, "grad_norm": 0.8559591770172119, "learning_rate": 0.0002, "epoch": 4.144353899883585, "step": 5340}, {"loss": 1.1974, "grad_norm": 0.9405956268310547, "learning_rate": 0.0002, "epoch": 4.152114862242918, "step": 5350}, {"loss": 1.1793, "grad_norm": 0.9942827820777893, "learning_rate": 0.0002, "epoch": 4.159875824602251, "step": 5360}, {"loss": 1.1659, "grad_norm": 0.9141933917999268, "learning_rate": 0.0002, "epoch": 4.167636786961583, "step": 5370}, {"loss": 1.1647, "grad_norm": 0.8206015229225159, "learning_rate": 0.0002, "epoch": 4.175397749320916, "step": 5380}, {"loss": 1.2778, "grad_norm": 0.9340888857841492, "learning_rate": 0.0002, "epoch": 4.183158711680249, "step": 5390}, {"loss": 1.2459, "grad_norm": 1.2122114896774292, "learning_rate": 0.0002, "epoch": 4.190919674039581, "step": 5400}, {"loss": 1.2371, "grad_norm": 1.0661298036575317, "learning_rate": 0.0002, "epoch": 4.1986806363989135, "step": 5410}, {"loss": 1.1978, "grad_norm": 0.9372861385345459, "learning_rate": 0.0002, "epoch": 4.206441598758246, "step": 5420}, {"loss": 1.2653, "grad_norm": 0.894012987613678, "learning_rate": 0.0002, "epoch": 4.214202561117578, "step": 5430}, {"loss": 1.387, "grad_norm": 1.0647753477096558, "learning_rate": 0.0002, "epoch": 4.221963523476911, "step": 5440}, {"loss": 1.2231, "grad_norm": 0.989179790019989, "learning_rate": 0.0002, "epoch": 4.229724485836243, "step": 5450}, {"loss": 1.2715, "grad_norm": 1.1601181030273438, "learning_rate": 0.0002, "epoch": 4.237485448195576, "step": 5460}, {"loss": 1.2406, "grad_norm": 0.9395585656166077, "learning_rate": 0.0002, "epoch": 4.245246410554909, "step": 5470}, {"loss": 1.2779, "grad_norm": 0.9527766108512878, "learning_rate": 0.0002, "epoch": 4.253007372914241, "step": 5480}, {"loss": 1.267, "grad_norm": 1.0319520235061646, "learning_rate": 0.0002, "epoch": 4.260768335273574, "step": 5490}, {"loss": 1.2633, "grad_norm": 0.8659824728965759, "learning_rate": 0.0002, "epoch": 4.268529297632907, "step": 5500}, {"loss": 1.1475, "grad_norm": 1.099211573600769, "learning_rate": 0.0002, "epoch": 4.276290259992239, "step": 5510}, {"loss": 1.2508, "grad_norm": 0.9363361597061157, "learning_rate": 0.0002, "epoch": 4.284051222351572, "step": 5520}, {"loss": 1.189, "grad_norm": 0.8437647223472595, "learning_rate": 0.0002, "epoch": 4.2918121847109045, "step": 5530}, {"loss": 1.2212, "grad_norm": 0.9181258678436279, "learning_rate": 0.0002, "epoch": 4.2995731470702365, "step": 5540}, {"loss": 1.2092, "grad_norm": 0.9059357643127441, "learning_rate": 0.0002, "epoch": 4.307334109429569, "step": 5550}, {"loss": 1.2189, "grad_norm": 0.9337241649627686, "learning_rate": 0.0002, "epoch": 4.315095071788901, "step": 5560}, {"loss": 1.2462, "grad_norm": 0.9428889155387878, "learning_rate": 0.0002, "epoch": 4.322856034148234, "step": 5570}, {"loss": 1.2675, "grad_norm": 1.003589153289795, "learning_rate": 0.0002, "epoch": 4.330616996507567, "step": 5580}, {"loss": 1.2703, "grad_norm": 1.1249268054962158, "learning_rate": 0.0002, "epoch": 4.338377958866899, "step": 5590}, {"loss": 1.2501, "grad_norm": 0.8623469471931458, "learning_rate": 0.0002, "epoch": 4.346138921226232, "step": 5600}, {"loss": 1.2404, "grad_norm": 1.1389174461364746, "learning_rate": 0.0002, "epoch": 4.353899883585565, "step": 5610}, {"loss": 1.2245, "grad_norm": 1.0136264562606812, "learning_rate": 0.0002, "epoch": 4.361660845944897, "step": 5620}, {"loss": 1.3473, "grad_norm": 0.9567070603370667, "learning_rate": 0.0002, "epoch": 4.36942180830423, "step": 5630}, {"loss": 1.2988, "grad_norm": 1.0592148303985596, "learning_rate": 0.0002, "epoch": 4.377182770663563, "step": 5640}, {"loss": 1.212, "grad_norm": 1.0110485553741455, "learning_rate": 0.0002, "epoch": 4.384943733022895, "step": 5650}, {"loss": 1.2086, "grad_norm": 0.9914907217025757, "learning_rate": 0.0002, "epoch": 4.3927046953822275, "step": 5660}, {"loss": 1.2363, "grad_norm": 0.9447247982025146, "learning_rate": 0.0002, "epoch": 4.4004656577415595, "step": 5670}, {"loss": 1.2617, "grad_norm": 0.9644378423690796, "learning_rate": 0.0002, "epoch": 4.408226620100892, "step": 5680}, {"loss": 1.2773, "grad_norm": 0.920676589012146, "learning_rate": 0.0002, "epoch": 4.415987582460225, "step": 5690}, {"loss": 1.2792, "grad_norm": 1.060570478439331, "learning_rate": 0.0002, "epoch": 4.423748544819557, "step": 5700}, {"loss": 1.2374, "grad_norm": 0.8857738971710205, "learning_rate": 0.0002, "epoch": 4.43150950717889, "step": 5710}, {"loss": 1.2588, "grad_norm": 1.0536398887634277, "learning_rate": 0.0002, "epoch": 4.439270469538223, "step": 5720}, {"loss": 1.2051, "grad_norm": 0.990847110748291, "learning_rate": 0.0002, "epoch": 4.447031431897555, "step": 5730}, {"loss": 1.2469, "grad_norm": 0.9692499041557312, "learning_rate": 0.0002, "epoch": 4.454792394256888, "step": 5740}, {"loss": 1.2269, "grad_norm": 1.0376402139663696, "learning_rate": 0.0002, "epoch": 4.462553356616221, "step": 5750}, {"loss": 1.1701, "grad_norm": 1.3863259553909302, "learning_rate": 0.0002, "epoch": 4.470314318975553, "step": 5760}, {"loss": 1.2591, "grad_norm": 0.978379487991333, "learning_rate": 0.0002, "epoch": 4.478075281334886, "step": 5770}, {"loss": 1.2729, "grad_norm": 1.0973085165023804, "learning_rate": 0.0002, "epoch": 4.485836243694218, "step": 5780}, {"loss": 1.2404, "grad_norm": 1.057006597518921, "learning_rate": 0.0002, "epoch": 4.4935972060535505, "step": 5790}, {"loss": 1.2476, "grad_norm": 0.9247729182243347, "learning_rate": 0.0002, "epoch": 4.501358168412883, "step": 5800}, {"loss": 1.2369, "grad_norm": 1.0447787046432495, "learning_rate": 0.0002, "epoch": 4.509119130772215, "step": 5810}, {"loss": 1.211, "grad_norm": 1.1930429935455322, "learning_rate": 0.0002, "epoch": 4.516880093131548, "step": 5820}, {"loss": 1.2596, "grad_norm": 0.9867590069770813, "learning_rate": 0.0002, "epoch": 4.524641055490881, "step": 5830}, {"loss": 1.2766, "grad_norm": 0.9591100215911865, "learning_rate": 0.0002, "epoch": 4.532402017850213, "step": 5840}, {"loss": 1.2154, "grad_norm": 0.9950753450393677, "learning_rate": 0.0002, "epoch": 4.540162980209546, "step": 5850}, {"loss": 1.2149, "grad_norm": 1.0087506771087646, "learning_rate": 0.0002, "epoch": 4.547923942568879, "step": 5860}, {"loss": 1.3165, "grad_norm": 1.0934417247772217, "learning_rate": 0.0002, "epoch": 4.555684904928211, "step": 5870}, {"loss": 1.3059, "grad_norm": 1.107987403869629, "learning_rate": 0.0002, "epoch": 4.563445867287544, "step": 5880}, {"loss": 1.2184, "grad_norm": 0.9147276878356934, "learning_rate": 0.0002, "epoch": 4.571206829646876, "step": 5890}, {"loss": 1.24, "grad_norm": 1.036780595779419, "learning_rate": 0.0002, "epoch": 4.578967792006209, "step": 5900}, {"loss": 1.2209, "grad_norm": 0.9284719824790955, "learning_rate": 0.0002, "epoch": 4.5867287543655415, "step": 5910}, {"loss": 1.3693, "grad_norm": 0.9141898155212402, "learning_rate": 0.0002, "epoch": 4.5944897167248735, "step": 5920}, {"loss": 1.2319, "grad_norm": 1.0447357892990112, "learning_rate": 0.0002, "epoch": 4.602250679084206, "step": 5930}, {"loss": 1.2667, "grad_norm": 0.9309114217758179, "learning_rate": 0.0002, "epoch": 4.610011641443539, "step": 5940}, {"loss": 1.2827, "grad_norm": 1.2986129522323608, "learning_rate": 0.0002, "epoch": 4.617772603802871, "step": 5950}, {"loss": 1.312, "grad_norm": 0.9221704602241516, "learning_rate": 0.0002, "epoch": 4.625533566162204, "step": 5960}, {"loss": 1.2769, "grad_norm": 0.9228187799453735, "learning_rate": 0.0002, "epoch": 4.633294528521537, "step": 5970}, {"loss": 1.2953, "grad_norm": 0.9483116269111633, "learning_rate": 0.0002, "epoch": 4.641055490880869, "step": 5980}, {"loss": 1.3437, "grad_norm": 1.0218974351882935, "learning_rate": 0.0002, "epoch": 4.648816453240202, "step": 5990}, {"loss": 1.3085, "grad_norm": 0.9764600396156311, "learning_rate": 0.0002, "epoch": 4.656577415599534, "step": 6000}, {"loss": 1.197, "grad_norm": 0.9115710258483887, "learning_rate": 0.0002, "epoch": 4.664338377958867, "step": 6010}, {"loss": 1.1917, "grad_norm": 0.9245651364326477, "learning_rate": 0.0002, "epoch": 4.6720993403182, "step": 6020}, {"loss": 1.2969, "grad_norm": 0.9686311483383179, "learning_rate": 0.0002, "epoch": 4.6798603026775325, "step": 6030}, {"loss": 1.2702, "grad_norm": 1.1807392835617065, "learning_rate": 0.0002, "epoch": 4.6876212650368645, "step": 6040}, {"loss": 1.328, "grad_norm": 1.0358641147613525, "learning_rate": 0.0002, "epoch": 4.695382227396197, "step": 6050}, {"loss": 1.3281, "grad_norm": 0.987332284450531, "learning_rate": 0.0002, "epoch": 4.703143189755529, "step": 6060}, {"loss": 1.2514, "grad_norm": 1.0526494979858398, "learning_rate": 0.0002, "epoch": 4.710904152114862, "step": 6070}, {"loss": 1.2246, "grad_norm": 1.0276758670806885, "learning_rate": 0.0002, "epoch": 4.718665114474195, "step": 6080}, {"loss": 1.3367, "grad_norm": 0.9904406666755676, "learning_rate": 0.0002, "epoch": 4.726426076833527, "step": 6090}, {"loss": 1.2797, "grad_norm": 1.0084882974624634, "learning_rate": 0.0002, "epoch": 4.73418703919286, "step": 6100}, {"loss": 1.2656, "grad_norm": 0.8646450638771057, "learning_rate": 0.0002, "epoch": 4.741948001552192, "step": 6110}, {"loss": 1.3063, "grad_norm": 0.9233377575874329, "learning_rate": 0.0002, "epoch": 4.749708963911525, "step": 6120}, {"loss": 1.2642, "grad_norm": 0.9675140976905823, "learning_rate": 0.0002, "epoch": 4.757469926270858, "step": 6130}, {"loss": 1.3367, "grad_norm": 0.9639796018600464, "learning_rate": 0.0002, "epoch": 4.765230888630191, "step": 6140}, {"loss": 1.276, "grad_norm": 0.925199568271637, "learning_rate": 0.0002, "epoch": 4.772991850989523, "step": 6150}, {"loss": 1.2441, "grad_norm": 1.050901174545288, "learning_rate": 0.0002, "epoch": 4.7807528133488555, "step": 6160}, {"loss": 1.301, "grad_norm": 0.8920623660087585, "learning_rate": 0.0002, "epoch": 4.7885137757081875, "step": 6170}, {"loss": 1.263, "grad_norm": 0.8964757919311523, "learning_rate": 0.0002, "epoch": 4.79627473806752, "step": 6180}, {"loss": 1.2787, "grad_norm": 1.0839070081710815, "learning_rate": 0.0002, "epoch": 4.804035700426853, "step": 6190}, {"loss": 1.2664, "grad_norm": 0.8809942007064819, "learning_rate": 0.0002, "epoch": 4.811796662786185, "step": 6200}, {"loss": 1.321, "grad_norm": 1.0216195583343506, "learning_rate": 0.0002, "epoch": 4.819557625145518, "step": 6210}, {"loss": 1.3033, "grad_norm": 0.892005980014801, "learning_rate": 0.0002, "epoch": 4.827318587504851, "step": 6220}, {"loss": 1.2602, "grad_norm": 0.9957166910171509, "learning_rate": 0.0002, "epoch": 4.835079549864183, "step": 6230}, {"loss": 1.3562, "grad_norm": 0.9720533490180969, "learning_rate": 0.0002, "epoch": 4.842840512223516, "step": 6240}, {"loss": 1.2651, "grad_norm": 0.9336182475090027, "learning_rate": 0.0002, "epoch": 4.850601474582849, "step": 6250}, {"loss": 1.3136, "grad_norm": 1.2611457109451294, "learning_rate": 0.0002, "epoch": 4.858362436942181, "step": 6260}, {"loss": 1.2234, "grad_norm": 0.8927203416824341, "learning_rate": 0.0002, "epoch": 4.866123399301514, "step": 6270}, {"loss": 1.3463, "grad_norm": 0.9706710577011108, "learning_rate": 0.0002, "epoch": 4.873884361660846, "step": 6280}, {"loss": 1.3209, "grad_norm": 1.1461690664291382, "learning_rate": 0.0002, "epoch": 4.8816453240201785, "step": 6290}, {"loss": 1.2566, "grad_norm": 0.9930381178855896, "learning_rate": 0.0002, "epoch": 4.889406286379511, "step": 6300}, {"loss": 1.2568, "grad_norm": 0.91451096534729, "learning_rate": 0.0002, "epoch": 4.897167248738843, "step": 6310}, {"loss": 1.2836, "grad_norm": 1.0319571495056152, "learning_rate": 0.0002, "epoch": 4.904928211098176, "step": 6320}, {"loss": 1.2908, "grad_norm": 0.990140438079834, "learning_rate": 0.0002, "epoch": 4.912689173457509, "step": 6330}, {"loss": 1.3299, "grad_norm": 1.2466117143630981, "learning_rate": 0.0002, "epoch": 4.920450135816841, "step": 6340}, {"loss": 1.2659, "grad_norm": 1.0316979885101318, "learning_rate": 0.0002, "epoch": 4.928211098176174, "step": 6350}, {"loss": 1.3292, "grad_norm": 1.0643759965896606, "learning_rate": 0.0002, "epoch": 4.935972060535507, "step": 6360}, {"loss": 1.2559, "grad_norm": 0.9703279733657837, "learning_rate": 0.0002, "epoch": 4.943733022894839, "step": 6370}, {"loss": 1.2155, "grad_norm": 0.9767927527427673, "learning_rate": 0.0002, "epoch": 4.951493985254172, "step": 6380}, {"loss": 1.2437, "grad_norm": 0.960854172706604, "learning_rate": 0.0002, "epoch": 4.959254947613504, "step": 6390}, {"loss": 1.3314, "grad_norm": 0.9922910332679749, "learning_rate": 0.0002, "epoch": 4.967015909972837, "step": 6400}, {"loss": 1.3018, "grad_norm": 0.956470787525177, "learning_rate": 0.0002, "epoch": 4.9747768723321695, "step": 6410}, {"loss": 1.2794, "grad_norm": 0.9637242555618286, "learning_rate": 0.0002, "epoch": 4.9825378346915015, "step": 6420}, {"loss": 1.3236, "grad_norm": 1.0855202674865723, "learning_rate": 0.0002, "epoch": 4.990298797050834, "step": 6430}, {"loss": 1.3015, "grad_norm": 0.9655316472053528, "learning_rate": 0.0002, "epoch": 4.998059759410167, "step": 6440}]} +{"epoch": 6.0, "step": 7731, "epoch_duration": 3940.1947734355927, "total_accumulated_duration": 21971.06501841545, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28774.0}, "peak_memory_reserved": {"GPU_0": 28774.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0855, "grad_norm": 1.0751162767410278, "learning_rate": 0.0002, "epoch": 0.007760962359332557, "step": 10}, {"loss": 2.4744, "grad_norm": 0.4697345793247223, "learning_rate": 0.0002, "epoch": 0.015521924718665115, "step": 20}, {"loss": 2.193, "grad_norm": 0.5370839238166809, "learning_rate": 0.0002, "epoch": 0.023282887077997673, "step": 30}, {"loss": 2.0599, "grad_norm": 0.46794816851615906, "learning_rate": 0.0002, "epoch": 0.03104384943733023, "step": 40}, {"loss": 1.9354, "grad_norm": 0.44624820351600647, "learning_rate": 0.0002, "epoch": 0.038804811796662786, "step": 50}, {"loss": 1.9319, "grad_norm": 0.3953201472759247, "learning_rate": 0.0002, "epoch": 0.046565774155995346, "step": 60}, {"loss": 1.9099, "grad_norm": 0.3935912549495697, "learning_rate": 0.0002, "epoch": 0.0543267365153279, "step": 70}, {"loss": 1.8795, "grad_norm": 0.4520699381828308, "learning_rate": 0.0002, "epoch": 0.06208769887466046, "step": 80}, {"loss": 1.8354, "grad_norm": 0.3801847994327545, "learning_rate": 0.0002, "epoch": 0.06984866123399301, "step": 90}, {"loss": 1.9053, "grad_norm": 0.4020165205001831, "learning_rate": 0.0002, "epoch": 0.07760962359332557, "step": 100}, {"loss": 1.8779, "grad_norm": 0.3860672116279602, "learning_rate": 0.0002, "epoch": 0.08537058595265813, "step": 110}, {"loss": 1.8731, "grad_norm": 0.3681113123893738, "learning_rate": 0.0002, "epoch": 0.09313154831199069, "step": 120}, {"loss": 1.8157, "grad_norm": 0.3594866991043091, "learning_rate": 0.0002, "epoch": 0.10089251067132324, "step": 130}, {"loss": 1.8266, "grad_norm": 0.3879193663597107, "learning_rate": 0.0002, "epoch": 0.1086534730306558, "step": 140}, {"loss": 1.8818, "grad_norm": 0.3270505666732788, "learning_rate": 0.0002, "epoch": 0.11641443538998836, "step": 150}, {"loss": 1.87, "grad_norm": 0.36824458837509155, "learning_rate": 0.0002, "epoch": 0.12417539774932092, "step": 160}, {"loss": 1.8305, "grad_norm": 0.383882075548172, "learning_rate": 0.0002, "epoch": 0.13193636010865348, "step": 170}, {"loss": 1.8584, "grad_norm": 0.3368665874004364, "learning_rate": 0.0002, "epoch": 0.13969732246798602, "step": 180}, {"loss": 1.7882, "grad_norm": 0.35961097478866577, "learning_rate": 0.0002, "epoch": 0.1474582848273186, "step": 190}, {"loss": 1.8467, "grad_norm": 0.3415963351726532, "learning_rate": 0.0002, "epoch": 0.15521924718665114, "step": 200}, {"loss": 1.8543, "grad_norm": 0.4100632071495056, "learning_rate": 0.0002, "epoch": 0.1629802095459837, "step": 210}, {"loss": 1.8226, "grad_norm": 0.3516307473182678, "learning_rate": 0.0002, "epoch": 0.17074117190531626, "step": 220}, {"loss": 1.7386, "grad_norm": 0.37919050455093384, "learning_rate": 0.0002, "epoch": 0.1785021342646488, "step": 230}, {"loss": 1.7937, "grad_norm": 0.33270683884620667, "learning_rate": 0.0002, "epoch": 0.18626309662398138, "step": 240}, {"loss": 1.7925, "grad_norm": 0.3348783254623413, "learning_rate": 0.0002, "epoch": 0.19402405898331393, "step": 250}, {"loss": 1.7774, "grad_norm": 0.3888475298881531, "learning_rate": 0.0002, "epoch": 0.20178502134264648, "step": 260}, {"loss": 1.8381, "grad_norm": 0.3554602861404419, "learning_rate": 0.0002, "epoch": 0.20954598370197905, "step": 270}, {"loss": 1.8359, "grad_norm": 0.33277708292007446, "learning_rate": 0.0002, "epoch": 0.2173069460613116, "step": 280}, {"loss": 1.7713, "grad_norm": 0.3281584680080414, "learning_rate": 0.0002, "epoch": 0.22506790842064417, "step": 290}, {"loss": 1.8181, "grad_norm": 0.3185969591140747, "learning_rate": 0.0002, "epoch": 0.23282887077997672, "step": 300}, {"loss": 1.8595, "grad_norm": 0.35335442423820496, "learning_rate": 0.0002, "epoch": 0.24058983313930926, "step": 310}, {"loss": 1.87, "grad_norm": 0.3119595944881439, "learning_rate": 0.0002, "epoch": 0.24835079549864184, "step": 320}, {"loss": 1.8357, "grad_norm": 0.36424458026885986, "learning_rate": 0.0002, "epoch": 0.2561117578579744, "step": 330}, {"loss": 1.8003, "grad_norm": 0.3618951141834259, "learning_rate": 0.0002, "epoch": 0.26387272021730696, "step": 340}, {"loss": 1.8221, "grad_norm": 0.312757670879364, "learning_rate": 0.0002, "epoch": 0.2716336825766395, "step": 350}, {"loss": 1.9031, "grad_norm": 0.326016366481781, "learning_rate": 0.0002, "epoch": 0.27939464493597205, "step": 360}, {"loss": 1.8214, "grad_norm": 0.34093883633613586, "learning_rate": 0.0002, "epoch": 0.2871556072953046, "step": 370}, {"loss": 1.7733, "grad_norm": 0.32325029373168945, "learning_rate": 0.0002, "epoch": 0.2949165696546372, "step": 380}, {"loss": 1.842, "grad_norm": 0.34105437994003296, "learning_rate": 0.0002, "epoch": 0.30267753201396974, "step": 390}, {"loss": 1.7926, "grad_norm": 0.32565295696258545, "learning_rate": 0.0002, "epoch": 0.3104384943733023, "step": 400}, {"loss": 1.8031, "grad_norm": 0.32742050290107727, "learning_rate": 0.0002, "epoch": 0.31819945673263483, "step": 410}, {"loss": 1.907, "grad_norm": 0.30233046412467957, "learning_rate": 0.0002, "epoch": 0.3259604190919674, "step": 420}, {"loss": 1.7623, "grad_norm": 0.32419222593307495, "learning_rate": 0.0002, "epoch": 0.3337213814513, "step": 430}, {"loss": 1.865, "grad_norm": 0.3653007745742798, "learning_rate": 0.0002, "epoch": 0.3414823438106325, "step": 440}, {"loss": 1.8044, "grad_norm": 0.31617099046707153, "learning_rate": 0.0002, "epoch": 0.3492433061699651, "step": 450}, {"loss": 1.7677, "grad_norm": 0.3305962085723877, "learning_rate": 0.0002, "epoch": 0.3570042685292976, "step": 460}, {"loss": 1.8155, "grad_norm": 0.3178933262825012, "learning_rate": 0.0002, "epoch": 0.36476523088863017, "step": 470}, {"loss": 1.7485, "grad_norm": 0.37163782119750977, "learning_rate": 0.0002, "epoch": 0.37252619324796277, "step": 480}, {"loss": 1.8804, "grad_norm": 0.469844788312912, "learning_rate": 0.0002, "epoch": 0.3802871556072953, "step": 490}, {"loss": 1.8343, "grad_norm": 0.3409338593482971, "learning_rate": 0.0002, "epoch": 0.38804811796662786, "step": 500}, {"loss": 1.8433, "grad_norm": 0.31943467259407043, "learning_rate": 0.0002, "epoch": 0.3958090803259604, "step": 510}, {"loss": 1.7873, "grad_norm": 0.32293614745140076, "learning_rate": 0.0002, "epoch": 0.40357004268529295, "step": 520}, {"loss": 1.8584, "grad_norm": 0.2994382977485657, "learning_rate": 0.0002, "epoch": 0.41133100504462555, "step": 530}, {"loss": 1.8153, "grad_norm": 0.3273141384124756, "learning_rate": 0.0002, "epoch": 0.4190919674039581, "step": 540}, {"loss": 1.8097, "grad_norm": 0.3020550012588501, "learning_rate": 0.0002, "epoch": 0.42685292976329064, "step": 550}, {"loss": 1.8551, "grad_norm": 0.30113112926483154, "learning_rate": 0.0002, "epoch": 0.4346138921226232, "step": 560}, {"loss": 1.8084, "grad_norm": 0.30274903774261475, "learning_rate": 0.0002, "epoch": 0.44237485448195574, "step": 570}, {"loss": 1.7673, "grad_norm": 0.3231128454208374, "learning_rate": 0.0002, "epoch": 0.45013581684128834, "step": 580}, {"loss": 1.7848, "grad_norm": 0.3255121409893036, "learning_rate": 0.0002, "epoch": 0.4578967792006209, "step": 590}, {"loss": 1.8227, "grad_norm": 0.30147507786750793, "learning_rate": 0.0002, "epoch": 0.46565774155995343, "step": 600}, {"loss": 1.7572, "grad_norm": 0.29781386256217957, "learning_rate": 0.0002, "epoch": 0.473418703919286, "step": 610}, {"loss": 1.8307, "grad_norm": 0.30914685130119324, "learning_rate": 0.0002, "epoch": 0.4811796662786185, "step": 620}, {"loss": 1.805, "grad_norm": 0.3110593855381012, "learning_rate": 0.0002, "epoch": 0.4889406286379511, "step": 630}, {"loss": 1.8228, "grad_norm": 0.3298132121562958, "learning_rate": 0.0002, "epoch": 0.49670159099728367, "step": 640}, {"loss": 1.7816, "grad_norm": 0.322122186422348, "learning_rate": 0.0002, "epoch": 0.5044625533566163, "step": 650}, {"loss": 1.8001, "grad_norm": 0.3504371643066406, "learning_rate": 0.0002, "epoch": 0.5122235157159488, "step": 660}, {"loss": 1.8682, "grad_norm": 0.3102182149887085, "learning_rate": 0.0002, "epoch": 0.5199844780752814, "step": 670}, {"loss": 1.7494, "grad_norm": 0.6113658547401428, "learning_rate": 0.0002, "epoch": 0.5277454404346139, "step": 680}, {"loss": 1.7096, "grad_norm": 0.31841862201690674, "learning_rate": 0.0002, "epoch": 0.5355064027939465, "step": 690}, {"loss": 1.7587, "grad_norm": 0.2830526530742645, "learning_rate": 0.0002, "epoch": 0.543267365153279, "step": 700}, {"loss": 1.7887, "grad_norm": 0.3048769533634186, "learning_rate": 0.0002, "epoch": 0.5510283275126115, "step": 710}, {"loss": 1.8416, "grad_norm": 0.2719033658504486, "learning_rate": 0.0002, "epoch": 0.5587892898719441, "step": 720}, {"loss": 1.786, "grad_norm": 0.3176722526550293, "learning_rate": 0.0002, "epoch": 0.5665502522312766, "step": 730}, {"loss": 1.7127, "grad_norm": 0.32491734623908997, "learning_rate": 0.0002, "epoch": 0.5743112145906092, "step": 740}, {"loss": 1.7892, "grad_norm": 0.32746851444244385, "learning_rate": 0.0002, "epoch": 0.5820721769499418, "step": 750}, {"loss": 1.7811, "grad_norm": 0.3055773973464966, "learning_rate": 0.0002, "epoch": 0.5898331393092744, "step": 760}, {"loss": 1.8597, "grad_norm": 0.30671584606170654, "learning_rate": 0.0002, "epoch": 0.5975941016686069, "step": 770}, {"loss": 1.7728, "grad_norm": 0.28770264983177185, "learning_rate": 0.0002, "epoch": 0.6053550640279395, "step": 780}, {"loss": 1.7025, "grad_norm": 0.2814285457134247, "learning_rate": 0.0002, "epoch": 0.613116026387272, "step": 790}, {"loss": 1.819, "grad_norm": 0.31554412841796875, "learning_rate": 0.0002, "epoch": 0.6208769887466046, "step": 800}, {"loss": 1.8335, "grad_norm": 0.2984226942062378, "learning_rate": 0.0002, "epoch": 0.6286379511059371, "step": 810}, {"loss": 1.7728, "grad_norm": 0.2859906554222107, "learning_rate": 0.0002, "epoch": 0.6363989134652697, "step": 820}, {"loss": 1.7408, "grad_norm": 0.2887928783893585, "learning_rate": 0.0002, "epoch": 0.6441598758246022, "step": 830}, {"loss": 1.8071, "grad_norm": 0.31287339329719543, "learning_rate": 0.0002, "epoch": 0.6519208381839348, "step": 840}, {"loss": 1.7673, "grad_norm": 0.32064181566238403, "learning_rate": 0.0002, "epoch": 0.6596818005432674, "step": 850}, {"loss": 1.7849, "grad_norm": 0.290981650352478, "learning_rate": 0.0002, "epoch": 0.6674427629026, "step": 860}, {"loss": 1.8089, "grad_norm": 0.33060121536254883, "learning_rate": 0.0002, "epoch": 0.6752037252619325, "step": 870}, {"loss": 1.7357, "grad_norm": 0.27032899856567383, "learning_rate": 0.0002, "epoch": 0.682964687621265, "step": 880}, {"loss": 1.8423, "grad_norm": 0.29031234979629517, "learning_rate": 0.0002, "epoch": 0.6907256499805976, "step": 890}, {"loss": 1.835, "grad_norm": 0.2845142185688019, "learning_rate": 0.0002, "epoch": 0.6984866123399301, "step": 900}, {"loss": 1.77, "grad_norm": 0.8638312816619873, "learning_rate": 0.0002, "epoch": 0.7062475746992627, "step": 910}, {"loss": 1.7757, "grad_norm": 0.3086668848991394, "learning_rate": 0.0002, "epoch": 0.7140085370585952, "step": 920}, {"loss": 1.7541, "grad_norm": 0.2724177837371826, "learning_rate": 0.0002, "epoch": 0.7217694994179278, "step": 930}, {"loss": 1.816, "grad_norm": 0.289559006690979, "learning_rate": 0.0002, "epoch": 0.7295304617772603, "step": 940}, {"loss": 1.7654, "grad_norm": 0.3000658452510834, "learning_rate": 0.0002, "epoch": 0.737291424136593, "step": 950}, {"loss": 1.7736, "grad_norm": 0.33544042706489563, "learning_rate": 0.0002, "epoch": 0.7450523864959255, "step": 960}, {"loss": 1.6979, "grad_norm": 0.28593236207962036, "learning_rate": 0.0002, "epoch": 0.7528133488552581, "step": 970}, {"loss": 1.8583, "grad_norm": 0.313634991645813, "learning_rate": 0.0002, "epoch": 0.7605743112145906, "step": 980}, {"loss": 1.7473, "grad_norm": 0.2949385941028595, "learning_rate": 0.0002, "epoch": 0.7683352735739232, "step": 990}, {"loss": 1.8689, "grad_norm": 0.2920108437538147, "learning_rate": 0.0002, "epoch": 0.7760962359332557, "step": 1000}, {"loss": 1.8401, "grad_norm": 0.3245100677013397, "learning_rate": 0.0002, "epoch": 0.7838571982925883, "step": 1010}, {"loss": 1.7109, "grad_norm": 0.3007619380950928, "learning_rate": 0.0002, "epoch": 0.7916181606519208, "step": 1020}, {"loss": 1.7427, "grad_norm": 0.3630852997303009, "learning_rate": 0.0002, "epoch": 0.7993791230112534, "step": 1030}, {"loss": 1.7655, "grad_norm": 0.2856379747390747, "learning_rate": 0.0002, "epoch": 0.8071400853705859, "step": 1040}, {"loss": 1.8371, "grad_norm": 0.32476478815078735, "learning_rate": 0.0002, "epoch": 0.8149010477299186, "step": 1050}, {"loss": 1.8039, "grad_norm": 0.5162565112113953, "learning_rate": 0.0002, "epoch": 0.8226620100892511, "step": 1060}, {"loss": 1.8862, "grad_norm": 0.316496342420578, "learning_rate": 0.0002, "epoch": 0.8304229724485837, "step": 1070}, {"loss": 1.8023, "grad_norm": 0.31977516412734985, "learning_rate": 0.0002, "epoch": 0.8381839348079162, "step": 1080}, {"loss": 1.8547, "grad_norm": 0.269509494304657, "learning_rate": 0.0002, "epoch": 0.8459448971672487, "step": 1090}, {"loss": 1.7811, "grad_norm": 0.31621453166007996, "learning_rate": 0.0002, "epoch": 0.8537058595265813, "step": 1100}, {"loss": 1.739, "grad_norm": 0.2946535050868988, "learning_rate": 0.0002, "epoch": 0.8614668218859138, "step": 1110}, {"loss": 1.7511, "grad_norm": 0.3088909983634949, "learning_rate": 0.0002, "epoch": 0.8692277842452464, "step": 1120}, {"loss": 1.8228, "grad_norm": 0.33033716678619385, "learning_rate": 0.0002, "epoch": 0.8769887466045789, "step": 1130}, {"loss": 1.7912, "grad_norm": 0.2954833507537842, "learning_rate": 0.0002, "epoch": 0.8847497089639115, "step": 1140}, {"loss": 1.8394, "grad_norm": 0.2950248122215271, "learning_rate": 0.0002, "epoch": 0.8925106713232441, "step": 1150}, {"loss": 1.7068, "grad_norm": 0.296661913394928, "learning_rate": 0.0002, "epoch": 0.9002716336825767, "step": 1160}, {"loss": 1.7967, "grad_norm": 0.35451310873031616, "learning_rate": 0.0002, "epoch": 0.9080325960419092, "step": 1170}, {"loss": 1.8202, "grad_norm": 0.32705947756767273, "learning_rate": 0.0002, "epoch": 0.9157935584012418, "step": 1180}, {"loss": 1.7396, "grad_norm": 0.3333960771560669, "learning_rate": 0.0002, "epoch": 0.9235545207605743, "step": 1190}, {"loss": 1.7801, "grad_norm": 0.3042232096195221, "learning_rate": 0.0002, "epoch": 0.9313154831199069, "step": 1200}, {"loss": 1.7586, "grad_norm": 0.281553715467453, "learning_rate": 0.0002, "epoch": 0.9390764454792394, "step": 1210}, {"loss": 1.7953, "grad_norm": 0.3096391558647156, "learning_rate": 0.0002, "epoch": 0.946837407838572, "step": 1220}, {"loss": 1.7401, "grad_norm": 0.2866271734237671, "learning_rate": 0.0002, "epoch": 0.9545983701979045, "step": 1230}, {"loss": 1.7211, "grad_norm": 0.28394097089767456, "learning_rate": 0.0002, "epoch": 0.962359332557237, "step": 1240}, {"loss": 1.7363, "grad_norm": 0.3249266743659973, "learning_rate": 0.0002, "epoch": 0.9701202949165697, "step": 1250}, {"loss": 1.7563, "grad_norm": 0.2896869480609894, "learning_rate": 0.0002, "epoch": 0.9778812572759022, "step": 1260}, {"loss": 1.6389, "grad_norm": 0.29224586486816406, "learning_rate": 0.0002, "epoch": 0.9856422196352348, "step": 1270}, {"loss": 1.7111, "grad_norm": 0.2820223569869995, "learning_rate": 0.0002, "epoch": 0.9934031819945673, "step": 1280}, {"eval_loss": 1.8081045150756836, "eval_runtime": 102.3056, "eval_samples_per_second": 4.956, "eval_steps_per_second": 0.626, "epoch": 0.9996119518820333, "step": 1288}, {"loss": 1.7518, "grad_norm": 0.3282551169395447, "learning_rate": 0.0002, "epoch": 1.0011641443538999, "step": 1290}, {"loss": 1.6806, "grad_norm": 0.30217495560646057, "learning_rate": 0.0002, "epoch": 1.0089251067132325, "step": 1300}, {"loss": 1.6777, "grad_norm": 0.30801767110824585, "learning_rate": 0.0002, "epoch": 1.016686069072565, "step": 1310}, {"loss": 1.7756, "grad_norm": 0.31816792488098145, "learning_rate": 0.0002, "epoch": 1.0244470314318976, "step": 1320}, {"loss": 1.6986, "grad_norm": 0.27794334292411804, "learning_rate": 0.0002, "epoch": 1.03220799379123, "step": 1330}, {"loss": 1.6931, "grad_norm": 0.3018926680088043, "learning_rate": 0.0002, "epoch": 1.0399689561505627, "step": 1340}, {"loss": 1.7033, "grad_norm": 0.3552975356578827, "learning_rate": 0.0002, "epoch": 1.0477299185098952, "step": 1350}, {"loss": 1.6782, "grad_norm": 0.32590144872665405, "learning_rate": 0.0002, "epoch": 1.0554908808692278, "step": 1360}, {"loss": 1.6479, "grad_norm": 0.3435460925102234, "learning_rate": 0.0002, "epoch": 1.0632518432285603, "step": 1370}, {"loss": 1.7451, "grad_norm": 0.35037797689437866, "learning_rate": 0.0002, "epoch": 1.071012805587893, "step": 1380}, {"loss": 1.7868, "grad_norm": 0.31398263573646545, "learning_rate": 0.0002, "epoch": 1.0787737679472253, "step": 1390}, {"loss": 1.6729, "grad_norm": 0.3134010434150696, "learning_rate": 0.0002, "epoch": 1.086534730306558, "step": 1400}, {"loss": 1.751, "grad_norm": 0.4599704444408417, "learning_rate": 0.0002, "epoch": 1.0942956926658907, "step": 1410}, {"loss": 1.6871, "grad_norm": 0.35852891206741333, "learning_rate": 0.0002, "epoch": 1.102056655025223, "step": 1420}, {"loss": 1.7083, "grad_norm": 0.35628634691238403, "learning_rate": 0.0002, "epoch": 1.1098176173845558, "step": 1430}, {"loss": 1.6166, "grad_norm": 0.3769161105155945, "learning_rate": 0.0002, "epoch": 1.1175785797438882, "step": 1440}, {"loss": 1.7344, "grad_norm": 1.3712416887283325, "learning_rate": 0.0002, "epoch": 1.1253395421032208, "step": 1450}, {"loss": 1.6542, "grad_norm": 0.38406670093536377, "learning_rate": 0.0002, "epoch": 1.1331005044625533, "step": 1460}, {"loss": 1.7104, "grad_norm": 0.3402116000652313, "learning_rate": 0.0002, "epoch": 1.140861466821886, "step": 1470}, {"loss": 1.7074, "grad_norm": 0.341189444065094, "learning_rate": 0.0002, "epoch": 1.1486224291812184, "step": 1480}, {"loss": 1.6468, "grad_norm": 0.36629995703697205, "learning_rate": 0.0002, "epoch": 1.156383391540551, "step": 1490}, {"loss": 1.6952, "grad_norm": 0.3499569296836853, "learning_rate": 0.0002, "epoch": 1.1641443538998835, "step": 1500}, {"loss": 1.6625, "grad_norm": 0.3663063943386078, "learning_rate": 0.0002, "epoch": 1.1719053162592161, "step": 1510}, {"loss": 1.7533, "grad_norm": 0.34851500391960144, "learning_rate": 0.0002, "epoch": 1.1796662786185488, "step": 1520}, {"loss": 1.6092, "grad_norm": 0.35071656107902527, "learning_rate": 0.0002, "epoch": 1.1874272409778812, "step": 1530}, {"loss": 1.7206, "grad_norm": 0.42783796787261963, "learning_rate": 0.0002, "epoch": 1.1951882033372139, "step": 1540}, {"loss": 1.7499, "grad_norm": 0.31830692291259766, "learning_rate": 0.0002, "epoch": 1.2029491656965463, "step": 1550}, {"loss": 1.7372, "grad_norm": 0.3597424626350403, "learning_rate": 0.0002, "epoch": 1.210710128055879, "step": 1560}, {"loss": 1.6386, "grad_norm": 0.35233765840530396, "learning_rate": 0.0002, "epoch": 1.2184710904152114, "step": 1570}, {"loss": 1.6766, "grad_norm": 0.35942912101745605, "learning_rate": 0.0002, "epoch": 1.226232052774544, "step": 1580}, {"loss": 1.6598, "grad_norm": 0.36159393191337585, "learning_rate": 0.0002, "epoch": 1.2339930151338767, "step": 1590}, {"loss": 1.6697, "grad_norm": 0.3328469693660736, "learning_rate": 0.0002, "epoch": 1.2417539774932091, "step": 1600}, {"loss": 1.7594, "grad_norm": 0.3089476525783539, "learning_rate": 0.0002, "epoch": 1.2495149398525418, "step": 1610}, {"loss": 1.6805, "grad_norm": 0.30947765707969666, "learning_rate": 0.0002, "epoch": 1.2572759022118742, "step": 1620}, {"loss": 1.6899, "grad_norm": 0.32154011726379395, "learning_rate": 0.0002, "epoch": 1.265036864571207, "step": 1630}, {"loss": 1.6621, "grad_norm": 0.3480297923088074, "learning_rate": 0.0002, "epoch": 1.2727978269305393, "step": 1640}, {"loss": 1.7087, "grad_norm": 0.39471694827079773, "learning_rate": 0.0002, "epoch": 1.280558789289872, "step": 1650}, {"loss": 1.7608, "grad_norm": 0.35728853940963745, "learning_rate": 0.0002, "epoch": 1.2883197516492044, "step": 1660}, {"loss": 1.7008, "grad_norm": 0.35223081707954407, "learning_rate": 0.0002, "epoch": 1.296080714008537, "step": 1670}, {"loss": 1.7253, "grad_norm": 0.3588867485523224, "learning_rate": 0.0002, "epoch": 1.3038416763678695, "step": 1680}, {"loss": 1.6505, "grad_norm": 0.3528042733669281, "learning_rate": 0.0002, "epoch": 1.3116026387272022, "step": 1690}, {"loss": 1.6945, "grad_norm": 0.35975801944732666, "learning_rate": 0.0002, "epoch": 1.3193636010865348, "step": 1700}, {"loss": 1.6631, "grad_norm": 0.36691880226135254, "learning_rate": 0.0002, "epoch": 1.3271245634458673, "step": 1710}, {"loss": 1.7593, "grad_norm": 0.3787977695465088, "learning_rate": 0.0002, "epoch": 1.3348855258052, "step": 1720}, {"loss": 1.7697, "grad_norm": 0.36614933609962463, "learning_rate": 0.0002, "epoch": 1.3426464881645324, "step": 1730}, {"loss": 1.6487, "grad_norm": 0.3484745919704437, "learning_rate": 0.0002, "epoch": 1.350407450523865, "step": 1740}, {"loss": 1.7054, "grad_norm": 0.36905673146247864, "learning_rate": 0.0002, "epoch": 1.3581684128831975, "step": 1750}, {"loss": 1.7679, "grad_norm": 0.41564738750457764, "learning_rate": 0.0002, "epoch": 1.36592937524253, "step": 1760}, {"loss": 1.6634, "grad_norm": 0.3345205783843994, "learning_rate": 0.0002, "epoch": 1.3736903376018628, "step": 1770}, {"loss": 1.7275, "grad_norm": 0.34926071763038635, "learning_rate": 0.0002, "epoch": 1.3814512999611952, "step": 1780}, {"loss": 1.685, "grad_norm": 0.42004233598709106, "learning_rate": 0.0002, "epoch": 1.3892122623205276, "step": 1790}, {"loss": 1.666, "grad_norm": 0.3576236963272095, "learning_rate": 0.0002, "epoch": 1.3969732246798603, "step": 1800}, {"loss": 1.8516, "grad_norm": 0.3586704432964325, "learning_rate": 0.0002, "epoch": 1.404734187039193, "step": 1810}, {"loss": 1.6171, "grad_norm": 0.3943439722061157, "learning_rate": 0.0002, "epoch": 1.4124951493985254, "step": 1820}, {"loss": 1.6865, "grad_norm": 0.3484877049922943, "learning_rate": 0.0002, "epoch": 1.420256111757858, "step": 1830}, {"loss": 1.7205, "grad_norm": 0.3344518840312958, "learning_rate": 0.0002, "epoch": 1.4280170741171905, "step": 1840}, {"loss": 1.6999, "grad_norm": 0.4345698356628418, "learning_rate": 0.0002, "epoch": 1.4357780364765231, "step": 1850}, {"loss": 1.6855, "grad_norm": 0.5525162220001221, "learning_rate": 0.0002, "epoch": 1.4435389988358556, "step": 1860}, {"loss": 1.7143, "grad_norm": 0.37194496393203735, "learning_rate": 0.0002, "epoch": 1.4512999611951882, "step": 1870}, {"loss": 1.7623, "grad_norm": 0.34570157527923584, "learning_rate": 0.0002, "epoch": 1.4590609235545209, "step": 1880}, {"loss": 1.7, "grad_norm": 0.3512282073497772, "learning_rate": 0.0002, "epoch": 1.4668218859138533, "step": 1890}, {"loss": 1.7225, "grad_norm": 0.3443922996520996, "learning_rate": 0.0002, "epoch": 1.4745828482731858, "step": 1900}, {"loss": 1.7393, "grad_norm": 0.3812018036842346, "learning_rate": 0.0002, "epoch": 1.4823438106325184, "step": 1910}, {"loss": 1.7277, "grad_norm": 0.39263492822647095, "learning_rate": 0.0002, "epoch": 1.490104772991851, "step": 1920}, {"loss": 1.6829, "grad_norm": 0.3146156072616577, "learning_rate": 0.0002, "epoch": 1.4978657353511835, "step": 1930}, {"loss": 1.6881, "grad_norm": 0.3653988540172577, "learning_rate": 0.0002, "epoch": 1.505626697710516, "step": 1940}, {"loss": 1.7064, "grad_norm": 0.3966596722602844, "learning_rate": 0.0002, "epoch": 1.5133876600698488, "step": 1950}, {"loss": 1.6942, "grad_norm": 0.3441697359085083, "learning_rate": 0.0002, "epoch": 1.5211486224291813, "step": 1960}, {"loss": 1.7175, "grad_norm": 0.3328564465045929, "learning_rate": 0.0002, "epoch": 1.5289095847885137, "step": 1970}, {"loss": 1.7394, "grad_norm": 0.34068772196769714, "learning_rate": 0.0002, "epoch": 1.5366705471478463, "step": 1980}, {"loss": 1.7016, "grad_norm": 0.3559795916080475, "learning_rate": 0.0002, "epoch": 1.544431509507179, "step": 1990}, {"loss": 1.7102, "grad_norm": 0.37888768315315247, "learning_rate": 0.0002, "epoch": 1.5521924718665114, "step": 2000}, {"loss": 1.7094, "grad_norm": 0.36128363013267517, "learning_rate": 0.0002, "epoch": 1.5599534342258439, "step": 2010}, {"loss": 1.6407, "grad_norm": 0.3643714487552643, "learning_rate": 0.0002, "epoch": 1.5677143965851765, "step": 2020}, {"loss": 1.6777, "grad_norm": 0.3863612115383148, "learning_rate": 0.0002, "epoch": 1.5754753589445092, "step": 2030}, {"loss": 1.6575, "grad_norm": 0.32831457257270813, "learning_rate": 0.0002, "epoch": 1.5832363213038416, "step": 2040}, {"loss": 1.7404, "grad_norm": 0.36098113656044006, "learning_rate": 0.0002, "epoch": 1.5909972836631743, "step": 2050}, {"loss": 1.7065, "grad_norm": 1.1079334020614624, "learning_rate": 0.0002, "epoch": 1.598758246022507, "step": 2060}, {"loss": 1.6824, "grad_norm": 0.35615381598472595, "learning_rate": 0.0002, "epoch": 1.6065192083818394, "step": 2070}, {"loss": 1.7262, "grad_norm": 0.369711309671402, "learning_rate": 0.0002, "epoch": 1.6142801707411718, "step": 2080}, {"loss": 1.6995, "grad_norm": 0.390658438205719, "learning_rate": 0.0002, "epoch": 1.6220411331005045, "step": 2090}, {"loss": 1.6996, "grad_norm": 0.3422999382019043, "learning_rate": 0.0002, "epoch": 1.6298020954598371, "step": 2100}, {"loss": 1.7135, "grad_norm": 0.372475266456604, "learning_rate": 0.0002, "epoch": 1.6375630578191696, "step": 2110}, {"loss": 1.7216, "grad_norm": 0.35660576820373535, "learning_rate": 0.0002, "epoch": 1.645324020178502, "step": 2120}, {"loss": 1.6991, "grad_norm": 0.35754942893981934, "learning_rate": 0.0002, "epoch": 1.6530849825378346, "step": 2130}, {"loss": 1.6779, "grad_norm": 0.34572410583496094, "learning_rate": 0.0002, "epoch": 1.6608459448971673, "step": 2140}, {"loss": 1.6707, "grad_norm": 0.42059701681137085, "learning_rate": 0.0002, "epoch": 1.6686069072564997, "step": 2150}, {"loss": 1.6782, "grad_norm": 0.35200759768486023, "learning_rate": 0.0002, "epoch": 1.6763678696158324, "step": 2160}, {"loss": 1.6869, "grad_norm": 0.3704029321670532, "learning_rate": 0.0002, "epoch": 1.684128831975165, "step": 2170}, {"loss": 1.7192, "grad_norm": 0.40450501441955566, "learning_rate": 0.0002, "epoch": 1.6918897943344975, "step": 2180}, {"loss": 1.6228, "grad_norm": 0.362966924905777, "learning_rate": 0.0002, "epoch": 1.69965075669383, "step": 2190}, {"loss": 1.6935, "grad_norm": 0.36586204171180725, "learning_rate": 0.0002, "epoch": 1.7074117190531626, "step": 2200}, {"loss": 1.6088, "grad_norm": 0.3295372426509857, "learning_rate": 0.0002, "epoch": 1.7151726814124952, "step": 2210}, {"loss": 1.7844, "grad_norm": 0.3892575800418854, "learning_rate": 0.0002, "epoch": 1.7229336437718277, "step": 2220}, {"loss": 1.7805, "grad_norm": 0.34712135791778564, "learning_rate": 0.0002, "epoch": 1.73069460613116, "step": 2230}, {"loss": 1.7353, "grad_norm": 0.34801796078681946, "learning_rate": 0.0002, "epoch": 1.738455568490493, "step": 2240}, {"loss": 1.7009, "grad_norm": 0.3822397291660309, "learning_rate": 0.0002, "epoch": 1.7462165308498254, "step": 2250}, {"loss": 1.6546, "grad_norm": 0.38933250308036804, "learning_rate": 0.0002, "epoch": 1.7539774932091579, "step": 2260}, {"loss": 1.7245, "grad_norm": 0.3798373341560364, "learning_rate": 0.0002, "epoch": 1.7617384555684905, "step": 2270}, {"loss": 1.6508, "grad_norm": 0.35151317715644836, "learning_rate": 0.0002, "epoch": 1.7694994179278232, "step": 2280}, {"loss": 1.6894, "grad_norm": 0.44981494545936584, "learning_rate": 0.0002, "epoch": 1.7772603802871556, "step": 2290}, {"loss": 1.7271, "grad_norm": 0.3992624580860138, "learning_rate": 0.0002, "epoch": 1.785021342646488, "step": 2300}, {"loss": 1.7252, "grad_norm": 0.3772512376308441, "learning_rate": 0.0002, "epoch": 1.7927823050058207, "step": 2310}, {"loss": 1.7057, "grad_norm": 0.3511589467525482, "learning_rate": 0.0002, "epoch": 1.8005432673651534, "step": 2320}, {"loss": 1.764, "grad_norm": 0.3805285394191742, "learning_rate": 0.0002, "epoch": 1.8083042297244858, "step": 2330}, {"loss": 1.6986, "grad_norm": 0.3792071044445038, "learning_rate": 0.0002, "epoch": 1.8160651920838184, "step": 2340}, {"loss": 1.7759, "grad_norm": 0.36430829763412476, "learning_rate": 0.0002, "epoch": 1.823826154443151, "step": 2350}, {"loss": 1.6773, "grad_norm": 0.36502477526664734, "learning_rate": 0.0002, "epoch": 1.8315871168024835, "step": 2360}, {"loss": 1.8072, "grad_norm": 0.35015153884887695, "learning_rate": 0.0002, "epoch": 1.839348079161816, "step": 2370}, {"loss": 1.7734, "grad_norm": 0.3710903823375702, "learning_rate": 0.0002, "epoch": 1.8471090415211486, "step": 2380}, {"loss": 1.6737, "grad_norm": 0.3542828857898712, "learning_rate": 0.0002, "epoch": 1.8548700038804813, "step": 2390}, {"loss": 1.6783, "grad_norm": 0.35467568039894104, "learning_rate": 0.0002, "epoch": 1.8626309662398137, "step": 2400}, {"loss": 1.7773, "grad_norm": 0.3638560473918915, "learning_rate": 0.0002, "epoch": 1.8703919285991462, "step": 2410}, {"loss": 1.7019, "grad_norm": 0.3823298215866089, "learning_rate": 0.0002, "epoch": 1.8781528909584788, "step": 2420}, {"loss": 1.6935, "grad_norm": 0.3926416337490082, "learning_rate": 0.0002, "epoch": 1.8859138533178115, "step": 2430}, {"loss": 1.71, "grad_norm": 0.3608079254627228, "learning_rate": 0.0002, "epoch": 1.893674815677144, "step": 2440}, {"loss": 1.6654, "grad_norm": 0.3426613509654999, "learning_rate": 0.0002, "epoch": 1.9014357780364766, "step": 2450}, {"loss": 1.6892, "grad_norm": 0.3522338569164276, "learning_rate": 0.0002, "epoch": 1.9091967403958092, "step": 2460}, {"loss": 1.7307, "grad_norm": 0.3608049154281616, "learning_rate": 0.0002, "epoch": 1.9169577027551417, "step": 2470}, {"loss": 1.6823, "grad_norm": 0.3849755525588989, "learning_rate": 0.0002, "epoch": 1.924718665114474, "step": 2480}, {"loss": 1.7518, "grad_norm": 0.4154011011123657, "learning_rate": 0.0002, "epoch": 1.9324796274738067, "step": 2490}, {"loss": 1.7381, "grad_norm": 0.3602796792984009, "learning_rate": 0.0002, "epoch": 1.9402405898331394, "step": 2500}, {"loss": 1.7843, "grad_norm": 0.3702992796897888, "learning_rate": 0.0002, "epoch": 1.9480015521924718, "step": 2510}, {"loss": 1.6669, "grad_norm": 0.3657735288143158, "learning_rate": 0.0002, "epoch": 1.9557625145518043, "step": 2520}, {"loss": 1.5964, "grad_norm": 0.41031739115715027, "learning_rate": 0.0002, "epoch": 1.963523476911137, "step": 2530}, {"loss": 1.6745, "grad_norm": 0.34578680992126465, "learning_rate": 0.0002, "epoch": 1.9712844392704696, "step": 2540}, {"loss": 1.723, "grad_norm": 0.3361521065235138, "learning_rate": 0.0002, "epoch": 1.979045401629802, "step": 2550}, {"loss": 1.6868, "grad_norm": 0.34342363476753235, "learning_rate": 0.0002, "epoch": 1.9868063639891347, "step": 2560}, {"loss": 1.6577, "grad_norm": 0.32954007387161255, "learning_rate": 0.0002, "epoch": 1.9945673263484673, "step": 2570}, {"eval_loss": 1.8068748712539673, "eval_runtime": 105.5885, "eval_samples_per_second": 4.802, "eval_steps_per_second": 0.606, "epoch": 2.0, "step": 2577}, {"loss": 1.634, "grad_norm": 0.336302250623703, "learning_rate": 0.0002, "epoch": 2.0023282887077998, "step": 2580}, {"loss": 1.612, "grad_norm": 0.3627048432826996, "learning_rate": 0.0002, "epoch": 2.010089251067132, "step": 2590}, {"loss": 1.4908, "grad_norm": 0.38406702876091003, "learning_rate": 0.0002, "epoch": 2.017850213426465, "step": 2600}, {"loss": 1.5368, "grad_norm": 0.5326781272888184, "learning_rate": 0.0002, "epoch": 2.0256111757857975, "step": 2610}, {"loss": 1.5727, "grad_norm": 0.4774554967880249, "learning_rate": 0.0002, "epoch": 2.03337213814513, "step": 2620}, {"loss": 1.5422, "grad_norm": 0.4251810312271118, "learning_rate": 0.0002, "epoch": 2.0411331005044624, "step": 2630}, {"loss": 1.5152, "grad_norm": 0.4693007171154022, "learning_rate": 0.0002, "epoch": 2.0488940628637953, "step": 2640}, {"loss": 1.6137, "grad_norm": 0.46371519565582275, "learning_rate": 0.0002, "epoch": 2.0566550252231277, "step": 2650}, {"loss": 1.6304, "grad_norm": 0.46652570366859436, "learning_rate": 0.0002, "epoch": 2.06441598758246, "step": 2660}, {"loss": 1.6022, "grad_norm": 0.45200315117836, "learning_rate": 0.0002, "epoch": 2.0721769499417926, "step": 2670}, {"loss": 1.5358, "grad_norm": 0.42905205488204956, "learning_rate": 0.0002, "epoch": 2.0799379123011255, "step": 2680}, {"loss": 1.5401, "grad_norm": 0.44509148597717285, "learning_rate": 0.0002, "epoch": 2.087698874660458, "step": 2690}, {"loss": 1.5303, "grad_norm": 0.4445319175720215, "learning_rate": 0.0002, "epoch": 2.0954598370197903, "step": 2700}, {"loss": 1.5701, "grad_norm": 0.46825504302978516, "learning_rate": 0.0002, "epoch": 2.103220799379123, "step": 2710}, {"loss": 1.5751, "grad_norm": 0.4623856842517853, "learning_rate": 0.0002, "epoch": 2.1109817617384556, "step": 2720}, {"loss": 1.5601, "grad_norm": 0.4833452105522156, "learning_rate": 0.0002, "epoch": 2.118742724097788, "step": 2730}, {"loss": 1.5997, "grad_norm": 0.4582686722278595, "learning_rate": 0.0002, "epoch": 2.1265036864571205, "step": 2740}, {"loss": 1.5801, "grad_norm": 0.47587934136390686, "learning_rate": 0.0002, "epoch": 2.1342646488164534, "step": 2750}, {"loss": 1.594, "grad_norm": 0.4602217972278595, "learning_rate": 0.0002, "epoch": 2.142025611175786, "step": 2760}, {"loss": 1.5271, "grad_norm": 0.47501352429389954, "learning_rate": 0.0002, "epoch": 2.1497865735351183, "step": 2770}, {"loss": 1.4862, "grad_norm": 0.5078499913215637, "learning_rate": 0.0002, "epoch": 2.1575475358944507, "step": 2780}, {"loss": 1.6236, "grad_norm": 0.497704416513443, "learning_rate": 0.0002, "epoch": 2.1653084982537836, "step": 2790}, {"loss": 1.5597, "grad_norm": 0.5435971617698669, "learning_rate": 0.0002, "epoch": 2.173069460613116, "step": 2800}, {"loss": 1.5926, "grad_norm": 0.5172356367111206, "learning_rate": 0.0002, "epoch": 2.1808304229724484, "step": 2810}, {"loss": 1.5202, "grad_norm": 0.44063422083854675, "learning_rate": 0.0002, "epoch": 2.1885913853317813, "step": 2820}, {"loss": 1.6041, "grad_norm": 0.5079569220542908, "learning_rate": 0.0002, "epoch": 2.1963523476911138, "step": 2830}, {"loss": 1.5915, "grad_norm": 0.45658132433891296, "learning_rate": 0.0002, "epoch": 2.204113310050446, "step": 2840}, {"loss": 1.5546, "grad_norm": 0.5103023648262024, "learning_rate": 0.0002, "epoch": 2.2118742724097786, "step": 2850}, {"loss": 1.6197, "grad_norm": 0.4882226288318634, "learning_rate": 0.0002, "epoch": 2.2196352347691115, "step": 2860}, {"loss": 1.5996, "grad_norm": 0.5087296962738037, "learning_rate": 0.0002, "epoch": 2.227396197128444, "step": 2870}, {"loss": 1.5451, "grad_norm": 0.45293712615966797, "learning_rate": 0.0002, "epoch": 2.2351571594877764, "step": 2880}, {"loss": 1.6214, "grad_norm": 0.5120379328727722, "learning_rate": 0.0002, "epoch": 2.242918121847109, "step": 2890}, {"loss": 1.5273, "grad_norm": 0.47126415371894836, "learning_rate": 0.0002, "epoch": 2.2506790842064417, "step": 2900}, {"loss": 1.612, "grad_norm": 0.44005846977233887, "learning_rate": 0.0002, "epoch": 2.258440046565774, "step": 2910}, {"loss": 1.6023, "grad_norm": 0.46476176381111145, "learning_rate": 0.0002, "epoch": 2.2662010089251066, "step": 2920}, {"loss": 1.6417, "grad_norm": 0.48051515221595764, "learning_rate": 0.0002, "epoch": 2.2739619712844394, "step": 2930}, {"loss": 1.587, "grad_norm": 0.480069637298584, "learning_rate": 0.0002, "epoch": 2.281722933643772, "step": 2940}, {"loss": 1.5747, "grad_norm": 0.5122102499008179, "learning_rate": 0.0002, "epoch": 2.2894838960031043, "step": 2950}, {"loss": 1.5183, "grad_norm": 0.48879891633987427, "learning_rate": 0.0002, "epoch": 2.2972448583624367, "step": 2960}, {"loss": 1.5483, "grad_norm": 0.4973136782646179, "learning_rate": 0.0002, "epoch": 2.3050058207217696, "step": 2970}, {"loss": 1.677, "grad_norm": 0.5522695183753967, "learning_rate": 0.0002, "epoch": 2.312766783081102, "step": 2980}, {"loss": 1.5946, "grad_norm": 0.5220217704772949, "learning_rate": 0.0002, "epoch": 2.3205277454404345, "step": 2990}, {"loss": 1.6299, "grad_norm": 0.4978662431240082, "learning_rate": 0.0002, "epoch": 2.328288707799767, "step": 3000}, {"loss": 1.5498, "grad_norm": 0.554053544998169, "learning_rate": 0.0002, "epoch": 2.3360496701591, "step": 3010}, {"loss": 1.5356, "grad_norm": 0.4703886806964874, "learning_rate": 0.0002, "epoch": 2.3438106325184322, "step": 3020}, {"loss": 1.5418, "grad_norm": 0.5074123740196228, "learning_rate": 0.0002, "epoch": 2.3515715948777647, "step": 3030}, {"loss": 1.6873, "grad_norm": 0.5088278651237488, "learning_rate": 0.0002, "epoch": 2.3593325572370976, "step": 3040}, {"loss": 1.5249, "grad_norm": 0.4752114415168762, "learning_rate": 0.0002, "epoch": 2.36709351959643, "step": 3050}, {"loss": 1.5353, "grad_norm": 0.5121659636497498, "learning_rate": 0.0002, "epoch": 2.3748544819557624, "step": 3060}, {"loss": 1.6426, "grad_norm": 0.48649218678474426, "learning_rate": 0.0002, "epoch": 2.3826154443150953, "step": 3070}, {"loss": 1.6136, "grad_norm": 0.5209488868713379, "learning_rate": 0.0002, "epoch": 2.3903764066744277, "step": 3080}, {"loss": 1.597, "grad_norm": 0.5110517740249634, "learning_rate": 0.0002, "epoch": 2.39813736903376, "step": 3090}, {"loss": 1.5773, "grad_norm": 0.5609337091445923, "learning_rate": 0.0002, "epoch": 2.4058983313930926, "step": 3100}, {"loss": 1.5438, "grad_norm": 0.5191826224327087, "learning_rate": 0.0002, "epoch": 2.4136592937524255, "step": 3110}, {"loss": 1.6347, "grad_norm": 0.4876069724559784, "learning_rate": 0.0002, "epoch": 2.421420256111758, "step": 3120}, {"loss": 1.5565, "grad_norm": 0.4713933765888214, "learning_rate": 0.0002, "epoch": 2.4291812184710904, "step": 3130}, {"loss": 1.6388, "grad_norm": 0.5102227330207825, "learning_rate": 0.0002, "epoch": 2.436942180830423, "step": 3140}, {"loss": 1.5667, "grad_norm": 0.44546666741371155, "learning_rate": 0.0002, "epoch": 2.4447031431897557, "step": 3150}, {"loss": 1.5973, "grad_norm": 0.5167558193206787, "learning_rate": 0.0002, "epoch": 2.452464105549088, "step": 3160}, {"loss": 1.5673, "grad_norm": 0.5226958990097046, "learning_rate": 0.0002, "epoch": 2.4602250679084205, "step": 3170}, {"loss": 1.5758, "grad_norm": 0.4751799702644348, "learning_rate": 0.0002, "epoch": 2.4679860302677534, "step": 3180}, {"loss": 1.6234, "grad_norm": 0.4744729697704315, "learning_rate": 0.0002, "epoch": 2.475746992627086, "step": 3190}, {"loss": 1.5661, "grad_norm": 0.5203230381011963, "learning_rate": 0.0002, "epoch": 2.4835079549864183, "step": 3200}, {"loss": 1.493, "grad_norm": 0.47209781408309937, "learning_rate": 0.0002, "epoch": 2.4912689173457507, "step": 3210}, {"loss": 1.6415, "grad_norm": 0.5241674780845642, "learning_rate": 0.0002, "epoch": 2.4990298797050836, "step": 3220}, {"loss": 1.6324, "grad_norm": 0.5152244567871094, "learning_rate": 0.0002, "epoch": 2.506790842064416, "step": 3230}, {"loss": 1.6248, "grad_norm": 0.5216741561889648, "learning_rate": 0.0002, "epoch": 2.5145518044237485, "step": 3240}, {"loss": 1.5668, "grad_norm": 0.4953259527683258, "learning_rate": 0.0002, "epoch": 2.522312766783081, "step": 3250}, {"loss": 1.666, "grad_norm": 0.5973829030990601, "learning_rate": 0.0002, "epoch": 2.530073729142414, "step": 3260}, {"loss": 1.5295, "grad_norm": 0.48804202675819397, "learning_rate": 0.0002, "epoch": 2.5378346915017462, "step": 3270}, {"loss": 1.4954, "grad_norm": 0.5334644317626953, "learning_rate": 0.0002, "epoch": 2.5455956538610787, "step": 3280}, {"loss": 1.5814, "grad_norm": 0.46873313188552856, "learning_rate": 0.0002, "epoch": 2.5533566162204115, "step": 3290}, {"loss": 1.5362, "grad_norm": 0.4282589554786682, "learning_rate": 0.0002, "epoch": 2.561117578579744, "step": 3300}, {"loss": 1.6278, "grad_norm": 0.4848293960094452, "learning_rate": 0.0002, "epoch": 2.5688785409390764, "step": 3310}, {"loss": 1.6308, "grad_norm": 0.5093745589256287, "learning_rate": 0.0002, "epoch": 2.576639503298409, "step": 3320}, {"loss": 1.6375, "grad_norm": 0.5084842443466187, "learning_rate": 0.0002, "epoch": 2.5844004656577413, "step": 3330}, {"loss": 1.6168, "grad_norm": 0.4696281850337982, "learning_rate": 0.0002, "epoch": 2.592161428017074, "step": 3340}, {"loss": 1.5359, "grad_norm": 0.5767765641212463, "learning_rate": 0.0002, "epoch": 2.5999223903764066, "step": 3350}, {"loss": 1.6097, "grad_norm": 0.47300875186920166, "learning_rate": 0.0002, "epoch": 2.607683352735739, "step": 3360}, {"loss": 1.6138, "grad_norm": 0.4809158146381378, "learning_rate": 0.0002, "epoch": 2.615444315095072, "step": 3370}, {"loss": 1.4952, "grad_norm": 0.5141063928604126, "learning_rate": 0.0002, "epoch": 2.6232052774544043, "step": 3380}, {"loss": 1.5784, "grad_norm": 0.4832935035228729, "learning_rate": 0.0002, "epoch": 2.630966239813737, "step": 3390}, {"loss": 1.5796, "grad_norm": 0.5044625401496887, "learning_rate": 0.0002, "epoch": 2.6387272021730697, "step": 3400}, {"loss": 1.6202, "grad_norm": 0.5287680625915527, "learning_rate": 0.0002, "epoch": 2.646488164532402, "step": 3410}, {"loss": 1.5423, "grad_norm": 0.5306379795074463, "learning_rate": 0.0002, "epoch": 2.6542491268917345, "step": 3420}, {"loss": 1.5264, "grad_norm": 0.5849291682243347, "learning_rate": 0.0002, "epoch": 2.662010089251067, "step": 3430}, {"loss": 1.5937, "grad_norm": 0.7951080799102783, "learning_rate": 0.0002, "epoch": 2.6697710516104, "step": 3440}, {"loss": 1.5791, "grad_norm": 0.48087653517723083, "learning_rate": 0.0002, "epoch": 2.6775320139697323, "step": 3450}, {"loss": 1.6769, "grad_norm": 0.5396431684494019, "learning_rate": 0.0002, "epoch": 2.6852929763290647, "step": 3460}, {"loss": 1.606, "grad_norm": 0.5481634736061096, "learning_rate": 0.0002, "epoch": 2.693053938688397, "step": 3470}, {"loss": 1.6436, "grad_norm": 0.5068731307983398, "learning_rate": 0.0002, "epoch": 2.70081490104773, "step": 3480}, {"loss": 1.5738, "grad_norm": 0.5759826898574829, "learning_rate": 0.0002, "epoch": 2.7085758634070625, "step": 3490}, {"loss": 1.596, "grad_norm": 0.7253932952880859, "learning_rate": 0.0002, "epoch": 2.716336825766395, "step": 3500}, {"loss": 1.5791, "grad_norm": 0.527745246887207, "learning_rate": 0.0002, "epoch": 2.724097788125728, "step": 3510}, {"loss": 1.5874, "grad_norm": 0.5279242396354675, "learning_rate": 0.0002, "epoch": 2.73185875048506, "step": 3520}, {"loss": 1.6768, "grad_norm": 0.5047839283943176, "learning_rate": 0.0002, "epoch": 2.7396197128443927, "step": 3530}, {"loss": 1.5517, "grad_norm": 0.5430883169174194, "learning_rate": 0.0002, "epoch": 2.7473806752037255, "step": 3540}, {"loss": 1.5624, "grad_norm": 0.4496723711490631, "learning_rate": 0.0002, "epoch": 2.755141637563058, "step": 3550}, {"loss": 1.5789, "grad_norm": 0.5063338875770569, "learning_rate": 0.0002, "epoch": 2.7629025999223904, "step": 3560}, {"loss": 1.52, "grad_norm": 0.4619026780128479, "learning_rate": 0.0002, "epoch": 2.770663562281723, "step": 3570}, {"loss": 1.5793, "grad_norm": 0.4753304123878479, "learning_rate": 0.0002, "epoch": 2.7784245246410553, "step": 3580}, {"loss": 1.5715, "grad_norm": 0.5422708988189697, "learning_rate": 0.0002, "epoch": 2.786185487000388, "step": 3590}, {"loss": 1.5926, "grad_norm": 0.4756578803062439, "learning_rate": 0.0002, "epoch": 2.7939464493597206, "step": 3600}, {"loss": 1.5358, "grad_norm": 0.5057567358016968, "learning_rate": 0.0002, "epoch": 2.801707411719053, "step": 3610}, {"loss": 1.6131, "grad_norm": 0.5410919785499573, "learning_rate": 0.0002, "epoch": 2.809468374078386, "step": 3620}, {"loss": 1.5573, "grad_norm": 0.4958136975765228, "learning_rate": 0.0002, "epoch": 2.8172293364377183, "step": 3630}, {"loss": 1.6324, "grad_norm": 0.454527348279953, "learning_rate": 0.0002, "epoch": 2.8249902987970508, "step": 3640}, {"loss": 1.5582, "grad_norm": 0.5092706084251404, "learning_rate": 0.0002, "epoch": 2.8327512611563836, "step": 3650}, {"loss": 1.5893, "grad_norm": 0.5314022302627563, "learning_rate": 0.0002, "epoch": 2.840512223515716, "step": 3660}, {"loss": 1.588, "grad_norm": 0.5028239488601685, "learning_rate": 0.0002, "epoch": 2.8482731858750485, "step": 3670}, {"loss": 1.5751, "grad_norm": 0.5127444863319397, "learning_rate": 0.0002, "epoch": 2.856034148234381, "step": 3680}, {"loss": 1.6018, "grad_norm": 0.5045645236968994, "learning_rate": 0.0002, "epoch": 2.8637951105937134, "step": 3690}, {"loss": 1.5788, "grad_norm": 0.5560781955718994, "learning_rate": 0.0002, "epoch": 2.8715560729530463, "step": 3700}, {"loss": 1.5988, "grad_norm": 0.5177600383758545, "learning_rate": 0.0002, "epoch": 2.8793170353123787, "step": 3710}, {"loss": 1.6009, "grad_norm": 0.45830899477005005, "learning_rate": 0.0002, "epoch": 2.887077997671711, "step": 3720}, {"loss": 1.6344, "grad_norm": 0.4828629195690155, "learning_rate": 0.0002, "epoch": 2.894838960031044, "step": 3730}, {"loss": 1.6758, "grad_norm": 0.48241183161735535, "learning_rate": 0.0002, "epoch": 2.9025999223903765, "step": 3740}, {"loss": 1.5649, "grad_norm": 0.4909592568874359, "learning_rate": 0.0002, "epoch": 2.910360884749709, "step": 3750}, {"loss": 1.4927, "grad_norm": 0.44677025079727173, "learning_rate": 0.0002, "epoch": 2.9181218471090418, "step": 3760}, {"loss": 1.5067, "grad_norm": 0.4928834140300751, "learning_rate": 0.0002, "epoch": 2.925882809468374, "step": 3770}, {"loss": 1.5843, "grad_norm": 0.5673553347587585, "learning_rate": 0.0002, "epoch": 2.9336437718277066, "step": 3780}, {"loss": 1.5566, "grad_norm": 0.548190712928772, "learning_rate": 0.0002, "epoch": 2.941404734187039, "step": 3790}, {"loss": 1.5892, "grad_norm": 0.48979803919792175, "learning_rate": 0.0002, "epoch": 2.9491656965463715, "step": 3800}, {"loss": 1.5589, "grad_norm": 0.533191978931427, "learning_rate": 0.0002, "epoch": 2.9569266589057044, "step": 3810}, {"loss": 1.584, "grad_norm": 0.5362946391105652, "learning_rate": 0.0002, "epoch": 2.964687621265037, "step": 3820}, {"loss": 1.6602, "grad_norm": 0.4724906384944916, "learning_rate": 0.0002, "epoch": 2.9724485836243693, "step": 3830}, {"loss": 1.5834, "grad_norm": 0.5468461513519287, "learning_rate": 0.0002, "epoch": 2.980209545983702, "step": 3840}, {"loss": 1.6316, "grad_norm": 0.4697108864784241, "learning_rate": 0.0002, "epoch": 2.9879705083430346, "step": 3850}, {"loss": 1.6312, "grad_norm": 0.4780906140804291, "learning_rate": 0.0002, "epoch": 2.995731470702367, "step": 3860}, {"eval_loss": 1.8472607135772705, "eval_runtime": 106.5541, "eval_samples_per_second": 4.758, "eval_steps_per_second": 0.601, "epoch": 2.9996119518820334, "step": 3865}, {"loss": 1.4983, "grad_norm": 0.5645653605461121, "learning_rate": 0.0002, "epoch": 3.0034924330616994, "step": 3870}, {"loss": 1.4334, "grad_norm": 0.6457151174545288, "learning_rate": 0.0002, "epoch": 3.0112533954210323, "step": 3880}, {"loss": 1.3899, "grad_norm": 0.583838164806366, "learning_rate": 0.0002, "epoch": 3.0190143577803648, "step": 3890}, {"loss": 1.3258, "grad_norm": 0.6819260120391846, "learning_rate": 0.0002, "epoch": 3.026775320139697, "step": 3900}, {"loss": 1.3458, "grad_norm": 0.6692903637886047, "learning_rate": 0.0002, "epoch": 3.03453628249903, "step": 3910}, {"loss": 1.4356, "grad_norm": 0.6101024746894836, "learning_rate": 0.0002, "epoch": 3.0422972448583625, "step": 3920}, {"loss": 1.394, "grad_norm": 0.7014093399047852, "learning_rate": 0.0002, "epoch": 3.050058207217695, "step": 3930}, {"loss": 1.3885, "grad_norm": 0.7380381226539612, "learning_rate": 0.0002, "epoch": 3.0578191695770274, "step": 3940}, {"loss": 1.4206, "grad_norm": 0.6607900857925415, "learning_rate": 0.0002, "epoch": 3.0655801319363603, "step": 3950}, {"loss": 1.4293, "grad_norm": 0.735263466835022, "learning_rate": 0.0002, "epoch": 3.0733410942956927, "step": 3960}, {"loss": 1.3966, "grad_norm": 0.6788513660430908, "learning_rate": 0.0002, "epoch": 3.081102056655025, "step": 3970}, {"loss": 1.3435, "grad_norm": 0.6347652673721313, "learning_rate": 0.0002, "epoch": 3.088863019014358, "step": 3980}, {"loss": 1.4518, "grad_norm": 0.7056642770767212, "learning_rate": 0.0002, "epoch": 3.0966239813736904, "step": 3990}, {"loss": 1.4474, "grad_norm": 0.6387075185775757, "learning_rate": 0.0002, "epoch": 3.104384943733023, "step": 4000}, {"loss": 1.3833, "grad_norm": 0.6701116561889648, "learning_rate": 0.0002, "epoch": 3.1121459060923553, "step": 4010}, {"loss": 1.404, "grad_norm": 0.7558449506759644, "learning_rate": 0.0002, "epoch": 3.119906868451688, "step": 4020}, {"loss": 1.3294, "grad_norm": 0.6612881422042847, "learning_rate": 0.0002, "epoch": 3.1276678308110206, "step": 4030}, {"loss": 1.439, "grad_norm": 0.7474587559700012, "learning_rate": 0.0002, "epoch": 3.135428793170353, "step": 4040}, {"loss": 1.4616, "grad_norm": 0.7292373776435852, "learning_rate": 0.0002, "epoch": 3.1431897555296855, "step": 4050}, {"loss": 1.3908, "grad_norm": 0.7432886958122253, "learning_rate": 0.0002, "epoch": 3.1509507178890184, "step": 4060}, {"loss": 1.4214, "grad_norm": 0.6366098523139954, "learning_rate": 0.0002, "epoch": 3.158711680248351, "step": 4070}, {"loss": 1.5044, "grad_norm": 0.6837611794471741, "learning_rate": 0.0002, "epoch": 3.1664726426076832, "step": 4080}, {"loss": 1.4332, "grad_norm": 0.7194393277168274, "learning_rate": 0.0002, "epoch": 3.174233604967016, "step": 4090}, {"loss": 1.3628, "grad_norm": 0.6963607668876648, "learning_rate": 0.0002, "epoch": 3.1819945673263486, "step": 4100}, {"loss": 1.4127, "grad_norm": 0.6404902935028076, "learning_rate": 0.0002, "epoch": 3.189755529685681, "step": 4110}, {"loss": 1.4394, "grad_norm": 0.7172070741653442, "learning_rate": 0.0002, "epoch": 3.1975164920450134, "step": 4120}, {"loss": 1.4658, "grad_norm": 0.6577759385108948, "learning_rate": 0.0002, "epoch": 3.2052774544043463, "step": 4130}, {"loss": 1.4019, "grad_norm": 0.6658480167388916, "learning_rate": 0.0002, "epoch": 3.2130384167636787, "step": 4140}, {"loss": 1.4348, "grad_norm": 0.6771699786186218, "learning_rate": 0.0002, "epoch": 3.220799379123011, "step": 4150}, {"loss": 1.4736, "grad_norm": 0.699035108089447, "learning_rate": 0.0002, "epoch": 3.2285603414823436, "step": 4160}, {"loss": 1.4096, "grad_norm": 0.7218514680862427, "learning_rate": 0.0002, "epoch": 3.2363213038416765, "step": 4170}, {"loss": 1.3637, "grad_norm": 0.6270631551742554, "learning_rate": 0.0002, "epoch": 3.244082266201009, "step": 4180}, {"loss": 1.4076, "grad_norm": 0.6828921437263489, "learning_rate": 0.0002, "epoch": 3.2518432285603414, "step": 4190}, {"loss": 1.4663, "grad_norm": 0.6005498170852661, "learning_rate": 0.0002, "epoch": 3.2596041909196742, "step": 4200}, {"loss": 1.4798, "grad_norm": 0.6974790692329407, "learning_rate": 0.0002, "epoch": 3.2673651532790067, "step": 4210}, {"loss": 1.5012, "grad_norm": 0.7269543409347534, "learning_rate": 0.0002, "epoch": 3.275126115638339, "step": 4220}, {"loss": 1.3848, "grad_norm": 0.6728787422180176, "learning_rate": 0.0002, "epoch": 3.2828870779976715, "step": 4230}, {"loss": 1.4112, "grad_norm": 0.676972508430481, "learning_rate": 0.0002, "epoch": 3.2906480403570044, "step": 4240}, {"loss": 1.4206, "grad_norm": 0.748309314250946, "learning_rate": 0.0002, "epoch": 3.298409002716337, "step": 4250}, {"loss": 1.4973, "grad_norm": 0.6976589560508728, "learning_rate": 0.0002, "epoch": 3.3061699650756693, "step": 4260}, {"loss": 1.3967, "grad_norm": 0.649780809879303, "learning_rate": 0.0002, "epoch": 3.3139309274350017, "step": 4270}, {"loss": 1.327, "grad_norm": 0.6529902815818787, "learning_rate": 0.0002, "epoch": 3.3216918897943346, "step": 4280}, {"loss": 1.4888, "grad_norm": 0.9273163676261902, "learning_rate": 0.0002, "epoch": 3.329452852153667, "step": 4290}, {"loss": 1.4859, "grad_norm": 0.717024028301239, "learning_rate": 0.0002, "epoch": 3.3372138145129995, "step": 4300}, {"loss": 1.4441, "grad_norm": 0.7914950251579285, "learning_rate": 0.0002, "epoch": 3.3449747768723324, "step": 4310}, {"loss": 1.432, "grad_norm": 0.7133203148841858, "learning_rate": 0.0002, "epoch": 3.352735739231665, "step": 4320}, {"loss": 1.4662, "grad_norm": 0.7409568428993225, "learning_rate": 0.0002, "epoch": 3.3604967015909972, "step": 4330}, {"loss": 1.3992, "grad_norm": 0.6993981003761292, "learning_rate": 0.0002, "epoch": 3.3682576639503297, "step": 4340}, {"loss": 1.4261, "grad_norm": 0.7114535570144653, "learning_rate": 0.0002, "epoch": 3.3760186263096625, "step": 4350}, {"loss": 1.4227, "grad_norm": 0.6790860295295715, "learning_rate": 0.0002, "epoch": 3.383779588668995, "step": 4360}, {"loss": 1.4128, "grad_norm": 0.6507849097251892, "learning_rate": 0.0002, "epoch": 3.3915405510283274, "step": 4370}, {"loss": 1.4559, "grad_norm": 0.5967804193496704, "learning_rate": 0.0002, "epoch": 3.39930151338766, "step": 4380}, {"loss": 1.3687, "grad_norm": 0.6625847816467285, "learning_rate": 0.0002, "epoch": 3.4070624757469927, "step": 4390}, {"loss": 1.4193, "grad_norm": 0.6736508011817932, "learning_rate": 0.0002, "epoch": 3.414823438106325, "step": 4400}, {"loss": 1.4363, "grad_norm": 0.7870860695838928, "learning_rate": 0.0002, "epoch": 3.4225844004656576, "step": 4410}, {"loss": 1.4114, "grad_norm": 0.7205295562744141, "learning_rate": 0.0002, "epoch": 3.4303453628249905, "step": 4420}, {"loss": 1.4131, "grad_norm": 0.6634634137153625, "learning_rate": 0.0002, "epoch": 3.438106325184323, "step": 4430}, {"loss": 1.4683, "grad_norm": 0.7562733292579651, "learning_rate": 0.0002, "epoch": 3.4458672875436553, "step": 4440}, {"loss": 1.3486, "grad_norm": 0.6585879921913147, "learning_rate": 0.0002, "epoch": 3.453628249902988, "step": 4450}, {"loss": 1.4283, "grad_norm": 0.6896792054176331, "learning_rate": 0.0002, "epoch": 3.4613892122623207, "step": 4460}, {"loss": 1.4208, "grad_norm": 0.6520342230796814, "learning_rate": 0.0002, "epoch": 3.469150174621653, "step": 4470}, {"loss": 1.3423, "grad_norm": 0.6760806441307068, "learning_rate": 0.0002, "epoch": 3.4769111369809855, "step": 4480}, {"loss": 1.4398, "grad_norm": 0.7539774179458618, "learning_rate": 0.0002, "epoch": 3.484672099340318, "step": 4490}, {"loss": 1.4534, "grad_norm": 0.7409411668777466, "learning_rate": 0.0002, "epoch": 3.492433061699651, "step": 4500}, {"loss": 1.4069, "grad_norm": 0.6876253485679626, "learning_rate": 0.0002, "epoch": 3.5001940240589833, "step": 4510}, {"loss": 1.4228, "grad_norm": 0.7028461694717407, "learning_rate": 0.0002, "epoch": 3.5079549864183157, "step": 4520}, {"loss": 1.4723, "grad_norm": 0.8056529760360718, "learning_rate": 0.0002, "epoch": 3.5157159487776486, "step": 4530}, {"loss": 1.4148, "grad_norm": 0.711338996887207, "learning_rate": 0.0002, "epoch": 3.523476911136981, "step": 4540}, {"loss": 1.5247, "grad_norm": 0.7343552708625793, "learning_rate": 0.0002, "epoch": 3.5312378734963135, "step": 4550}, {"loss": 1.4308, "grad_norm": 0.745479941368103, "learning_rate": 0.0002, "epoch": 3.5389988358556463, "step": 4560}, {"loss": 1.4229, "grad_norm": 0.7582294940948486, "learning_rate": 0.0002, "epoch": 3.5467597982149788, "step": 4570}, {"loss": 1.4127, "grad_norm": 0.6717444658279419, "learning_rate": 0.0002, "epoch": 3.554520760574311, "step": 4580}, {"loss": 1.4368, "grad_norm": 0.7417883276939392, "learning_rate": 0.0002, "epoch": 3.5622817229336436, "step": 4590}, {"loss": 1.4176, "grad_norm": 0.6385737061500549, "learning_rate": 0.0002, "epoch": 3.570042685292976, "step": 4600}, {"loss": 1.3981, "grad_norm": 0.716704249382019, "learning_rate": 0.0002, "epoch": 3.577803647652309, "step": 4610}, {"loss": 1.3889, "grad_norm": 0.6948980093002319, "learning_rate": 0.0002, "epoch": 3.5855646100116414, "step": 4620}, {"loss": 1.5177, "grad_norm": 0.6961140036582947, "learning_rate": 0.0002, "epoch": 3.593325572370974, "step": 4630}, {"loss": 1.4508, "grad_norm": 0.7493122220039368, "learning_rate": 0.0002, "epoch": 3.6010865347303067, "step": 4640}, {"loss": 1.3987, "grad_norm": 0.7431658506393433, "learning_rate": 0.0002, "epoch": 3.608847497089639, "step": 4650}, {"loss": 1.4551, "grad_norm": 0.8353387713432312, "learning_rate": 0.0002, "epoch": 3.6166084594489716, "step": 4660}, {"loss": 1.4533, "grad_norm": 0.7095612287521362, "learning_rate": 0.0002, "epoch": 3.6243694218083045, "step": 4670}, {"loss": 1.4003, "grad_norm": 0.776620090007782, "learning_rate": 0.0002, "epoch": 3.632130384167637, "step": 4680}, {"loss": 1.4361, "grad_norm": 0.7198925018310547, "learning_rate": 0.0002, "epoch": 3.6398913465269693, "step": 4690}, {"loss": 1.4543, "grad_norm": 0.8238834738731384, "learning_rate": 0.0002, "epoch": 3.6476523088863018, "step": 4700}, {"loss": 1.3958, "grad_norm": 0.6804245710372925, "learning_rate": 0.0002, "epoch": 3.655413271245634, "step": 4710}, {"loss": 1.4158, "grad_norm": 0.8444845676422119, "learning_rate": 0.0002, "epoch": 3.663174233604967, "step": 4720}, {"loss": 1.3825, "grad_norm": 0.743797779083252, "learning_rate": 0.0002, "epoch": 3.6709351959642995, "step": 4730}, {"loss": 1.4213, "grad_norm": 0.8994188904762268, "learning_rate": 0.0002, "epoch": 3.678696158323632, "step": 4740}, {"loss": 1.4281, "grad_norm": 0.75416100025177, "learning_rate": 0.0002, "epoch": 3.686457120682965, "step": 4750}, {"loss": 1.4154, "grad_norm": 0.6499266028404236, "learning_rate": 0.0002, "epoch": 3.6942180830422973, "step": 4760}, {"loss": 1.4005, "grad_norm": 0.7246791124343872, "learning_rate": 0.0002, "epoch": 3.7019790454016297, "step": 4770}, {"loss": 1.426, "grad_norm": 0.7831124067306519, "learning_rate": 0.0002, "epoch": 3.7097400077609626, "step": 4780}, {"loss": 1.3933, "grad_norm": 0.7130028009414673, "learning_rate": 0.0002, "epoch": 3.717500970120295, "step": 4790}, {"loss": 1.4632, "grad_norm": 0.7501602172851562, "learning_rate": 0.0002, "epoch": 3.7252619324796274, "step": 4800}, {"loss": 1.4985, "grad_norm": 0.6980932950973511, "learning_rate": 0.0002, "epoch": 3.73302289483896, "step": 4810}, {"loss": 1.4517, "grad_norm": 0.8050530552864075, "learning_rate": 0.0002, "epoch": 3.7407838571982923, "step": 4820}, {"loss": 1.4703, "grad_norm": 0.6385579705238342, "learning_rate": 0.0002, "epoch": 3.748544819557625, "step": 4830}, {"loss": 1.5281, "grad_norm": 0.6664714813232422, "learning_rate": 0.0002, "epoch": 3.7563057819169576, "step": 4840}, {"loss": 1.4443, "grad_norm": 0.7125676274299622, "learning_rate": 0.0002, "epoch": 3.76406674427629, "step": 4850}, {"loss": 1.3958, "grad_norm": 0.7231866717338562, "learning_rate": 0.0002, "epoch": 3.771827706635623, "step": 4860}, {"loss": 1.4446, "grad_norm": 0.6917183995246887, "learning_rate": 0.0002, "epoch": 3.7795886689949554, "step": 4870}, {"loss": 1.4369, "grad_norm": 0.665037989616394, "learning_rate": 0.0002, "epoch": 3.787349631354288, "step": 4880}, {"loss": 1.4193, "grad_norm": 0.5837726593017578, "learning_rate": 0.0002, "epoch": 3.7951105937136207, "step": 4890}, {"loss": 1.4176, "grad_norm": 0.6366701722145081, "learning_rate": 0.0002, "epoch": 3.802871556072953, "step": 4900}, {"loss": 1.46, "grad_norm": 0.7082223892211914, "learning_rate": 0.0002, "epoch": 3.8106325184322856, "step": 4910}, {"loss": 1.5139, "grad_norm": 0.8101672530174255, "learning_rate": 0.0002, "epoch": 3.818393480791618, "step": 4920}, {"loss": 1.3659, "grad_norm": 0.7516148090362549, "learning_rate": 0.0002, "epoch": 3.826154443150951, "step": 4930}, {"loss": 1.3909, "grad_norm": 0.7928489446640015, "learning_rate": 0.0002, "epoch": 3.8339154055102833, "step": 4940}, {"loss": 1.4255, "grad_norm": 0.6892234683036804, "learning_rate": 0.0002, "epoch": 3.8416763678696157, "step": 4950}, {"loss": 1.5024, "grad_norm": 0.6381304264068604, "learning_rate": 0.0002, "epoch": 3.849437330228948, "step": 4960}, {"loss": 1.4873, "grad_norm": 0.8068831562995911, "learning_rate": 0.0002, "epoch": 3.857198292588281, "step": 4970}, {"loss": 1.45, "grad_norm": 0.7289869785308838, "learning_rate": 0.0002, "epoch": 3.8649592549476135, "step": 4980}, {"loss": 1.398, "grad_norm": 0.7278549075126648, "learning_rate": 0.0002, "epoch": 3.872720217306946, "step": 4990}, {"loss": 1.4442, "grad_norm": 0.7324236631393433, "learning_rate": 0.0002, "epoch": 3.880481179666279, "step": 5000}, {"loss": 1.4511, "grad_norm": 0.6759871244430542, "learning_rate": 0.0002, "epoch": 3.8882421420256112, "step": 5010}, {"loss": 1.4705, "grad_norm": 0.8159207701683044, "learning_rate": 0.0002, "epoch": 3.8960031043849437, "step": 5020}, {"loss": 1.4685, "grad_norm": 0.6536211967468262, "learning_rate": 0.0002, "epoch": 3.9037640667442766, "step": 5030}, {"loss": 1.4335, "grad_norm": 0.6827932000160217, "learning_rate": 0.0002, "epoch": 3.911525029103609, "step": 5040}, {"loss": 1.433, "grad_norm": 0.6688340306282043, "learning_rate": 0.0002, "epoch": 3.9192859914629414, "step": 5050}, {"loss": 1.4099, "grad_norm": 0.6385695934295654, "learning_rate": 0.0002, "epoch": 3.927046953822274, "step": 5060}, {"loss": 1.4767, "grad_norm": 0.6975107192993164, "learning_rate": 0.0002, "epoch": 3.9348079161816063, "step": 5070}, {"loss": 1.4893, "grad_norm": 0.6684112548828125, "learning_rate": 0.0002, "epoch": 3.942568878540939, "step": 5080}, {"loss": 1.4732, "grad_norm": 0.8349628448486328, "learning_rate": 0.0002, "epoch": 3.9503298409002716, "step": 5090}, {"loss": 1.5131, "grad_norm": 0.7146425843238831, "learning_rate": 0.0002, "epoch": 3.958090803259604, "step": 5100}, {"loss": 1.4149, "grad_norm": 0.6555036902427673, "learning_rate": 0.0002, "epoch": 3.965851765618937, "step": 5110}, {"loss": 1.4274, "grad_norm": 0.7037415504455566, "learning_rate": 0.0002, "epoch": 3.9736127279782694, "step": 5120}, {"loss": 1.4292, "grad_norm": 0.7235575914382935, "learning_rate": 0.0002, "epoch": 3.981373690337602, "step": 5130}, {"loss": 1.4455, "grad_norm": 0.7092325687408447, "learning_rate": 0.0002, "epoch": 3.9891346526969347, "step": 5140}, {"loss": 1.4512, "grad_norm": 0.7490319609642029, "learning_rate": 0.0002, "epoch": 3.996895615056267, "step": 5150}, {"eval_loss": 1.9131355285644531, "eval_runtime": 105.5778, "eval_samples_per_second": 4.802, "eval_steps_per_second": 0.606, "epoch": 4.0, "step": 5154}, {"loss": 1.2643, "grad_norm": 0.7075854539871216, "learning_rate": 0.0002, "epoch": 4.0046565774155995, "step": 5160}, {"loss": 1.209, "grad_norm": 0.9466007351875305, "learning_rate": 0.0002, "epoch": 4.012417539774932, "step": 5170}, {"loss": 1.2567, "grad_norm": 1.0297044515609741, "learning_rate": 0.0002, "epoch": 4.020178502134264, "step": 5180}, {"loss": 1.1796, "grad_norm": 0.7765059471130371, "learning_rate": 0.0002, "epoch": 4.027939464493597, "step": 5190}, {"loss": 1.2356, "grad_norm": 0.995760977268219, "learning_rate": 0.0002, "epoch": 4.03570042685293, "step": 5200}, {"loss": 1.1792, "grad_norm": 0.8663829565048218, "learning_rate": 0.0002, "epoch": 4.043461389212262, "step": 5210}, {"loss": 1.2471, "grad_norm": 1.0660825967788696, "learning_rate": 0.0002, "epoch": 4.051222351571595, "step": 5220}, {"loss": 1.1676, "grad_norm": 0.9858174920082092, "learning_rate": 0.0002, "epoch": 4.058983313930927, "step": 5230}, {"loss": 1.2448, "grad_norm": 0.8911338448524475, "learning_rate": 0.0002, "epoch": 4.06674427629026, "step": 5240}, {"loss": 1.1858, "grad_norm": 1.0848394632339478, "learning_rate": 0.0002, "epoch": 4.074505238649593, "step": 5250}, {"loss": 1.1684, "grad_norm": 1.0849905014038086, "learning_rate": 0.0002, "epoch": 4.082266201008925, "step": 5260}, {"loss": 1.2007, "grad_norm": 1.0497841835021973, "learning_rate": 0.0002, "epoch": 4.090027163368258, "step": 5270}, {"loss": 1.2552, "grad_norm": 0.8943053483963013, "learning_rate": 0.0002, "epoch": 4.0977881257275905, "step": 5280}, {"loss": 1.1923, "grad_norm": 0.8432527184486389, "learning_rate": 0.0002, "epoch": 4.1055490880869225, "step": 5290}, {"loss": 1.1634, "grad_norm": 0.9690414667129517, "learning_rate": 0.0002, "epoch": 4.113310050446255, "step": 5300}, {"loss": 1.3019, "grad_norm": 0.7790773510932922, "learning_rate": 0.0002, "epoch": 4.121071012805588, "step": 5310}, {"loss": 1.1806, "grad_norm": 0.9289211630821228, "learning_rate": 0.0002, "epoch": 4.12883197516492, "step": 5320}, {"loss": 1.1458, "grad_norm": 1.0785125494003296, "learning_rate": 0.0002, "epoch": 4.136592937524253, "step": 5330}, {"loss": 1.2086, "grad_norm": 0.8559591770172119, "learning_rate": 0.0002, "epoch": 4.144353899883585, "step": 5340}, {"loss": 1.1974, "grad_norm": 0.9405956268310547, "learning_rate": 0.0002, "epoch": 4.152114862242918, "step": 5350}, {"loss": 1.1793, "grad_norm": 0.9942827820777893, "learning_rate": 0.0002, "epoch": 4.159875824602251, "step": 5360}, {"loss": 1.1659, "grad_norm": 0.9141933917999268, "learning_rate": 0.0002, "epoch": 4.167636786961583, "step": 5370}, {"loss": 1.1647, "grad_norm": 0.8206015229225159, "learning_rate": 0.0002, "epoch": 4.175397749320916, "step": 5380}, {"loss": 1.2778, "grad_norm": 0.9340888857841492, "learning_rate": 0.0002, "epoch": 4.183158711680249, "step": 5390}, {"loss": 1.2459, "grad_norm": 1.2122114896774292, "learning_rate": 0.0002, "epoch": 4.190919674039581, "step": 5400}, {"loss": 1.2371, "grad_norm": 1.0661298036575317, "learning_rate": 0.0002, "epoch": 4.1986806363989135, "step": 5410}, {"loss": 1.1978, "grad_norm": 0.9372861385345459, "learning_rate": 0.0002, "epoch": 4.206441598758246, "step": 5420}, {"loss": 1.2653, "grad_norm": 0.894012987613678, "learning_rate": 0.0002, "epoch": 4.214202561117578, "step": 5430}, {"loss": 1.387, "grad_norm": 1.0647753477096558, "learning_rate": 0.0002, "epoch": 4.221963523476911, "step": 5440}, {"loss": 1.2231, "grad_norm": 0.989179790019989, "learning_rate": 0.0002, "epoch": 4.229724485836243, "step": 5450}, {"loss": 1.2715, "grad_norm": 1.1601181030273438, "learning_rate": 0.0002, "epoch": 4.237485448195576, "step": 5460}, {"loss": 1.2406, "grad_norm": 0.9395585656166077, "learning_rate": 0.0002, "epoch": 4.245246410554909, "step": 5470}, {"loss": 1.2779, "grad_norm": 0.9527766108512878, "learning_rate": 0.0002, "epoch": 4.253007372914241, "step": 5480}, {"loss": 1.267, "grad_norm": 1.0319520235061646, "learning_rate": 0.0002, "epoch": 4.260768335273574, "step": 5490}, {"loss": 1.2633, "grad_norm": 0.8659824728965759, "learning_rate": 0.0002, "epoch": 4.268529297632907, "step": 5500}, {"loss": 1.1475, "grad_norm": 1.099211573600769, "learning_rate": 0.0002, "epoch": 4.276290259992239, "step": 5510}, {"loss": 1.2508, "grad_norm": 0.9363361597061157, "learning_rate": 0.0002, "epoch": 4.284051222351572, "step": 5520}, {"loss": 1.189, "grad_norm": 0.8437647223472595, "learning_rate": 0.0002, "epoch": 4.2918121847109045, "step": 5530}, {"loss": 1.2212, "grad_norm": 0.9181258678436279, "learning_rate": 0.0002, "epoch": 4.2995731470702365, "step": 5540}, {"loss": 1.2092, "grad_norm": 0.9059357643127441, "learning_rate": 0.0002, "epoch": 4.307334109429569, "step": 5550}, {"loss": 1.2189, "grad_norm": 0.9337241649627686, "learning_rate": 0.0002, "epoch": 4.315095071788901, "step": 5560}, {"loss": 1.2462, "grad_norm": 0.9428889155387878, "learning_rate": 0.0002, "epoch": 4.322856034148234, "step": 5570}, {"loss": 1.2675, "grad_norm": 1.003589153289795, "learning_rate": 0.0002, "epoch": 4.330616996507567, "step": 5580}, {"loss": 1.2703, "grad_norm": 1.1249268054962158, "learning_rate": 0.0002, "epoch": 4.338377958866899, "step": 5590}, {"loss": 1.2501, "grad_norm": 0.8623469471931458, "learning_rate": 0.0002, "epoch": 4.346138921226232, "step": 5600}, {"loss": 1.2404, "grad_norm": 1.1389174461364746, "learning_rate": 0.0002, "epoch": 4.353899883585565, "step": 5610}, {"loss": 1.2245, "grad_norm": 1.0136264562606812, "learning_rate": 0.0002, "epoch": 4.361660845944897, "step": 5620}, {"loss": 1.3473, "grad_norm": 0.9567070603370667, "learning_rate": 0.0002, "epoch": 4.36942180830423, "step": 5630}, {"loss": 1.2988, "grad_norm": 1.0592148303985596, "learning_rate": 0.0002, "epoch": 4.377182770663563, "step": 5640}, {"loss": 1.212, "grad_norm": 1.0110485553741455, "learning_rate": 0.0002, "epoch": 4.384943733022895, "step": 5650}, {"loss": 1.2086, "grad_norm": 0.9914907217025757, "learning_rate": 0.0002, "epoch": 4.3927046953822275, "step": 5660}, {"loss": 1.2363, "grad_norm": 0.9447247982025146, "learning_rate": 0.0002, "epoch": 4.4004656577415595, "step": 5670}, {"loss": 1.2617, "grad_norm": 0.9644378423690796, "learning_rate": 0.0002, "epoch": 4.408226620100892, "step": 5680}, {"loss": 1.2773, "grad_norm": 0.920676589012146, "learning_rate": 0.0002, "epoch": 4.415987582460225, "step": 5690}, {"loss": 1.2792, "grad_norm": 1.060570478439331, "learning_rate": 0.0002, "epoch": 4.423748544819557, "step": 5700}, {"loss": 1.2374, "grad_norm": 0.8857738971710205, "learning_rate": 0.0002, "epoch": 4.43150950717889, "step": 5710}, {"loss": 1.2588, "grad_norm": 1.0536398887634277, "learning_rate": 0.0002, "epoch": 4.439270469538223, "step": 5720}, {"loss": 1.2051, "grad_norm": 0.990847110748291, "learning_rate": 0.0002, "epoch": 4.447031431897555, "step": 5730}, {"loss": 1.2469, "grad_norm": 0.9692499041557312, "learning_rate": 0.0002, "epoch": 4.454792394256888, "step": 5740}, {"loss": 1.2269, "grad_norm": 1.0376402139663696, "learning_rate": 0.0002, "epoch": 4.462553356616221, "step": 5750}, {"loss": 1.1701, "grad_norm": 1.3863259553909302, "learning_rate": 0.0002, "epoch": 4.470314318975553, "step": 5760}, {"loss": 1.2591, "grad_norm": 0.978379487991333, "learning_rate": 0.0002, "epoch": 4.478075281334886, "step": 5770}, {"loss": 1.2729, "grad_norm": 1.0973085165023804, "learning_rate": 0.0002, "epoch": 4.485836243694218, "step": 5780}, {"loss": 1.2404, "grad_norm": 1.057006597518921, "learning_rate": 0.0002, "epoch": 4.4935972060535505, "step": 5790}, {"loss": 1.2476, "grad_norm": 0.9247729182243347, "learning_rate": 0.0002, "epoch": 4.501358168412883, "step": 5800}, {"loss": 1.2369, "grad_norm": 1.0447787046432495, "learning_rate": 0.0002, "epoch": 4.509119130772215, "step": 5810}, {"loss": 1.211, "grad_norm": 1.1930429935455322, "learning_rate": 0.0002, "epoch": 4.516880093131548, "step": 5820}, {"loss": 1.2596, "grad_norm": 0.9867590069770813, "learning_rate": 0.0002, "epoch": 4.524641055490881, "step": 5830}, {"loss": 1.2766, "grad_norm": 0.9591100215911865, "learning_rate": 0.0002, "epoch": 4.532402017850213, "step": 5840}, {"loss": 1.2154, "grad_norm": 0.9950753450393677, "learning_rate": 0.0002, "epoch": 4.540162980209546, "step": 5850}, {"loss": 1.2149, "grad_norm": 1.0087506771087646, "learning_rate": 0.0002, "epoch": 4.547923942568879, "step": 5860}, {"loss": 1.3165, "grad_norm": 1.0934417247772217, "learning_rate": 0.0002, "epoch": 4.555684904928211, "step": 5870}, {"loss": 1.3059, "grad_norm": 1.107987403869629, "learning_rate": 0.0002, "epoch": 4.563445867287544, "step": 5880}, {"loss": 1.2184, "grad_norm": 0.9147276878356934, "learning_rate": 0.0002, "epoch": 4.571206829646876, "step": 5890}, {"loss": 1.24, "grad_norm": 1.036780595779419, "learning_rate": 0.0002, "epoch": 4.578967792006209, "step": 5900}, {"loss": 1.2209, "grad_norm": 0.9284719824790955, "learning_rate": 0.0002, "epoch": 4.5867287543655415, "step": 5910}, {"loss": 1.3693, "grad_norm": 0.9141898155212402, "learning_rate": 0.0002, "epoch": 4.5944897167248735, "step": 5920}, {"loss": 1.2319, "grad_norm": 1.0447357892990112, "learning_rate": 0.0002, "epoch": 4.602250679084206, "step": 5930}, {"loss": 1.2667, "grad_norm": 0.9309114217758179, "learning_rate": 0.0002, "epoch": 4.610011641443539, "step": 5940}, {"loss": 1.2827, "grad_norm": 1.2986129522323608, "learning_rate": 0.0002, "epoch": 4.617772603802871, "step": 5950}, {"loss": 1.312, "grad_norm": 0.9221704602241516, "learning_rate": 0.0002, "epoch": 4.625533566162204, "step": 5960}, {"loss": 1.2769, "grad_norm": 0.9228187799453735, "learning_rate": 0.0002, "epoch": 4.633294528521537, "step": 5970}, {"loss": 1.2953, "grad_norm": 0.9483116269111633, "learning_rate": 0.0002, "epoch": 4.641055490880869, "step": 5980}, {"loss": 1.3437, "grad_norm": 1.0218974351882935, "learning_rate": 0.0002, "epoch": 4.648816453240202, "step": 5990}, {"loss": 1.3085, "grad_norm": 0.9764600396156311, "learning_rate": 0.0002, "epoch": 4.656577415599534, "step": 6000}, {"loss": 1.197, "grad_norm": 0.9115710258483887, "learning_rate": 0.0002, "epoch": 4.664338377958867, "step": 6010}, {"loss": 1.1917, "grad_norm": 0.9245651364326477, "learning_rate": 0.0002, "epoch": 4.6720993403182, "step": 6020}, {"loss": 1.2969, "grad_norm": 0.9686311483383179, "learning_rate": 0.0002, "epoch": 4.6798603026775325, "step": 6030}, {"loss": 1.2702, "grad_norm": 1.1807392835617065, "learning_rate": 0.0002, "epoch": 4.6876212650368645, "step": 6040}, {"loss": 1.328, "grad_norm": 1.0358641147613525, "learning_rate": 0.0002, "epoch": 4.695382227396197, "step": 6050}, {"loss": 1.3281, "grad_norm": 0.987332284450531, "learning_rate": 0.0002, "epoch": 4.703143189755529, "step": 6060}, {"loss": 1.2514, "grad_norm": 1.0526494979858398, "learning_rate": 0.0002, "epoch": 4.710904152114862, "step": 6070}, {"loss": 1.2246, "grad_norm": 1.0276758670806885, "learning_rate": 0.0002, "epoch": 4.718665114474195, "step": 6080}, {"loss": 1.3367, "grad_norm": 0.9904406666755676, "learning_rate": 0.0002, "epoch": 4.726426076833527, "step": 6090}, {"loss": 1.2797, "grad_norm": 1.0084882974624634, "learning_rate": 0.0002, "epoch": 4.73418703919286, "step": 6100}, {"loss": 1.2656, "grad_norm": 0.8646450638771057, "learning_rate": 0.0002, "epoch": 4.741948001552192, "step": 6110}, {"loss": 1.3063, "grad_norm": 0.9233377575874329, "learning_rate": 0.0002, "epoch": 4.749708963911525, "step": 6120}, {"loss": 1.2642, "grad_norm": 0.9675140976905823, "learning_rate": 0.0002, "epoch": 4.757469926270858, "step": 6130}, {"loss": 1.3367, "grad_norm": 0.9639796018600464, "learning_rate": 0.0002, "epoch": 4.765230888630191, "step": 6140}, {"loss": 1.276, "grad_norm": 0.925199568271637, "learning_rate": 0.0002, "epoch": 4.772991850989523, "step": 6150}, {"loss": 1.2441, "grad_norm": 1.050901174545288, "learning_rate": 0.0002, "epoch": 4.7807528133488555, "step": 6160}, {"loss": 1.301, "grad_norm": 0.8920623660087585, "learning_rate": 0.0002, "epoch": 4.7885137757081875, "step": 6170}, {"loss": 1.263, "grad_norm": 0.8964757919311523, "learning_rate": 0.0002, "epoch": 4.79627473806752, "step": 6180}, {"loss": 1.2787, "grad_norm": 1.0839070081710815, "learning_rate": 0.0002, "epoch": 4.804035700426853, "step": 6190}, {"loss": 1.2664, "grad_norm": 0.8809942007064819, "learning_rate": 0.0002, "epoch": 4.811796662786185, "step": 6200}, {"loss": 1.321, "grad_norm": 1.0216195583343506, "learning_rate": 0.0002, "epoch": 4.819557625145518, "step": 6210}, {"loss": 1.3033, "grad_norm": 0.892005980014801, "learning_rate": 0.0002, "epoch": 4.827318587504851, "step": 6220}, {"loss": 1.2602, "grad_norm": 0.9957166910171509, "learning_rate": 0.0002, "epoch": 4.835079549864183, "step": 6230}, {"loss": 1.3562, "grad_norm": 0.9720533490180969, "learning_rate": 0.0002, "epoch": 4.842840512223516, "step": 6240}, {"loss": 1.2651, "grad_norm": 0.9336182475090027, "learning_rate": 0.0002, "epoch": 4.850601474582849, "step": 6250}, {"loss": 1.3136, "grad_norm": 1.2611457109451294, "learning_rate": 0.0002, "epoch": 4.858362436942181, "step": 6260}, {"loss": 1.2234, "grad_norm": 0.8927203416824341, "learning_rate": 0.0002, "epoch": 4.866123399301514, "step": 6270}, {"loss": 1.3463, "grad_norm": 0.9706710577011108, "learning_rate": 0.0002, "epoch": 4.873884361660846, "step": 6280}, {"loss": 1.3209, "grad_norm": 1.1461690664291382, "learning_rate": 0.0002, "epoch": 4.8816453240201785, "step": 6290}, {"loss": 1.2566, "grad_norm": 0.9930381178855896, "learning_rate": 0.0002, "epoch": 4.889406286379511, "step": 6300}, {"loss": 1.2568, "grad_norm": 0.91451096534729, "learning_rate": 0.0002, "epoch": 4.897167248738843, "step": 6310}, {"loss": 1.2836, "grad_norm": 1.0319571495056152, "learning_rate": 0.0002, "epoch": 4.904928211098176, "step": 6320}, {"loss": 1.2908, "grad_norm": 0.990140438079834, "learning_rate": 0.0002, "epoch": 4.912689173457509, "step": 6330}, {"loss": 1.3299, "grad_norm": 1.2466117143630981, "learning_rate": 0.0002, "epoch": 4.920450135816841, "step": 6340}, {"loss": 1.2659, "grad_norm": 1.0316979885101318, "learning_rate": 0.0002, "epoch": 4.928211098176174, "step": 6350}, {"loss": 1.3292, "grad_norm": 1.0643759965896606, "learning_rate": 0.0002, "epoch": 4.935972060535507, "step": 6360}, {"loss": 1.2559, "grad_norm": 0.9703279733657837, "learning_rate": 0.0002, "epoch": 4.943733022894839, "step": 6370}, {"loss": 1.2155, "grad_norm": 0.9767927527427673, "learning_rate": 0.0002, "epoch": 4.951493985254172, "step": 6380}, {"loss": 1.2437, "grad_norm": 0.960854172706604, "learning_rate": 0.0002, "epoch": 4.959254947613504, "step": 6390}, {"loss": 1.3314, "grad_norm": 0.9922910332679749, "learning_rate": 0.0002, "epoch": 4.967015909972837, "step": 6400}, {"loss": 1.3018, "grad_norm": 0.956470787525177, "learning_rate": 0.0002, "epoch": 4.9747768723321695, "step": 6410}, {"loss": 1.2794, "grad_norm": 0.9637242555618286, "learning_rate": 0.0002, "epoch": 4.9825378346915015, "step": 6420}, {"loss": 1.3236, "grad_norm": 1.0855202674865723, "learning_rate": 0.0002, "epoch": 4.990298797050834, "step": 6430}, {"loss": 1.3015, "grad_norm": 0.9655316472053528, "learning_rate": 0.0002, "epoch": 4.998059759410167, "step": 6440}, {"eval_loss": 2.0410802364349365, "eval_runtime": 113.04, "eval_samples_per_second": 4.485, "eval_steps_per_second": 0.566, "epoch": 4.9996119518820334, "step": 6442}, {"loss": 1.0846, "grad_norm": 1.1676199436187744, "learning_rate": 0.0002, "epoch": 5.005820721769499, "step": 6450}, {"loss": 1.041, "grad_norm": 1.4317965507507324, "learning_rate": 0.0002, "epoch": 5.013581684128832, "step": 6460}, {"loss": 0.9546, "grad_norm": 1.460443377494812, "learning_rate": 0.0002, "epoch": 5.021342646488165, "step": 6470}, {"loss": 1.0014, "grad_norm": 1.2299214601516724, "learning_rate": 0.0002, "epoch": 5.029103608847497, "step": 6480}, {"loss": 1.0397, "grad_norm": 1.3125724792480469, "learning_rate": 0.0002, "epoch": 5.03686457120683, "step": 6490}, {"loss": 1.0134, "grad_norm": 1.1252319812774658, "learning_rate": 0.0002, "epoch": 5.044625533566162, "step": 6500}, {"loss": 0.976, "grad_norm": 0.9970866441726685, "learning_rate": 0.0002, "epoch": 5.052386495925495, "step": 6510}, {"loss": 0.9731, "grad_norm": 1.229069709777832, "learning_rate": 0.0002, "epoch": 5.060147458284828, "step": 6520}, {"loss": 1.0498, "grad_norm": 1.2430938482284546, "learning_rate": 0.0002, "epoch": 5.06790842064416, "step": 6530}, {"loss": 1.0236, "grad_norm": 1.0522737503051758, "learning_rate": 0.0002, "epoch": 5.0756693830034925, "step": 6540}, {"loss": 1.0221, "grad_norm": 1.108890175819397, "learning_rate": 0.0002, "epoch": 5.083430345362825, "step": 6550}, {"loss": 1.0177, "grad_norm": 1.156912922859192, "learning_rate": 0.0002, "epoch": 5.091191307722157, "step": 6560}, {"loss": 1.0415, "grad_norm": 1.405895709991455, "learning_rate": 0.0002, "epoch": 5.09895227008149, "step": 6570}, {"loss": 0.9811, "grad_norm": 1.2005155086517334, "learning_rate": 0.0002, "epoch": 5.106713232440823, "step": 6580}, {"loss": 0.9862, "grad_norm": 1.181443452835083, "learning_rate": 0.0002, "epoch": 5.114474194800155, "step": 6590}, {"loss": 1.0291, "grad_norm": 2.3444771766662598, "learning_rate": 0.0002, "epoch": 5.122235157159488, "step": 6600}, {"loss": 1.0455, "grad_norm": 1.216988444328308, "learning_rate": 0.0002, "epoch": 5.12999611951882, "step": 6610}, {"loss": 1.0549, "grad_norm": 1.369553565979004, "learning_rate": 0.0002, "epoch": 5.137757081878153, "step": 6620}, {"loss": 1.0056, "grad_norm": 1.177964687347412, "learning_rate": 0.0002, "epoch": 5.145518044237486, "step": 6630}, {"loss": 1.1025, "grad_norm": 1.1397041082382202, "learning_rate": 0.0002, "epoch": 5.153279006596818, "step": 6640}, {"loss": 1.0437, "grad_norm": 1.3976861238479614, "learning_rate": 0.0002, "epoch": 5.161039968956151, "step": 6650}, {"loss": 1.0454, "grad_norm": 1.4824495315551758, "learning_rate": 0.0002, "epoch": 5.1688009313154835, "step": 6660}, {"loss": 1.0356, "grad_norm": 1.2653018236160278, "learning_rate": 0.0002, "epoch": 5.1765618936748155, "step": 6670}, {"loss": 0.9971, "grad_norm": 1.3106069564819336, "learning_rate": 0.0002, "epoch": 5.184322856034148, "step": 6680}, {"loss": 1.0561, "grad_norm": 1.3140279054641724, "learning_rate": 0.0002, "epoch": 5.192083818393481, "step": 6690}, {"loss": 1.0618, "grad_norm": 1.3900256156921387, "learning_rate": 0.0002, "epoch": 5.199844780752813, "step": 6700}, {"loss": 1.0285, "grad_norm": 1.3191124200820923, "learning_rate": 0.0002, "epoch": 5.207605743112146, "step": 6710}, {"loss": 0.9921, "grad_norm": 1.176107406616211, "learning_rate": 0.0002, "epoch": 5.215366705471478, "step": 6720}, {"loss": 1.064, "grad_norm": 1.2364883422851562, "learning_rate": 0.0002, "epoch": 5.223127667830811, "step": 6730}, {"loss": 0.9599, "grad_norm": 1.343022108078003, "learning_rate": 0.0002, "epoch": 5.230888630190144, "step": 6740}, {"loss": 1.0342, "grad_norm": 1.2826898097991943, "learning_rate": 0.0002, "epoch": 5.238649592549476, "step": 6750}, {"loss": 1.0703, "grad_norm": 1.500257134437561, "learning_rate": 0.0002, "epoch": 5.246410554908809, "step": 6760}, {"loss": 1.0114, "grad_norm": 1.2605743408203125, "learning_rate": 0.0002, "epoch": 5.254171517268142, "step": 6770}, {"loss": 1.0825, "grad_norm": 1.2355525493621826, "learning_rate": 0.0002, "epoch": 5.261932479627474, "step": 6780}, {"loss": 1.0436, "grad_norm": 1.2845789194107056, "learning_rate": 0.0002, "epoch": 5.2696934419868064, "step": 6790}, {"loss": 0.989, "grad_norm": 1.3696625232696533, "learning_rate": 0.0002, "epoch": 5.277454404346139, "step": 6800}, {"loss": 1.0991, "grad_norm": 1.4051260948181152, "learning_rate": 0.0002, "epoch": 5.285215366705471, "step": 6810}, {"loss": 1.0987, "grad_norm": 1.266725778579712, "learning_rate": 0.0002, "epoch": 5.292976329064804, "step": 6820}, {"loss": 1.0489, "grad_norm": 1.3475236892700195, "learning_rate": 0.0002, "epoch": 5.300737291424136, "step": 6830}, {"loss": 1.0264, "grad_norm": 1.54409921169281, "learning_rate": 0.0002, "epoch": 5.308498253783469, "step": 6840}, {"loss": 1.033, "grad_norm": 1.2391985654830933, "learning_rate": 0.0002, "epoch": 5.316259216142802, "step": 6850}, {"loss": 1.1058, "grad_norm": 1.2435699701309204, "learning_rate": 0.0002, "epoch": 5.324020178502134, "step": 6860}, {"loss": 1.0179, "grad_norm": 1.8803037405014038, "learning_rate": 0.0002, "epoch": 5.331781140861467, "step": 6870}, {"loss": 0.997, "grad_norm": 1.4195542335510254, "learning_rate": 0.0002, "epoch": 5.3395421032208, "step": 6880}, {"loss": 1.0273, "grad_norm": 1.1853394508361816, "learning_rate": 0.0002, "epoch": 5.347303065580132, "step": 6890}, {"loss": 1.0668, "grad_norm": 1.4016530513763428, "learning_rate": 0.0002, "epoch": 5.355064027939465, "step": 6900}, {"loss": 1.1099, "grad_norm": 1.294339895248413, "learning_rate": 0.0002, "epoch": 5.3628249902987974, "step": 6910}, {"loss": 1.0724, "grad_norm": 1.2952708005905151, "learning_rate": 0.0002, "epoch": 5.370585952658129, "step": 6920}, {"loss": 1.0098, "grad_norm": 1.1361510753631592, "learning_rate": 0.0002, "epoch": 5.378346915017462, "step": 6930}, {"loss": 1.0796, "grad_norm": 1.125805377960205, "learning_rate": 0.0002, "epoch": 5.386107877376794, "step": 6940}, {"loss": 1.122, "grad_norm": 1.1453300714492798, "learning_rate": 0.0002, "epoch": 5.393868839736127, "step": 6950}, {"loss": 1.0977, "grad_norm": 1.4542768001556396, "learning_rate": 0.0002, "epoch": 5.40162980209546, "step": 6960}, {"loss": 1.0825, "grad_norm": 1.2360988855361938, "learning_rate": 0.0002, "epoch": 5.409390764454792, "step": 6970}, {"loss": 1.0631, "grad_norm": 1.2182754278182983, "learning_rate": 0.0002, "epoch": 5.417151726814125, "step": 6980}, {"loss": 1.0471, "grad_norm": 1.2018693685531616, "learning_rate": 0.0002, "epoch": 5.424912689173458, "step": 6990}, {"loss": 1.108, "grad_norm": 1.346124291419983, "learning_rate": 0.0002, "epoch": 5.43267365153279, "step": 7000}, {"loss": 1.0534, "grad_norm": 1.2534189224243164, "learning_rate": 0.0002, "epoch": 5.440434613892123, "step": 7010}, {"loss": 1.0696, "grad_norm": 1.2033339738845825, "learning_rate": 0.0002, "epoch": 5.448195576251456, "step": 7020}, {"loss": 1.0714, "grad_norm": 1.2788134813308716, "learning_rate": 0.0002, "epoch": 5.4559565386107876, "step": 7030}, {"loss": 1.1274, "grad_norm": 1.2751542329788208, "learning_rate": 0.0002, "epoch": 5.46371750097012, "step": 7040}, {"loss": 1.0767, "grad_norm": 1.3237019777297974, "learning_rate": 0.0002, "epoch": 5.471478463329452, "step": 7050}, {"loss": 1.1081, "grad_norm": 1.4932852983474731, "learning_rate": 0.0002, "epoch": 5.479239425688785, "step": 7060}, {"loss": 1.0197, "grad_norm": 1.4003876447677612, "learning_rate": 0.0002, "epoch": 5.487000388048118, "step": 7070}, {"loss": 1.0662, "grad_norm": 1.404799461364746, "learning_rate": 0.0002, "epoch": 5.49476135040745, "step": 7080}, {"loss": 1.0354, "grad_norm": 1.4486982822418213, "learning_rate": 0.0002, "epoch": 5.502522312766783, "step": 7090}, {"loss": 1.0645, "grad_norm": 1.1713480949401855, "learning_rate": 0.0002, "epoch": 5.510283275126116, "step": 7100}, {"loss": 1.006, "grad_norm": 1.4062601327896118, "learning_rate": 0.0002, "epoch": 5.518044237485448, "step": 7110}, {"loss": 1.0459, "grad_norm": 1.211629867553711, "learning_rate": 0.0002, "epoch": 5.525805199844781, "step": 7120}, {"loss": 1.102, "grad_norm": 1.2523176670074463, "learning_rate": 0.0002, "epoch": 5.533566162204114, "step": 7130}, {"loss": 1.1132, "grad_norm": 1.4467198848724365, "learning_rate": 0.0002, "epoch": 5.541327124563446, "step": 7140}, {"loss": 1.1557, "grad_norm": 1.5961614847183228, "learning_rate": 0.0002, "epoch": 5.5490880869227786, "step": 7150}, {"loss": 1.0859, "grad_norm": 1.320656418800354, "learning_rate": 0.0002, "epoch": 5.5568490492821105, "step": 7160}, {"loss": 1.109, "grad_norm": 1.2423332929611206, "learning_rate": 0.0002, "epoch": 5.564610011641443, "step": 7170}, {"loss": 1.0046, "grad_norm": 1.2919669151306152, "learning_rate": 0.0002, "epoch": 5.572370974000776, "step": 7180}, {"loss": 1.046, "grad_norm": 1.1678385734558105, "learning_rate": 0.0002, "epoch": 5.580131936360108, "step": 7190}, {"loss": 1.1011, "grad_norm": 1.4250764846801758, "learning_rate": 0.0002, "epoch": 5.587892898719441, "step": 7200}, {"loss": 1.1254, "grad_norm": 1.5308716297149658, "learning_rate": 0.0002, "epoch": 5.595653861078774, "step": 7210}, {"loss": 1.121, "grad_norm": 1.2678815126419067, "learning_rate": 0.0002, "epoch": 5.603414823438106, "step": 7220}, {"loss": 1.0846, "grad_norm": 1.127856969833374, "learning_rate": 0.0002, "epoch": 5.611175785797439, "step": 7230}, {"loss": 1.0647, "grad_norm": 1.3832560777664185, "learning_rate": 0.0002, "epoch": 5.618936748156772, "step": 7240}, {"loss": 1.0658, "grad_norm": 1.3226919174194336, "learning_rate": 0.0002, "epoch": 5.626697710516104, "step": 7250}, {"loss": 1.1175, "grad_norm": 1.3418006896972656, "learning_rate": 0.0002, "epoch": 5.634458672875437, "step": 7260}, {"loss": 1.0956, "grad_norm": 1.2625300884246826, "learning_rate": 0.0002, "epoch": 5.642219635234769, "step": 7270}, {"loss": 1.067, "grad_norm": 1.1579464673995972, "learning_rate": 0.0002, "epoch": 5.6499805975941015, "step": 7280}, {"loss": 1.0447, "grad_norm": 1.4998650550842285, "learning_rate": 0.0002, "epoch": 5.657741559953434, "step": 7290}, {"loss": 1.1256, "grad_norm": 1.2670758962631226, "learning_rate": 0.0002, "epoch": 5.665502522312766, "step": 7300}, {"loss": 1.1267, "grad_norm": 1.2959760427474976, "learning_rate": 0.0002, "epoch": 5.673263484672099, "step": 7310}, {"loss": 1.1387, "grad_norm": 1.2460671663284302, "learning_rate": 0.0002, "epoch": 5.681024447031432, "step": 7320}, {"loss": 1.0756, "grad_norm": 1.1313989162445068, "learning_rate": 0.0002, "epoch": 5.688785409390764, "step": 7330}, {"loss": 1.0618, "grad_norm": 1.282527208328247, "learning_rate": 0.0002, "epoch": 5.696546371750097, "step": 7340}, {"loss": 1.1315, "grad_norm": 1.3380206823349, "learning_rate": 0.0002, "epoch": 5.70430733410943, "step": 7350}, {"loss": 1.0949, "grad_norm": 1.1648279428482056, "learning_rate": 0.0002, "epoch": 5.712068296468762, "step": 7360}, {"loss": 1.1705, "grad_norm": 1.3059816360473633, "learning_rate": 0.0002, "epoch": 5.719829258828095, "step": 7370}, {"loss": 1.1496, "grad_norm": 1.1905046701431274, "learning_rate": 0.0002, "epoch": 5.727590221187427, "step": 7380}, {"loss": 1.1356, "grad_norm": 1.4089630842208862, "learning_rate": 0.0002, "epoch": 5.73535118354676, "step": 7390}, {"loss": 1.1349, "grad_norm": 1.256721019744873, "learning_rate": 0.0002, "epoch": 5.7431121459060925, "step": 7400}, {"loss": 1.0682, "grad_norm": 1.1915162801742554, "learning_rate": 0.0002, "epoch": 5.7508731082654245, "step": 7410}, {"loss": 1.1257, "grad_norm": 1.1935480833053589, "learning_rate": 0.0002, "epoch": 5.758634070624757, "step": 7420}, {"loss": 1.1348, "grad_norm": 1.1761008501052856, "learning_rate": 0.0002, "epoch": 5.76639503298409, "step": 7430}, {"loss": 1.0837, "grad_norm": 1.2540549039840698, "learning_rate": 0.0002, "epoch": 5.774155995343422, "step": 7440}, {"loss": 1.1527, "grad_norm": 1.5295120477676392, "learning_rate": 0.0002, "epoch": 5.781916957702755, "step": 7450}, {"loss": 1.1146, "grad_norm": 1.1081160306930542, "learning_rate": 0.0002, "epoch": 5.789677920062088, "step": 7460}, {"loss": 1.1304, "grad_norm": 1.4381253719329834, "learning_rate": 0.0002, "epoch": 5.79743888242142, "step": 7470}, {"loss": 1.0684, "grad_norm": 1.3079341650009155, "learning_rate": 0.0002, "epoch": 5.805199844780753, "step": 7480}, {"loss": 1.0544, "grad_norm": 1.1372792720794678, "learning_rate": 0.0002, "epoch": 5.812960807140085, "step": 7490}, {"loss": 1.1622, "grad_norm": 1.3221744298934937, "learning_rate": 0.0002, "epoch": 5.820721769499418, "step": 7500}, {"loss": 1.1515, "grad_norm": 1.3436939716339111, "learning_rate": 0.0002, "epoch": 5.828482731858751, "step": 7510}, {"loss": 1.1154, "grad_norm": 1.3916879892349243, "learning_rate": 0.0002, "epoch": 5.8362436942180835, "step": 7520}, {"loss": 1.0816, "grad_norm": 1.2463704347610474, "learning_rate": 0.0002, "epoch": 5.8440046565774155, "step": 7530}, {"loss": 1.0745, "grad_norm": 1.097051739692688, "learning_rate": 0.0002, "epoch": 5.851765618936748, "step": 7540}, {"loss": 1.1454, "grad_norm": 1.1554739475250244, "learning_rate": 0.0002, "epoch": 5.85952658129608, "step": 7550}, {"loss": 1.0953, "grad_norm": 1.2384694814682007, "learning_rate": 0.0002, "epoch": 5.867287543655413, "step": 7560}, {"loss": 1.1734, "grad_norm": 1.142815351486206, "learning_rate": 0.0002, "epoch": 5.875048506014746, "step": 7570}, {"loss": 1.162, "grad_norm": 1.3637062311172485, "learning_rate": 0.0002, "epoch": 5.882809468374078, "step": 7580}, {"loss": 1.0781, "grad_norm": 1.2449073791503906, "learning_rate": 0.0002, "epoch": 5.890570430733411, "step": 7590}, {"loss": 1.1191, "grad_norm": 1.358058214187622, "learning_rate": 0.0002, "epoch": 5.898331393092743, "step": 7600}, {"loss": 1.0779, "grad_norm": 1.264655351638794, "learning_rate": 0.0002, "epoch": 5.906092355452076, "step": 7610}, {"loss": 1.1538, "grad_norm": 1.3186019659042358, "learning_rate": 0.0002, "epoch": 5.913853317811409, "step": 7620}, {"loss": 1.1076, "grad_norm": 1.4111460447311401, "learning_rate": 0.0002, "epoch": 5.921614280170742, "step": 7630}, {"loss": 1.1765, "grad_norm": 1.1078972816467285, "learning_rate": 0.0002, "epoch": 5.929375242530074, "step": 7640}, {"loss": 1.1305, "grad_norm": 1.2742213010787964, "learning_rate": 0.0002, "epoch": 5.9371362048894065, "step": 7650}, {"loss": 1.144, "grad_norm": 1.3412781953811646, "learning_rate": 0.0002, "epoch": 5.9448971672487385, "step": 7660}, {"loss": 1.1642, "grad_norm": 1.123005986213684, "learning_rate": 0.0002, "epoch": 5.952658129608071, "step": 7670}, {"loss": 1.0732, "grad_norm": 1.2203444242477417, "learning_rate": 0.0002, "epoch": 5.960419091967404, "step": 7680}, {"loss": 1.158, "grad_norm": 1.341011643409729, "learning_rate": 0.0002, "epoch": 5.968180054326736, "step": 7690}, {"loss": 1.1144, "grad_norm": 1.2689454555511475, "learning_rate": 0.0002, "epoch": 5.975941016686069, "step": 7700}, {"loss": 1.2051, "grad_norm": 1.1518112421035767, "learning_rate": 0.0002, "epoch": 5.983701979045401, "step": 7710}, {"loss": 1.1868, "grad_norm": 1.3698320388793945, "learning_rate": 0.0002, "epoch": 5.991462941404734, "step": 7720}, {"loss": 1.0651, "grad_norm": 1.2812788486480713, "learning_rate": 0.0002, "epoch": 5.999223903764067, "step": 7730}]} +{"epoch": 6.9996119518820334, "step": 9019, "epoch_duration": 3941.118304014206, "total_accumulated_duration": 25912.183322429657, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28774.0}, "peak_memory_reserved": {"GPU_0": 28774.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0855, "grad_norm": 1.0751162767410278, "learning_rate": 0.0002, "epoch": 0.007760962359332557, "step": 10}, {"loss": 2.4744, "grad_norm": 0.4697345793247223, "learning_rate": 0.0002, "epoch": 0.015521924718665115, "step": 20}, {"loss": 2.193, "grad_norm": 0.5370839238166809, "learning_rate": 0.0002, "epoch": 0.023282887077997673, "step": 30}, {"loss": 2.0599, "grad_norm": 0.46794816851615906, "learning_rate": 0.0002, "epoch": 0.03104384943733023, "step": 40}, {"loss": 1.9354, "grad_norm": 0.44624820351600647, "learning_rate": 0.0002, "epoch": 0.038804811796662786, "step": 50}, {"loss": 1.9319, "grad_norm": 0.3953201472759247, "learning_rate": 0.0002, "epoch": 0.046565774155995346, "step": 60}, {"loss": 1.9099, "grad_norm": 0.3935912549495697, "learning_rate": 0.0002, "epoch": 0.0543267365153279, "step": 70}, {"loss": 1.8795, "grad_norm": 0.4520699381828308, "learning_rate": 0.0002, "epoch": 0.06208769887466046, "step": 80}, {"loss": 1.8354, "grad_norm": 0.3801847994327545, "learning_rate": 0.0002, "epoch": 0.06984866123399301, "step": 90}, {"loss": 1.9053, "grad_norm": 0.4020165205001831, "learning_rate": 0.0002, "epoch": 0.07760962359332557, "step": 100}, {"loss": 1.8779, "grad_norm": 0.3860672116279602, "learning_rate": 0.0002, "epoch": 0.08537058595265813, "step": 110}, {"loss": 1.8731, "grad_norm": 0.3681113123893738, "learning_rate": 0.0002, "epoch": 0.09313154831199069, "step": 120}, {"loss": 1.8157, "grad_norm": 0.3594866991043091, "learning_rate": 0.0002, "epoch": 0.10089251067132324, "step": 130}, {"loss": 1.8266, "grad_norm": 0.3879193663597107, "learning_rate": 0.0002, "epoch": 0.1086534730306558, "step": 140}, {"loss": 1.8818, "grad_norm": 0.3270505666732788, "learning_rate": 0.0002, "epoch": 0.11641443538998836, "step": 150}, {"loss": 1.87, "grad_norm": 0.36824458837509155, "learning_rate": 0.0002, "epoch": 0.12417539774932092, "step": 160}, {"loss": 1.8305, "grad_norm": 0.383882075548172, "learning_rate": 0.0002, "epoch": 0.13193636010865348, "step": 170}, {"loss": 1.8584, "grad_norm": 0.3368665874004364, "learning_rate": 0.0002, "epoch": 0.13969732246798602, "step": 180}, {"loss": 1.7882, "grad_norm": 0.35961097478866577, "learning_rate": 0.0002, "epoch": 0.1474582848273186, "step": 190}, {"loss": 1.8467, "grad_norm": 0.3415963351726532, "learning_rate": 0.0002, "epoch": 0.15521924718665114, "step": 200}, {"loss": 1.8543, "grad_norm": 0.4100632071495056, "learning_rate": 0.0002, "epoch": 0.1629802095459837, "step": 210}, {"loss": 1.8226, "grad_norm": 0.3516307473182678, "learning_rate": 0.0002, "epoch": 0.17074117190531626, "step": 220}, {"loss": 1.7386, "grad_norm": 0.37919050455093384, "learning_rate": 0.0002, "epoch": 0.1785021342646488, "step": 230}, {"loss": 1.7937, "grad_norm": 0.33270683884620667, "learning_rate": 0.0002, "epoch": 0.18626309662398138, "step": 240}, {"loss": 1.7925, "grad_norm": 0.3348783254623413, "learning_rate": 0.0002, "epoch": 0.19402405898331393, "step": 250}, {"loss": 1.7774, "grad_norm": 0.3888475298881531, "learning_rate": 0.0002, "epoch": 0.20178502134264648, "step": 260}, {"loss": 1.8381, "grad_norm": 0.3554602861404419, "learning_rate": 0.0002, "epoch": 0.20954598370197905, "step": 270}, {"loss": 1.8359, "grad_norm": 0.33277708292007446, "learning_rate": 0.0002, "epoch": 0.2173069460613116, "step": 280}, {"loss": 1.7713, "grad_norm": 0.3281584680080414, "learning_rate": 0.0002, "epoch": 0.22506790842064417, "step": 290}, {"loss": 1.8181, "grad_norm": 0.3185969591140747, "learning_rate": 0.0002, "epoch": 0.23282887077997672, "step": 300}, {"loss": 1.8595, "grad_norm": 0.35335442423820496, "learning_rate": 0.0002, "epoch": 0.24058983313930926, "step": 310}, {"loss": 1.87, "grad_norm": 0.3119595944881439, "learning_rate": 0.0002, "epoch": 0.24835079549864184, "step": 320}, {"loss": 1.8357, "grad_norm": 0.36424458026885986, "learning_rate": 0.0002, "epoch": 0.2561117578579744, "step": 330}, {"loss": 1.8003, "grad_norm": 0.3618951141834259, "learning_rate": 0.0002, "epoch": 0.26387272021730696, "step": 340}, {"loss": 1.8221, "grad_norm": 0.312757670879364, "learning_rate": 0.0002, "epoch": 0.2716336825766395, "step": 350}, {"loss": 1.9031, "grad_norm": 0.326016366481781, "learning_rate": 0.0002, "epoch": 0.27939464493597205, "step": 360}, {"loss": 1.8214, "grad_norm": 0.34093883633613586, "learning_rate": 0.0002, "epoch": 0.2871556072953046, "step": 370}, {"loss": 1.7733, "grad_norm": 0.32325029373168945, "learning_rate": 0.0002, "epoch": 0.2949165696546372, "step": 380}, {"loss": 1.842, "grad_norm": 0.34105437994003296, "learning_rate": 0.0002, "epoch": 0.30267753201396974, "step": 390}, {"loss": 1.7926, "grad_norm": 0.32565295696258545, "learning_rate": 0.0002, "epoch": 0.3104384943733023, "step": 400}, {"loss": 1.8031, "grad_norm": 0.32742050290107727, "learning_rate": 0.0002, "epoch": 0.31819945673263483, "step": 410}, {"loss": 1.907, "grad_norm": 0.30233046412467957, "learning_rate": 0.0002, "epoch": 0.3259604190919674, "step": 420}, {"loss": 1.7623, "grad_norm": 0.32419222593307495, "learning_rate": 0.0002, "epoch": 0.3337213814513, "step": 430}, {"loss": 1.865, "grad_norm": 0.3653007745742798, "learning_rate": 0.0002, "epoch": 0.3414823438106325, "step": 440}, {"loss": 1.8044, "grad_norm": 0.31617099046707153, "learning_rate": 0.0002, "epoch": 0.3492433061699651, "step": 450}, {"loss": 1.7677, "grad_norm": 0.3305962085723877, "learning_rate": 0.0002, "epoch": 0.3570042685292976, "step": 460}, {"loss": 1.8155, "grad_norm": 0.3178933262825012, "learning_rate": 0.0002, "epoch": 0.36476523088863017, "step": 470}, {"loss": 1.7485, "grad_norm": 0.37163782119750977, "learning_rate": 0.0002, "epoch": 0.37252619324796277, "step": 480}, {"loss": 1.8804, "grad_norm": 0.469844788312912, "learning_rate": 0.0002, "epoch": 0.3802871556072953, "step": 490}, {"loss": 1.8343, "grad_norm": 0.3409338593482971, "learning_rate": 0.0002, "epoch": 0.38804811796662786, "step": 500}, {"loss": 1.8433, "grad_norm": 0.31943467259407043, "learning_rate": 0.0002, "epoch": 0.3958090803259604, "step": 510}, {"loss": 1.7873, "grad_norm": 0.32293614745140076, "learning_rate": 0.0002, "epoch": 0.40357004268529295, "step": 520}, {"loss": 1.8584, "grad_norm": 0.2994382977485657, "learning_rate": 0.0002, "epoch": 0.41133100504462555, "step": 530}, {"loss": 1.8153, "grad_norm": 0.3273141384124756, "learning_rate": 0.0002, "epoch": 0.4190919674039581, "step": 540}, {"loss": 1.8097, "grad_norm": 0.3020550012588501, "learning_rate": 0.0002, "epoch": 0.42685292976329064, "step": 550}, {"loss": 1.8551, "grad_norm": 0.30113112926483154, "learning_rate": 0.0002, "epoch": 0.4346138921226232, "step": 560}, {"loss": 1.8084, "grad_norm": 0.30274903774261475, "learning_rate": 0.0002, "epoch": 0.44237485448195574, "step": 570}, {"loss": 1.7673, "grad_norm": 0.3231128454208374, "learning_rate": 0.0002, "epoch": 0.45013581684128834, "step": 580}, {"loss": 1.7848, "grad_norm": 0.3255121409893036, "learning_rate": 0.0002, "epoch": 0.4578967792006209, "step": 590}, {"loss": 1.8227, "grad_norm": 0.30147507786750793, "learning_rate": 0.0002, "epoch": 0.46565774155995343, "step": 600}, {"loss": 1.7572, "grad_norm": 0.29781386256217957, "learning_rate": 0.0002, "epoch": 0.473418703919286, "step": 610}, {"loss": 1.8307, "grad_norm": 0.30914685130119324, "learning_rate": 0.0002, "epoch": 0.4811796662786185, "step": 620}, {"loss": 1.805, "grad_norm": 0.3110593855381012, "learning_rate": 0.0002, "epoch": 0.4889406286379511, "step": 630}, {"loss": 1.8228, "grad_norm": 0.3298132121562958, "learning_rate": 0.0002, "epoch": 0.49670159099728367, "step": 640}, {"loss": 1.7816, "grad_norm": 0.322122186422348, "learning_rate": 0.0002, "epoch": 0.5044625533566163, "step": 650}, {"loss": 1.8001, "grad_norm": 0.3504371643066406, "learning_rate": 0.0002, "epoch": 0.5122235157159488, "step": 660}, {"loss": 1.8682, "grad_norm": 0.3102182149887085, "learning_rate": 0.0002, "epoch": 0.5199844780752814, "step": 670}, {"loss": 1.7494, "grad_norm": 0.6113658547401428, "learning_rate": 0.0002, "epoch": 0.5277454404346139, "step": 680}, {"loss": 1.7096, "grad_norm": 0.31841862201690674, "learning_rate": 0.0002, "epoch": 0.5355064027939465, "step": 690}, {"loss": 1.7587, "grad_norm": 0.2830526530742645, "learning_rate": 0.0002, "epoch": 0.543267365153279, "step": 700}, {"loss": 1.7887, "grad_norm": 0.3048769533634186, "learning_rate": 0.0002, "epoch": 0.5510283275126115, "step": 710}, {"loss": 1.8416, "grad_norm": 0.2719033658504486, "learning_rate": 0.0002, "epoch": 0.5587892898719441, "step": 720}, {"loss": 1.786, "grad_norm": 0.3176722526550293, "learning_rate": 0.0002, "epoch": 0.5665502522312766, "step": 730}, {"loss": 1.7127, "grad_norm": 0.32491734623908997, "learning_rate": 0.0002, "epoch": 0.5743112145906092, "step": 740}, {"loss": 1.7892, "grad_norm": 0.32746851444244385, "learning_rate": 0.0002, "epoch": 0.5820721769499418, "step": 750}, {"loss": 1.7811, "grad_norm": 0.3055773973464966, "learning_rate": 0.0002, "epoch": 0.5898331393092744, "step": 760}, {"loss": 1.8597, "grad_norm": 0.30671584606170654, "learning_rate": 0.0002, "epoch": 0.5975941016686069, "step": 770}, {"loss": 1.7728, "grad_norm": 0.28770264983177185, "learning_rate": 0.0002, "epoch": 0.6053550640279395, "step": 780}, {"loss": 1.7025, "grad_norm": 0.2814285457134247, "learning_rate": 0.0002, "epoch": 0.613116026387272, "step": 790}, {"loss": 1.819, "grad_norm": 0.31554412841796875, "learning_rate": 0.0002, "epoch": 0.6208769887466046, "step": 800}, {"loss": 1.8335, "grad_norm": 0.2984226942062378, "learning_rate": 0.0002, "epoch": 0.6286379511059371, "step": 810}, {"loss": 1.7728, "grad_norm": 0.2859906554222107, "learning_rate": 0.0002, "epoch": 0.6363989134652697, "step": 820}, {"loss": 1.7408, "grad_norm": 0.2887928783893585, "learning_rate": 0.0002, "epoch": 0.6441598758246022, "step": 830}, {"loss": 1.8071, "grad_norm": 0.31287339329719543, "learning_rate": 0.0002, "epoch": 0.6519208381839348, "step": 840}, {"loss": 1.7673, "grad_norm": 0.32064181566238403, "learning_rate": 0.0002, "epoch": 0.6596818005432674, "step": 850}, {"loss": 1.7849, "grad_norm": 0.290981650352478, "learning_rate": 0.0002, "epoch": 0.6674427629026, "step": 860}, {"loss": 1.8089, "grad_norm": 0.33060121536254883, "learning_rate": 0.0002, "epoch": 0.6752037252619325, "step": 870}, {"loss": 1.7357, "grad_norm": 0.27032899856567383, "learning_rate": 0.0002, "epoch": 0.682964687621265, "step": 880}, {"loss": 1.8423, "grad_norm": 0.29031234979629517, "learning_rate": 0.0002, "epoch": 0.6907256499805976, "step": 890}, {"loss": 1.835, "grad_norm": 0.2845142185688019, "learning_rate": 0.0002, "epoch": 0.6984866123399301, "step": 900}, {"loss": 1.77, "grad_norm": 0.8638312816619873, "learning_rate": 0.0002, "epoch": 0.7062475746992627, "step": 910}, {"loss": 1.7757, "grad_norm": 0.3086668848991394, "learning_rate": 0.0002, "epoch": 0.7140085370585952, "step": 920}, {"loss": 1.7541, "grad_norm": 0.2724177837371826, "learning_rate": 0.0002, "epoch": 0.7217694994179278, "step": 930}, {"loss": 1.816, "grad_norm": 0.289559006690979, "learning_rate": 0.0002, "epoch": 0.7295304617772603, "step": 940}, {"loss": 1.7654, "grad_norm": 0.3000658452510834, "learning_rate": 0.0002, "epoch": 0.737291424136593, "step": 950}, {"loss": 1.7736, "grad_norm": 0.33544042706489563, "learning_rate": 0.0002, "epoch": 0.7450523864959255, "step": 960}, {"loss": 1.6979, "grad_norm": 0.28593236207962036, "learning_rate": 0.0002, "epoch": 0.7528133488552581, "step": 970}, {"loss": 1.8583, "grad_norm": 0.313634991645813, "learning_rate": 0.0002, "epoch": 0.7605743112145906, "step": 980}, {"loss": 1.7473, "grad_norm": 0.2949385941028595, "learning_rate": 0.0002, "epoch": 0.7683352735739232, "step": 990}, {"loss": 1.8689, "grad_norm": 0.2920108437538147, "learning_rate": 0.0002, "epoch": 0.7760962359332557, "step": 1000}, {"loss": 1.8401, "grad_norm": 0.3245100677013397, "learning_rate": 0.0002, "epoch": 0.7838571982925883, "step": 1010}, {"loss": 1.7109, "grad_norm": 0.3007619380950928, "learning_rate": 0.0002, "epoch": 0.7916181606519208, "step": 1020}, {"loss": 1.7427, "grad_norm": 0.3630852997303009, "learning_rate": 0.0002, "epoch": 0.7993791230112534, "step": 1030}, {"loss": 1.7655, "grad_norm": 0.2856379747390747, "learning_rate": 0.0002, "epoch": 0.8071400853705859, "step": 1040}, {"loss": 1.8371, "grad_norm": 0.32476478815078735, "learning_rate": 0.0002, "epoch": 0.8149010477299186, "step": 1050}, {"loss": 1.8039, "grad_norm": 0.5162565112113953, "learning_rate": 0.0002, "epoch": 0.8226620100892511, "step": 1060}, {"loss": 1.8862, "grad_norm": 0.316496342420578, "learning_rate": 0.0002, "epoch": 0.8304229724485837, "step": 1070}, {"loss": 1.8023, "grad_norm": 0.31977516412734985, "learning_rate": 0.0002, "epoch": 0.8381839348079162, "step": 1080}, {"loss": 1.8547, "grad_norm": 0.269509494304657, "learning_rate": 0.0002, "epoch": 0.8459448971672487, "step": 1090}, {"loss": 1.7811, "grad_norm": 0.31621453166007996, "learning_rate": 0.0002, "epoch": 0.8537058595265813, "step": 1100}, {"loss": 1.739, "grad_norm": 0.2946535050868988, "learning_rate": 0.0002, "epoch": 0.8614668218859138, "step": 1110}, {"loss": 1.7511, "grad_norm": 0.3088909983634949, "learning_rate": 0.0002, "epoch": 0.8692277842452464, "step": 1120}, {"loss": 1.8228, "grad_norm": 0.33033716678619385, "learning_rate": 0.0002, "epoch": 0.8769887466045789, "step": 1130}, {"loss": 1.7912, "grad_norm": 0.2954833507537842, "learning_rate": 0.0002, "epoch": 0.8847497089639115, "step": 1140}, {"loss": 1.8394, "grad_norm": 0.2950248122215271, "learning_rate": 0.0002, "epoch": 0.8925106713232441, "step": 1150}, {"loss": 1.7068, "grad_norm": 0.296661913394928, "learning_rate": 0.0002, "epoch": 0.9002716336825767, "step": 1160}, {"loss": 1.7967, "grad_norm": 0.35451310873031616, "learning_rate": 0.0002, "epoch": 0.9080325960419092, "step": 1170}, {"loss": 1.8202, "grad_norm": 0.32705947756767273, "learning_rate": 0.0002, "epoch": 0.9157935584012418, "step": 1180}, {"loss": 1.7396, "grad_norm": 0.3333960771560669, "learning_rate": 0.0002, "epoch": 0.9235545207605743, "step": 1190}, {"loss": 1.7801, "grad_norm": 0.3042232096195221, "learning_rate": 0.0002, "epoch": 0.9313154831199069, "step": 1200}, {"loss": 1.7586, "grad_norm": 0.281553715467453, "learning_rate": 0.0002, "epoch": 0.9390764454792394, "step": 1210}, {"loss": 1.7953, "grad_norm": 0.3096391558647156, "learning_rate": 0.0002, "epoch": 0.946837407838572, "step": 1220}, {"loss": 1.7401, "grad_norm": 0.2866271734237671, "learning_rate": 0.0002, "epoch": 0.9545983701979045, "step": 1230}, {"loss": 1.7211, "grad_norm": 0.28394097089767456, "learning_rate": 0.0002, "epoch": 0.962359332557237, "step": 1240}, {"loss": 1.7363, "grad_norm": 0.3249266743659973, "learning_rate": 0.0002, "epoch": 0.9701202949165697, "step": 1250}, {"loss": 1.7563, "grad_norm": 0.2896869480609894, "learning_rate": 0.0002, "epoch": 0.9778812572759022, "step": 1260}, {"loss": 1.6389, "grad_norm": 0.29224586486816406, "learning_rate": 0.0002, "epoch": 0.9856422196352348, "step": 1270}, {"loss": 1.7111, "grad_norm": 0.2820223569869995, "learning_rate": 0.0002, "epoch": 0.9934031819945673, "step": 1280}, {"eval_loss": 1.8081045150756836, "eval_runtime": 102.3056, "eval_samples_per_second": 4.956, "eval_steps_per_second": 0.626, "epoch": 0.9996119518820333, "step": 1288}, {"loss": 1.7518, "grad_norm": 0.3282551169395447, "learning_rate": 0.0002, "epoch": 1.0011641443538999, "step": 1290}, {"loss": 1.6806, "grad_norm": 0.30217495560646057, "learning_rate": 0.0002, "epoch": 1.0089251067132325, "step": 1300}, {"loss": 1.6777, "grad_norm": 0.30801767110824585, "learning_rate": 0.0002, "epoch": 1.016686069072565, "step": 1310}, {"loss": 1.7756, "grad_norm": 0.31816792488098145, "learning_rate": 0.0002, "epoch": 1.0244470314318976, "step": 1320}, {"loss": 1.6986, "grad_norm": 0.27794334292411804, "learning_rate": 0.0002, "epoch": 1.03220799379123, "step": 1330}, {"loss": 1.6931, "grad_norm": 0.3018926680088043, "learning_rate": 0.0002, "epoch": 1.0399689561505627, "step": 1340}, {"loss": 1.7033, "grad_norm": 0.3552975356578827, "learning_rate": 0.0002, "epoch": 1.0477299185098952, "step": 1350}, {"loss": 1.6782, "grad_norm": 0.32590144872665405, "learning_rate": 0.0002, "epoch": 1.0554908808692278, "step": 1360}, {"loss": 1.6479, "grad_norm": 0.3435460925102234, "learning_rate": 0.0002, "epoch": 1.0632518432285603, "step": 1370}, {"loss": 1.7451, "grad_norm": 0.35037797689437866, "learning_rate": 0.0002, "epoch": 1.071012805587893, "step": 1380}, {"loss": 1.7868, "grad_norm": 0.31398263573646545, "learning_rate": 0.0002, "epoch": 1.0787737679472253, "step": 1390}, {"loss": 1.6729, "grad_norm": 0.3134010434150696, "learning_rate": 0.0002, "epoch": 1.086534730306558, "step": 1400}, {"loss": 1.751, "grad_norm": 0.4599704444408417, "learning_rate": 0.0002, "epoch": 1.0942956926658907, "step": 1410}, {"loss": 1.6871, "grad_norm": 0.35852891206741333, "learning_rate": 0.0002, "epoch": 1.102056655025223, "step": 1420}, {"loss": 1.7083, "grad_norm": 0.35628634691238403, "learning_rate": 0.0002, "epoch": 1.1098176173845558, "step": 1430}, {"loss": 1.6166, "grad_norm": 0.3769161105155945, "learning_rate": 0.0002, "epoch": 1.1175785797438882, "step": 1440}, {"loss": 1.7344, "grad_norm": 1.3712416887283325, "learning_rate": 0.0002, "epoch": 1.1253395421032208, "step": 1450}, {"loss": 1.6542, "grad_norm": 0.38406670093536377, "learning_rate": 0.0002, "epoch": 1.1331005044625533, "step": 1460}, {"loss": 1.7104, "grad_norm": 0.3402116000652313, "learning_rate": 0.0002, "epoch": 1.140861466821886, "step": 1470}, {"loss": 1.7074, "grad_norm": 0.341189444065094, "learning_rate": 0.0002, "epoch": 1.1486224291812184, "step": 1480}, {"loss": 1.6468, "grad_norm": 0.36629995703697205, "learning_rate": 0.0002, "epoch": 1.156383391540551, "step": 1490}, {"loss": 1.6952, "grad_norm": 0.3499569296836853, "learning_rate": 0.0002, "epoch": 1.1641443538998835, "step": 1500}, {"loss": 1.6625, "grad_norm": 0.3663063943386078, "learning_rate": 0.0002, "epoch": 1.1719053162592161, "step": 1510}, {"loss": 1.7533, "grad_norm": 0.34851500391960144, "learning_rate": 0.0002, "epoch": 1.1796662786185488, "step": 1520}, {"loss": 1.6092, "grad_norm": 0.35071656107902527, "learning_rate": 0.0002, "epoch": 1.1874272409778812, "step": 1530}, {"loss": 1.7206, "grad_norm": 0.42783796787261963, "learning_rate": 0.0002, "epoch": 1.1951882033372139, "step": 1540}, {"loss": 1.7499, "grad_norm": 0.31830692291259766, "learning_rate": 0.0002, "epoch": 1.2029491656965463, "step": 1550}, {"loss": 1.7372, "grad_norm": 0.3597424626350403, "learning_rate": 0.0002, "epoch": 1.210710128055879, "step": 1560}, {"loss": 1.6386, "grad_norm": 0.35233765840530396, "learning_rate": 0.0002, "epoch": 1.2184710904152114, "step": 1570}, {"loss": 1.6766, "grad_norm": 0.35942912101745605, "learning_rate": 0.0002, "epoch": 1.226232052774544, "step": 1580}, {"loss": 1.6598, "grad_norm": 0.36159393191337585, "learning_rate": 0.0002, "epoch": 1.2339930151338767, "step": 1590}, {"loss": 1.6697, "grad_norm": 0.3328469693660736, "learning_rate": 0.0002, "epoch": 1.2417539774932091, "step": 1600}, {"loss": 1.7594, "grad_norm": 0.3089476525783539, "learning_rate": 0.0002, "epoch": 1.2495149398525418, "step": 1610}, {"loss": 1.6805, "grad_norm": 0.30947765707969666, "learning_rate": 0.0002, "epoch": 1.2572759022118742, "step": 1620}, {"loss": 1.6899, "grad_norm": 0.32154011726379395, "learning_rate": 0.0002, "epoch": 1.265036864571207, "step": 1630}, {"loss": 1.6621, "grad_norm": 0.3480297923088074, "learning_rate": 0.0002, "epoch": 1.2727978269305393, "step": 1640}, {"loss": 1.7087, "grad_norm": 0.39471694827079773, "learning_rate": 0.0002, "epoch": 1.280558789289872, "step": 1650}, {"loss": 1.7608, "grad_norm": 0.35728853940963745, "learning_rate": 0.0002, "epoch": 1.2883197516492044, "step": 1660}, {"loss": 1.7008, "grad_norm": 0.35223081707954407, "learning_rate": 0.0002, "epoch": 1.296080714008537, "step": 1670}, {"loss": 1.7253, "grad_norm": 0.3588867485523224, "learning_rate": 0.0002, "epoch": 1.3038416763678695, "step": 1680}, {"loss": 1.6505, "grad_norm": 0.3528042733669281, "learning_rate": 0.0002, "epoch": 1.3116026387272022, "step": 1690}, {"loss": 1.6945, "grad_norm": 0.35975801944732666, "learning_rate": 0.0002, "epoch": 1.3193636010865348, "step": 1700}, {"loss": 1.6631, "grad_norm": 0.36691880226135254, "learning_rate": 0.0002, "epoch": 1.3271245634458673, "step": 1710}, {"loss": 1.7593, "grad_norm": 0.3787977695465088, "learning_rate": 0.0002, "epoch": 1.3348855258052, "step": 1720}, {"loss": 1.7697, "grad_norm": 0.36614933609962463, "learning_rate": 0.0002, "epoch": 1.3426464881645324, "step": 1730}, {"loss": 1.6487, "grad_norm": 0.3484745919704437, "learning_rate": 0.0002, "epoch": 1.350407450523865, "step": 1740}, {"loss": 1.7054, "grad_norm": 0.36905673146247864, "learning_rate": 0.0002, "epoch": 1.3581684128831975, "step": 1750}, {"loss": 1.7679, "grad_norm": 0.41564738750457764, "learning_rate": 0.0002, "epoch": 1.36592937524253, "step": 1760}, {"loss": 1.6634, "grad_norm": 0.3345205783843994, "learning_rate": 0.0002, "epoch": 1.3736903376018628, "step": 1770}, {"loss": 1.7275, "grad_norm": 0.34926071763038635, "learning_rate": 0.0002, "epoch": 1.3814512999611952, "step": 1780}, {"loss": 1.685, "grad_norm": 0.42004233598709106, "learning_rate": 0.0002, "epoch": 1.3892122623205276, "step": 1790}, {"loss": 1.666, "grad_norm": 0.3576236963272095, "learning_rate": 0.0002, "epoch": 1.3969732246798603, "step": 1800}, {"loss": 1.8516, "grad_norm": 0.3586704432964325, "learning_rate": 0.0002, "epoch": 1.404734187039193, "step": 1810}, {"loss": 1.6171, "grad_norm": 0.3943439722061157, "learning_rate": 0.0002, "epoch": 1.4124951493985254, "step": 1820}, {"loss": 1.6865, "grad_norm": 0.3484877049922943, "learning_rate": 0.0002, "epoch": 1.420256111757858, "step": 1830}, {"loss": 1.7205, "grad_norm": 0.3344518840312958, "learning_rate": 0.0002, "epoch": 1.4280170741171905, "step": 1840}, {"loss": 1.6999, "grad_norm": 0.4345698356628418, "learning_rate": 0.0002, "epoch": 1.4357780364765231, "step": 1850}, {"loss": 1.6855, "grad_norm": 0.5525162220001221, "learning_rate": 0.0002, "epoch": 1.4435389988358556, "step": 1860}, {"loss": 1.7143, "grad_norm": 0.37194496393203735, "learning_rate": 0.0002, "epoch": 1.4512999611951882, "step": 1870}, {"loss": 1.7623, "grad_norm": 0.34570157527923584, "learning_rate": 0.0002, "epoch": 1.4590609235545209, "step": 1880}, {"loss": 1.7, "grad_norm": 0.3512282073497772, "learning_rate": 0.0002, "epoch": 1.4668218859138533, "step": 1890}, {"loss": 1.7225, "grad_norm": 0.3443922996520996, "learning_rate": 0.0002, "epoch": 1.4745828482731858, "step": 1900}, {"loss": 1.7393, "grad_norm": 0.3812018036842346, "learning_rate": 0.0002, "epoch": 1.4823438106325184, "step": 1910}, {"loss": 1.7277, "grad_norm": 0.39263492822647095, "learning_rate": 0.0002, "epoch": 1.490104772991851, "step": 1920}, {"loss": 1.6829, "grad_norm": 0.3146156072616577, "learning_rate": 0.0002, "epoch": 1.4978657353511835, "step": 1930}, {"loss": 1.6881, "grad_norm": 0.3653988540172577, "learning_rate": 0.0002, "epoch": 1.505626697710516, "step": 1940}, {"loss": 1.7064, "grad_norm": 0.3966596722602844, "learning_rate": 0.0002, "epoch": 1.5133876600698488, "step": 1950}, {"loss": 1.6942, "grad_norm": 0.3441697359085083, "learning_rate": 0.0002, "epoch": 1.5211486224291813, "step": 1960}, {"loss": 1.7175, "grad_norm": 0.3328564465045929, "learning_rate": 0.0002, "epoch": 1.5289095847885137, "step": 1970}, {"loss": 1.7394, "grad_norm": 0.34068772196769714, "learning_rate": 0.0002, "epoch": 1.5366705471478463, "step": 1980}, {"loss": 1.7016, "grad_norm": 0.3559795916080475, "learning_rate": 0.0002, "epoch": 1.544431509507179, "step": 1990}, {"loss": 1.7102, "grad_norm": 0.37888768315315247, "learning_rate": 0.0002, "epoch": 1.5521924718665114, "step": 2000}, {"loss": 1.7094, "grad_norm": 0.36128363013267517, "learning_rate": 0.0002, "epoch": 1.5599534342258439, "step": 2010}, {"loss": 1.6407, "grad_norm": 0.3643714487552643, "learning_rate": 0.0002, "epoch": 1.5677143965851765, "step": 2020}, {"loss": 1.6777, "grad_norm": 0.3863612115383148, "learning_rate": 0.0002, "epoch": 1.5754753589445092, "step": 2030}, {"loss": 1.6575, "grad_norm": 0.32831457257270813, "learning_rate": 0.0002, "epoch": 1.5832363213038416, "step": 2040}, {"loss": 1.7404, "grad_norm": 0.36098113656044006, "learning_rate": 0.0002, "epoch": 1.5909972836631743, "step": 2050}, {"loss": 1.7065, "grad_norm": 1.1079334020614624, "learning_rate": 0.0002, "epoch": 1.598758246022507, "step": 2060}, {"loss": 1.6824, "grad_norm": 0.35615381598472595, "learning_rate": 0.0002, "epoch": 1.6065192083818394, "step": 2070}, {"loss": 1.7262, "grad_norm": 0.369711309671402, "learning_rate": 0.0002, "epoch": 1.6142801707411718, "step": 2080}, {"loss": 1.6995, "grad_norm": 0.390658438205719, "learning_rate": 0.0002, "epoch": 1.6220411331005045, "step": 2090}, {"loss": 1.6996, "grad_norm": 0.3422999382019043, "learning_rate": 0.0002, "epoch": 1.6298020954598371, "step": 2100}, {"loss": 1.7135, "grad_norm": 0.372475266456604, "learning_rate": 0.0002, "epoch": 1.6375630578191696, "step": 2110}, {"loss": 1.7216, "grad_norm": 0.35660576820373535, "learning_rate": 0.0002, "epoch": 1.645324020178502, "step": 2120}, {"loss": 1.6991, "grad_norm": 0.35754942893981934, "learning_rate": 0.0002, "epoch": 1.6530849825378346, "step": 2130}, {"loss": 1.6779, "grad_norm": 0.34572410583496094, "learning_rate": 0.0002, "epoch": 1.6608459448971673, "step": 2140}, {"loss": 1.6707, "grad_norm": 0.42059701681137085, "learning_rate": 0.0002, "epoch": 1.6686069072564997, "step": 2150}, {"loss": 1.6782, "grad_norm": 0.35200759768486023, "learning_rate": 0.0002, "epoch": 1.6763678696158324, "step": 2160}, {"loss": 1.6869, "grad_norm": 0.3704029321670532, "learning_rate": 0.0002, "epoch": 1.684128831975165, "step": 2170}, {"loss": 1.7192, "grad_norm": 0.40450501441955566, "learning_rate": 0.0002, "epoch": 1.6918897943344975, "step": 2180}, {"loss": 1.6228, "grad_norm": 0.362966924905777, "learning_rate": 0.0002, "epoch": 1.69965075669383, "step": 2190}, {"loss": 1.6935, "grad_norm": 0.36586204171180725, "learning_rate": 0.0002, "epoch": 1.7074117190531626, "step": 2200}, {"loss": 1.6088, "grad_norm": 0.3295372426509857, "learning_rate": 0.0002, "epoch": 1.7151726814124952, "step": 2210}, {"loss": 1.7844, "grad_norm": 0.3892575800418854, "learning_rate": 0.0002, "epoch": 1.7229336437718277, "step": 2220}, {"loss": 1.7805, "grad_norm": 0.34712135791778564, "learning_rate": 0.0002, "epoch": 1.73069460613116, "step": 2230}, {"loss": 1.7353, "grad_norm": 0.34801796078681946, "learning_rate": 0.0002, "epoch": 1.738455568490493, "step": 2240}, {"loss": 1.7009, "grad_norm": 0.3822397291660309, "learning_rate": 0.0002, "epoch": 1.7462165308498254, "step": 2250}, {"loss": 1.6546, "grad_norm": 0.38933250308036804, "learning_rate": 0.0002, "epoch": 1.7539774932091579, "step": 2260}, {"loss": 1.7245, "grad_norm": 0.3798373341560364, "learning_rate": 0.0002, "epoch": 1.7617384555684905, "step": 2270}, {"loss": 1.6508, "grad_norm": 0.35151317715644836, "learning_rate": 0.0002, "epoch": 1.7694994179278232, "step": 2280}, {"loss": 1.6894, "grad_norm": 0.44981494545936584, "learning_rate": 0.0002, "epoch": 1.7772603802871556, "step": 2290}, {"loss": 1.7271, "grad_norm": 0.3992624580860138, "learning_rate": 0.0002, "epoch": 1.785021342646488, "step": 2300}, {"loss": 1.7252, "grad_norm": 0.3772512376308441, "learning_rate": 0.0002, "epoch": 1.7927823050058207, "step": 2310}, {"loss": 1.7057, "grad_norm": 0.3511589467525482, "learning_rate": 0.0002, "epoch": 1.8005432673651534, "step": 2320}, {"loss": 1.764, "grad_norm": 0.3805285394191742, "learning_rate": 0.0002, "epoch": 1.8083042297244858, "step": 2330}, {"loss": 1.6986, "grad_norm": 0.3792071044445038, "learning_rate": 0.0002, "epoch": 1.8160651920838184, "step": 2340}, {"loss": 1.7759, "grad_norm": 0.36430829763412476, "learning_rate": 0.0002, "epoch": 1.823826154443151, "step": 2350}, {"loss": 1.6773, "grad_norm": 0.36502477526664734, "learning_rate": 0.0002, "epoch": 1.8315871168024835, "step": 2360}, {"loss": 1.8072, "grad_norm": 0.35015153884887695, "learning_rate": 0.0002, "epoch": 1.839348079161816, "step": 2370}, {"loss": 1.7734, "grad_norm": 0.3710903823375702, "learning_rate": 0.0002, "epoch": 1.8471090415211486, "step": 2380}, {"loss": 1.6737, "grad_norm": 0.3542828857898712, "learning_rate": 0.0002, "epoch": 1.8548700038804813, "step": 2390}, {"loss": 1.6783, "grad_norm": 0.35467568039894104, "learning_rate": 0.0002, "epoch": 1.8626309662398137, "step": 2400}, {"loss": 1.7773, "grad_norm": 0.3638560473918915, "learning_rate": 0.0002, "epoch": 1.8703919285991462, "step": 2410}, {"loss": 1.7019, "grad_norm": 0.3823298215866089, "learning_rate": 0.0002, "epoch": 1.8781528909584788, "step": 2420}, {"loss": 1.6935, "grad_norm": 0.3926416337490082, "learning_rate": 0.0002, "epoch": 1.8859138533178115, "step": 2430}, {"loss": 1.71, "grad_norm": 0.3608079254627228, "learning_rate": 0.0002, "epoch": 1.893674815677144, "step": 2440}, {"loss": 1.6654, "grad_norm": 0.3426613509654999, "learning_rate": 0.0002, "epoch": 1.9014357780364766, "step": 2450}, {"loss": 1.6892, "grad_norm": 0.3522338569164276, "learning_rate": 0.0002, "epoch": 1.9091967403958092, "step": 2460}, {"loss": 1.7307, "grad_norm": 0.3608049154281616, "learning_rate": 0.0002, "epoch": 1.9169577027551417, "step": 2470}, {"loss": 1.6823, "grad_norm": 0.3849755525588989, "learning_rate": 0.0002, "epoch": 1.924718665114474, "step": 2480}, {"loss": 1.7518, "grad_norm": 0.4154011011123657, "learning_rate": 0.0002, "epoch": 1.9324796274738067, "step": 2490}, {"loss": 1.7381, "grad_norm": 0.3602796792984009, "learning_rate": 0.0002, "epoch": 1.9402405898331394, "step": 2500}, {"loss": 1.7843, "grad_norm": 0.3702992796897888, "learning_rate": 0.0002, "epoch": 1.9480015521924718, "step": 2510}, {"loss": 1.6669, "grad_norm": 0.3657735288143158, "learning_rate": 0.0002, "epoch": 1.9557625145518043, "step": 2520}, {"loss": 1.5964, "grad_norm": 0.41031739115715027, "learning_rate": 0.0002, "epoch": 1.963523476911137, "step": 2530}, {"loss": 1.6745, "grad_norm": 0.34578680992126465, "learning_rate": 0.0002, "epoch": 1.9712844392704696, "step": 2540}, {"loss": 1.723, "grad_norm": 0.3361521065235138, "learning_rate": 0.0002, "epoch": 1.979045401629802, "step": 2550}, {"loss": 1.6868, "grad_norm": 0.34342363476753235, "learning_rate": 0.0002, "epoch": 1.9868063639891347, "step": 2560}, {"loss": 1.6577, "grad_norm": 0.32954007387161255, "learning_rate": 0.0002, "epoch": 1.9945673263484673, "step": 2570}, {"eval_loss": 1.8068748712539673, "eval_runtime": 105.5885, "eval_samples_per_second": 4.802, "eval_steps_per_second": 0.606, "epoch": 2.0, "step": 2577}, {"loss": 1.634, "grad_norm": 0.336302250623703, "learning_rate": 0.0002, "epoch": 2.0023282887077998, "step": 2580}, {"loss": 1.612, "grad_norm": 0.3627048432826996, "learning_rate": 0.0002, "epoch": 2.010089251067132, "step": 2590}, {"loss": 1.4908, "grad_norm": 0.38406702876091003, "learning_rate": 0.0002, "epoch": 2.017850213426465, "step": 2600}, {"loss": 1.5368, "grad_norm": 0.5326781272888184, "learning_rate": 0.0002, "epoch": 2.0256111757857975, "step": 2610}, {"loss": 1.5727, "grad_norm": 0.4774554967880249, "learning_rate": 0.0002, "epoch": 2.03337213814513, "step": 2620}, {"loss": 1.5422, "grad_norm": 0.4251810312271118, "learning_rate": 0.0002, "epoch": 2.0411331005044624, "step": 2630}, {"loss": 1.5152, "grad_norm": 0.4693007171154022, "learning_rate": 0.0002, "epoch": 2.0488940628637953, "step": 2640}, {"loss": 1.6137, "grad_norm": 0.46371519565582275, "learning_rate": 0.0002, "epoch": 2.0566550252231277, "step": 2650}, {"loss": 1.6304, "grad_norm": 0.46652570366859436, "learning_rate": 0.0002, "epoch": 2.06441598758246, "step": 2660}, {"loss": 1.6022, "grad_norm": 0.45200315117836, "learning_rate": 0.0002, "epoch": 2.0721769499417926, "step": 2670}, {"loss": 1.5358, "grad_norm": 0.42905205488204956, "learning_rate": 0.0002, "epoch": 2.0799379123011255, "step": 2680}, {"loss": 1.5401, "grad_norm": 0.44509148597717285, "learning_rate": 0.0002, "epoch": 2.087698874660458, "step": 2690}, {"loss": 1.5303, "grad_norm": 0.4445319175720215, "learning_rate": 0.0002, "epoch": 2.0954598370197903, "step": 2700}, {"loss": 1.5701, "grad_norm": 0.46825504302978516, "learning_rate": 0.0002, "epoch": 2.103220799379123, "step": 2710}, {"loss": 1.5751, "grad_norm": 0.4623856842517853, "learning_rate": 0.0002, "epoch": 2.1109817617384556, "step": 2720}, {"loss": 1.5601, "grad_norm": 0.4833452105522156, "learning_rate": 0.0002, "epoch": 2.118742724097788, "step": 2730}, {"loss": 1.5997, "grad_norm": 0.4582686722278595, "learning_rate": 0.0002, "epoch": 2.1265036864571205, "step": 2740}, {"loss": 1.5801, "grad_norm": 0.47587934136390686, "learning_rate": 0.0002, "epoch": 2.1342646488164534, "step": 2750}, {"loss": 1.594, "grad_norm": 0.4602217972278595, "learning_rate": 0.0002, "epoch": 2.142025611175786, "step": 2760}, {"loss": 1.5271, "grad_norm": 0.47501352429389954, "learning_rate": 0.0002, "epoch": 2.1497865735351183, "step": 2770}, {"loss": 1.4862, "grad_norm": 0.5078499913215637, "learning_rate": 0.0002, "epoch": 2.1575475358944507, "step": 2780}, {"loss": 1.6236, "grad_norm": 0.497704416513443, "learning_rate": 0.0002, "epoch": 2.1653084982537836, "step": 2790}, {"loss": 1.5597, "grad_norm": 0.5435971617698669, "learning_rate": 0.0002, "epoch": 2.173069460613116, "step": 2800}, {"loss": 1.5926, "grad_norm": 0.5172356367111206, "learning_rate": 0.0002, "epoch": 2.1808304229724484, "step": 2810}, {"loss": 1.5202, "grad_norm": 0.44063422083854675, "learning_rate": 0.0002, "epoch": 2.1885913853317813, "step": 2820}, {"loss": 1.6041, "grad_norm": 0.5079569220542908, "learning_rate": 0.0002, "epoch": 2.1963523476911138, "step": 2830}, {"loss": 1.5915, "grad_norm": 0.45658132433891296, "learning_rate": 0.0002, "epoch": 2.204113310050446, "step": 2840}, {"loss": 1.5546, "grad_norm": 0.5103023648262024, "learning_rate": 0.0002, "epoch": 2.2118742724097786, "step": 2850}, {"loss": 1.6197, "grad_norm": 0.4882226288318634, "learning_rate": 0.0002, "epoch": 2.2196352347691115, "step": 2860}, {"loss": 1.5996, "grad_norm": 0.5087296962738037, "learning_rate": 0.0002, "epoch": 2.227396197128444, "step": 2870}, {"loss": 1.5451, "grad_norm": 0.45293712615966797, "learning_rate": 0.0002, "epoch": 2.2351571594877764, "step": 2880}, {"loss": 1.6214, "grad_norm": 0.5120379328727722, "learning_rate": 0.0002, "epoch": 2.242918121847109, "step": 2890}, {"loss": 1.5273, "grad_norm": 0.47126415371894836, "learning_rate": 0.0002, "epoch": 2.2506790842064417, "step": 2900}, {"loss": 1.612, "grad_norm": 0.44005846977233887, "learning_rate": 0.0002, "epoch": 2.258440046565774, "step": 2910}, {"loss": 1.6023, "grad_norm": 0.46476176381111145, "learning_rate": 0.0002, "epoch": 2.2662010089251066, "step": 2920}, {"loss": 1.6417, "grad_norm": 0.48051515221595764, "learning_rate": 0.0002, "epoch": 2.2739619712844394, "step": 2930}, {"loss": 1.587, "grad_norm": 0.480069637298584, "learning_rate": 0.0002, "epoch": 2.281722933643772, "step": 2940}, {"loss": 1.5747, "grad_norm": 0.5122102499008179, "learning_rate": 0.0002, "epoch": 2.2894838960031043, "step": 2950}, {"loss": 1.5183, "grad_norm": 0.48879891633987427, "learning_rate": 0.0002, "epoch": 2.2972448583624367, "step": 2960}, {"loss": 1.5483, "grad_norm": 0.4973136782646179, "learning_rate": 0.0002, "epoch": 2.3050058207217696, "step": 2970}, {"loss": 1.677, "grad_norm": 0.5522695183753967, "learning_rate": 0.0002, "epoch": 2.312766783081102, "step": 2980}, {"loss": 1.5946, "grad_norm": 0.5220217704772949, "learning_rate": 0.0002, "epoch": 2.3205277454404345, "step": 2990}, {"loss": 1.6299, "grad_norm": 0.4978662431240082, "learning_rate": 0.0002, "epoch": 2.328288707799767, "step": 3000}, {"loss": 1.5498, "grad_norm": 0.554053544998169, "learning_rate": 0.0002, "epoch": 2.3360496701591, "step": 3010}, {"loss": 1.5356, "grad_norm": 0.4703886806964874, "learning_rate": 0.0002, "epoch": 2.3438106325184322, "step": 3020}, {"loss": 1.5418, "grad_norm": 0.5074123740196228, "learning_rate": 0.0002, "epoch": 2.3515715948777647, "step": 3030}, {"loss": 1.6873, "grad_norm": 0.5088278651237488, "learning_rate": 0.0002, "epoch": 2.3593325572370976, "step": 3040}, {"loss": 1.5249, "grad_norm": 0.4752114415168762, "learning_rate": 0.0002, "epoch": 2.36709351959643, "step": 3050}, {"loss": 1.5353, "grad_norm": 0.5121659636497498, "learning_rate": 0.0002, "epoch": 2.3748544819557624, "step": 3060}, {"loss": 1.6426, "grad_norm": 0.48649218678474426, "learning_rate": 0.0002, "epoch": 2.3826154443150953, "step": 3070}, {"loss": 1.6136, "grad_norm": 0.5209488868713379, "learning_rate": 0.0002, "epoch": 2.3903764066744277, "step": 3080}, {"loss": 1.597, "grad_norm": 0.5110517740249634, "learning_rate": 0.0002, "epoch": 2.39813736903376, "step": 3090}, {"loss": 1.5773, "grad_norm": 0.5609337091445923, "learning_rate": 0.0002, "epoch": 2.4058983313930926, "step": 3100}, {"loss": 1.5438, "grad_norm": 0.5191826224327087, "learning_rate": 0.0002, "epoch": 2.4136592937524255, "step": 3110}, {"loss": 1.6347, "grad_norm": 0.4876069724559784, "learning_rate": 0.0002, "epoch": 2.421420256111758, "step": 3120}, {"loss": 1.5565, "grad_norm": 0.4713933765888214, "learning_rate": 0.0002, "epoch": 2.4291812184710904, "step": 3130}, {"loss": 1.6388, "grad_norm": 0.5102227330207825, "learning_rate": 0.0002, "epoch": 2.436942180830423, "step": 3140}, {"loss": 1.5667, "grad_norm": 0.44546666741371155, "learning_rate": 0.0002, "epoch": 2.4447031431897557, "step": 3150}, {"loss": 1.5973, "grad_norm": 0.5167558193206787, "learning_rate": 0.0002, "epoch": 2.452464105549088, "step": 3160}, {"loss": 1.5673, "grad_norm": 0.5226958990097046, "learning_rate": 0.0002, "epoch": 2.4602250679084205, "step": 3170}, {"loss": 1.5758, "grad_norm": 0.4751799702644348, "learning_rate": 0.0002, "epoch": 2.4679860302677534, "step": 3180}, {"loss": 1.6234, "grad_norm": 0.4744729697704315, "learning_rate": 0.0002, "epoch": 2.475746992627086, "step": 3190}, {"loss": 1.5661, "grad_norm": 0.5203230381011963, "learning_rate": 0.0002, "epoch": 2.4835079549864183, "step": 3200}, {"loss": 1.493, "grad_norm": 0.47209781408309937, "learning_rate": 0.0002, "epoch": 2.4912689173457507, "step": 3210}, {"loss": 1.6415, "grad_norm": 0.5241674780845642, "learning_rate": 0.0002, "epoch": 2.4990298797050836, "step": 3220}, {"loss": 1.6324, "grad_norm": 0.5152244567871094, "learning_rate": 0.0002, "epoch": 2.506790842064416, "step": 3230}, {"loss": 1.6248, "grad_norm": 0.5216741561889648, "learning_rate": 0.0002, "epoch": 2.5145518044237485, "step": 3240}, {"loss": 1.5668, "grad_norm": 0.4953259527683258, "learning_rate": 0.0002, "epoch": 2.522312766783081, "step": 3250}, {"loss": 1.666, "grad_norm": 0.5973829030990601, "learning_rate": 0.0002, "epoch": 2.530073729142414, "step": 3260}, {"loss": 1.5295, "grad_norm": 0.48804202675819397, "learning_rate": 0.0002, "epoch": 2.5378346915017462, "step": 3270}, {"loss": 1.4954, "grad_norm": 0.5334644317626953, "learning_rate": 0.0002, "epoch": 2.5455956538610787, "step": 3280}, {"loss": 1.5814, "grad_norm": 0.46873313188552856, "learning_rate": 0.0002, "epoch": 2.5533566162204115, "step": 3290}, {"loss": 1.5362, "grad_norm": 0.4282589554786682, "learning_rate": 0.0002, "epoch": 2.561117578579744, "step": 3300}, {"loss": 1.6278, "grad_norm": 0.4848293960094452, "learning_rate": 0.0002, "epoch": 2.5688785409390764, "step": 3310}, {"loss": 1.6308, "grad_norm": 0.5093745589256287, "learning_rate": 0.0002, "epoch": 2.576639503298409, "step": 3320}, {"loss": 1.6375, "grad_norm": 0.5084842443466187, "learning_rate": 0.0002, "epoch": 2.5844004656577413, "step": 3330}, {"loss": 1.6168, "grad_norm": 0.4696281850337982, "learning_rate": 0.0002, "epoch": 2.592161428017074, "step": 3340}, {"loss": 1.5359, "grad_norm": 0.5767765641212463, "learning_rate": 0.0002, "epoch": 2.5999223903764066, "step": 3350}, {"loss": 1.6097, "grad_norm": 0.47300875186920166, "learning_rate": 0.0002, "epoch": 2.607683352735739, "step": 3360}, {"loss": 1.6138, "grad_norm": 0.4809158146381378, "learning_rate": 0.0002, "epoch": 2.615444315095072, "step": 3370}, {"loss": 1.4952, "grad_norm": 0.5141063928604126, "learning_rate": 0.0002, "epoch": 2.6232052774544043, "step": 3380}, {"loss": 1.5784, "grad_norm": 0.4832935035228729, "learning_rate": 0.0002, "epoch": 2.630966239813737, "step": 3390}, {"loss": 1.5796, "grad_norm": 0.5044625401496887, "learning_rate": 0.0002, "epoch": 2.6387272021730697, "step": 3400}, {"loss": 1.6202, "grad_norm": 0.5287680625915527, "learning_rate": 0.0002, "epoch": 2.646488164532402, "step": 3410}, {"loss": 1.5423, "grad_norm": 0.5306379795074463, "learning_rate": 0.0002, "epoch": 2.6542491268917345, "step": 3420}, {"loss": 1.5264, "grad_norm": 0.5849291682243347, "learning_rate": 0.0002, "epoch": 2.662010089251067, "step": 3430}, {"loss": 1.5937, "grad_norm": 0.7951080799102783, "learning_rate": 0.0002, "epoch": 2.6697710516104, "step": 3440}, {"loss": 1.5791, "grad_norm": 0.48087653517723083, "learning_rate": 0.0002, "epoch": 2.6775320139697323, "step": 3450}, {"loss": 1.6769, "grad_norm": 0.5396431684494019, "learning_rate": 0.0002, "epoch": 2.6852929763290647, "step": 3460}, {"loss": 1.606, "grad_norm": 0.5481634736061096, "learning_rate": 0.0002, "epoch": 2.693053938688397, "step": 3470}, {"loss": 1.6436, "grad_norm": 0.5068731307983398, "learning_rate": 0.0002, "epoch": 2.70081490104773, "step": 3480}, {"loss": 1.5738, "grad_norm": 0.5759826898574829, "learning_rate": 0.0002, "epoch": 2.7085758634070625, "step": 3490}, {"loss": 1.596, "grad_norm": 0.7253932952880859, "learning_rate": 0.0002, "epoch": 2.716336825766395, "step": 3500}, {"loss": 1.5791, "grad_norm": 0.527745246887207, "learning_rate": 0.0002, "epoch": 2.724097788125728, "step": 3510}, {"loss": 1.5874, "grad_norm": 0.5279242396354675, "learning_rate": 0.0002, "epoch": 2.73185875048506, "step": 3520}, {"loss": 1.6768, "grad_norm": 0.5047839283943176, "learning_rate": 0.0002, "epoch": 2.7396197128443927, "step": 3530}, {"loss": 1.5517, "grad_norm": 0.5430883169174194, "learning_rate": 0.0002, "epoch": 2.7473806752037255, "step": 3540}, {"loss": 1.5624, "grad_norm": 0.4496723711490631, "learning_rate": 0.0002, "epoch": 2.755141637563058, "step": 3550}, {"loss": 1.5789, "grad_norm": 0.5063338875770569, "learning_rate": 0.0002, "epoch": 2.7629025999223904, "step": 3560}, {"loss": 1.52, "grad_norm": 0.4619026780128479, "learning_rate": 0.0002, "epoch": 2.770663562281723, "step": 3570}, {"loss": 1.5793, "grad_norm": 0.4753304123878479, "learning_rate": 0.0002, "epoch": 2.7784245246410553, "step": 3580}, {"loss": 1.5715, "grad_norm": 0.5422708988189697, "learning_rate": 0.0002, "epoch": 2.786185487000388, "step": 3590}, {"loss": 1.5926, "grad_norm": 0.4756578803062439, "learning_rate": 0.0002, "epoch": 2.7939464493597206, "step": 3600}, {"loss": 1.5358, "grad_norm": 0.5057567358016968, "learning_rate": 0.0002, "epoch": 2.801707411719053, "step": 3610}, {"loss": 1.6131, "grad_norm": 0.5410919785499573, "learning_rate": 0.0002, "epoch": 2.809468374078386, "step": 3620}, {"loss": 1.5573, "grad_norm": 0.4958136975765228, "learning_rate": 0.0002, "epoch": 2.8172293364377183, "step": 3630}, {"loss": 1.6324, "grad_norm": 0.454527348279953, "learning_rate": 0.0002, "epoch": 2.8249902987970508, "step": 3640}, {"loss": 1.5582, "grad_norm": 0.5092706084251404, "learning_rate": 0.0002, "epoch": 2.8327512611563836, "step": 3650}, {"loss": 1.5893, "grad_norm": 0.5314022302627563, "learning_rate": 0.0002, "epoch": 2.840512223515716, "step": 3660}, {"loss": 1.588, "grad_norm": 0.5028239488601685, "learning_rate": 0.0002, "epoch": 2.8482731858750485, "step": 3670}, {"loss": 1.5751, "grad_norm": 0.5127444863319397, "learning_rate": 0.0002, "epoch": 2.856034148234381, "step": 3680}, {"loss": 1.6018, "grad_norm": 0.5045645236968994, "learning_rate": 0.0002, "epoch": 2.8637951105937134, "step": 3690}, {"loss": 1.5788, "grad_norm": 0.5560781955718994, "learning_rate": 0.0002, "epoch": 2.8715560729530463, "step": 3700}, {"loss": 1.5988, "grad_norm": 0.5177600383758545, "learning_rate": 0.0002, "epoch": 2.8793170353123787, "step": 3710}, {"loss": 1.6009, "grad_norm": 0.45830899477005005, "learning_rate": 0.0002, "epoch": 2.887077997671711, "step": 3720}, {"loss": 1.6344, "grad_norm": 0.4828629195690155, "learning_rate": 0.0002, "epoch": 2.894838960031044, "step": 3730}, {"loss": 1.6758, "grad_norm": 0.48241183161735535, "learning_rate": 0.0002, "epoch": 2.9025999223903765, "step": 3740}, {"loss": 1.5649, "grad_norm": 0.4909592568874359, "learning_rate": 0.0002, "epoch": 2.910360884749709, "step": 3750}, {"loss": 1.4927, "grad_norm": 0.44677025079727173, "learning_rate": 0.0002, "epoch": 2.9181218471090418, "step": 3760}, {"loss": 1.5067, "grad_norm": 0.4928834140300751, "learning_rate": 0.0002, "epoch": 2.925882809468374, "step": 3770}, {"loss": 1.5843, "grad_norm": 0.5673553347587585, "learning_rate": 0.0002, "epoch": 2.9336437718277066, "step": 3780}, {"loss": 1.5566, "grad_norm": 0.548190712928772, "learning_rate": 0.0002, "epoch": 2.941404734187039, "step": 3790}, {"loss": 1.5892, "grad_norm": 0.48979803919792175, "learning_rate": 0.0002, "epoch": 2.9491656965463715, "step": 3800}, {"loss": 1.5589, "grad_norm": 0.533191978931427, "learning_rate": 0.0002, "epoch": 2.9569266589057044, "step": 3810}, {"loss": 1.584, "grad_norm": 0.5362946391105652, "learning_rate": 0.0002, "epoch": 2.964687621265037, "step": 3820}, {"loss": 1.6602, "grad_norm": 0.4724906384944916, "learning_rate": 0.0002, "epoch": 2.9724485836243693, "step": 3830}, {"loss": 1.5834, "grad_norm": 0.5468461513519287, "learning_rate": 0.0002, "epoch": 2.980209545983702, "step": 3840}, {"loss": 1.6316, "grad_norm": 0.4697108864784241, "learning_rate": 0.0002, "epoch": 2.9879705083430346, "step": 3850}, {"loss": 1.6312, "grad_norm": 0.4780906140804291, "learning_rate": 0.0002, "epoch": 2.995731470702367, "step": 3860}, {"eval_loss": 1.8472607135772705, "eval_runtime": 106.5541, "eval_samples_per_second": 4.758, "eval_steps_per_second": 0.601, "epoch": 2.9996119518820334, "step": 3865}, {"loss": 1.4983, "grad_norm": 0.5645653605461121, "learning_rate": 0.0002, "epoch": 3.0034924330616994, "step": 3870}, {"loss": 1.4334, "grad_norm": 0.6457151174545288, "learning_rate": 0.0002, "epoch": 3.0112533954210323, "step": 3880}, {"loss": 1.3899, "grad_norm": 0.583838164806366, "learning_rate": 0.0002, "epoch": 3.0190143577803648, "step": 3890}, {"loss": 1.3258, "grad_norm": 0.6819260120391846, "learning_rate": 0.0002, "epoch": 3.026775320139697, "step": 3900}, {"loss": 1.3458, "grad_norm": 0.6692903637886047, "learning_rate": 0.0002, "epoch": 3.03453628249903, "step": 3910}, {"loss": 1.4356, "grad_norm": 0.6101024746894836, "learning_rate": 0.0002, "epoch": 3.0422972448583625, "step": 3920}, {"loss": 1.394, "grad_norm": 0.7014093399047852, "learning_rate": 0.0002, "epoch": 3.050058207217695, "step": 3930}, {"loss": 1.3885, "grad_norm": 0.7380381226539612, "learning_rate": 0.0002, "epoch": 3.0578191695770274, "step": 3940}, {"loss": 1.4206, "grad_norm": 0.6607900857925415, "learning_rate": 0.0002, "epoch": 3.0655801319363603, "step": 3950}, {"loss": 1.4293, "grad_norm": 0.735263466835022, "learning_rate": 0.0002, "epoch": 3.0733410942956927, "step": 3960}, {"loss": 1.3966, "grad_norm": 0.6788513660430908, "learning_rate": 0.0002, "epoch": 3.081102056655025, "step": 3970}, {"loss": 1.3435, "grad_norm": 0.6347652673721313, "learning_rate": 0.0002, "epoch": 3.088863019014358, "step": 3980}, {"loss": 1.4518, "grad_norm": 0.7056642770767212, "learning_rate": 0.0002, "epoch": 3.0966239813736904, "step": 3990}, {"loss": 1.4474, "grad_norm": 0.6387075185775757, "learning_rate": 0.0002, "epoch": 3.104384943733023, "step": 4000}, {"loss": 1.3833, "grad_norm": 0.6701116561889648, "learning_rate": 0.0002, "epoch": 3.1121459060923553, "step": 4010}, {"loss": 1.404, "grad_norm": 0.7558449506759644, "learning_rate": 0.0002, "epoch": 3.119906868451688, "step": 4020}, {"loss": 1.3294, "grad_norm": 0.6612881422042847, "learning_rate": 0.0002, "epoch": 3.1276678308110206, "step": 4030}, {"loss": 1.439, "grad_norm": 0.7474587559700012, "learning_rate": 0.0002, "epoch": 3.135428793170353, "step": 4040}, {"loss": 1.4616, "grad_norm": 0.7292373776435852, "learning_rate": 0.0002, "epoch": 3.1431897555296855, "step": 4050}, {"loss": 1.3908, "grad_norm": 0.7432886958122253, "learning_rate": 0.0002, "epoch": 3.1509507178890184, "step": 4060}, {"loss": 1.4214, "grad_norm": 0.6366098523139954, "learning_rate": 0.0002, "epoch": 3.158711680248351, "step": 4070}, {"loss": 1.5044, "grad_norm": 0.6837611794471741, "learning_rate": 0.0002, "epoch": 3.1664726426076832, "step": 4080}, {"loss": 1.4332, "grad_norm": 0.7194393277168274, "learning_rate": 0.0002, "epoch": 3.174233604967016, "step": 4090}, {"loss": 1.3628, "grad_norm": 0.6963607668876648, "learning_rate": 0.0002, "epoch": 3.1819945673263486, "step": 4100}, {"loss": 1.4127, "grad_norm": 0.6404902935028076, "learning_rate": 0.0002, "epoch": 3.189755529685681, "step": 4110}, {"loss": 1.4394, "grad_norm": 0.7172070741653442, "learning_rate": 0.0002, "epoch": 3.1975164920450134, "step": 4120}, {"loss": 1.4658, "grad_norm": 0.6577759385108948, "learning_rate": 0.0002, "epoch": 3.2052774544043463, "step": 4130}, {"loss": 1.4019, "grad_norm": 0.6658480167388916, "learning_rate": 0.0002, "epoch": 3.2130384167636787, "step": 4140}, {"loss": 1.4348, "grad_norm": 0.6771699786186218, "learning_rate": 0.0002, "epoch": 3.220799379123011, "step": 4150}, {"loss": 1.4736, "grad_norm": 0.699035108089447, "learning_rate": 0.0002, "epoch": 3.2285603414823436, "step": 4160}, {"loss": 1.4096, "grad_norm": 0.7218514680862427, "learning_rate": 0.0002, "epoch": 3.2363213038416765, "step": 4170}, {"loss": 1.3637, "grad_norm": 0.6270631551742554, "learning_rate": 0.0002, "epoch": 3.244082266201009, "step": 4180}, {"loss": 1.4076, "grad_norm": 0.6828921437263489, "learning_rate": 0.0002, "epoch": 3.2518432285603414, "step": 4190}, {"loss": 1.4663, "grad_norm": 0.6005498170852661, "learning_rate": 0.0002, "epoch": 3.2596041909196742, "step": 4200}, {"loss": 1.4798, "grad_norm": 0.6974790692329407, "learning_rate": 0.0002, "epoch": 3.2673651532790067, "step": 4210}, {"loss": 1.5012, "grad_norm": 0.7269543409347534, "learning_rate": 0.0002, "epoch": 3.275126115638339, "step": 4220}, {"loss": 1.3848, "grad_norm": 0.6728787422180176, "learning_rate": 0.0002, "epoch": 3.2828870779976715, "step": 4230}, {"loss": 1.4112, "grad_norm": 0.676972508430481, "learning_rate": 0.0002, "epoch": 3.2906480403570044, "step": 4240}, {"loss": 1.4206, "grad_norm": 0.748309314250946, "learning_rate": 0.0002, "epoch": 3.298409002716337, "step": 4250}, {"loss": 1.4973, "grad_norm": 0.6976589560508728, "learning_rate": 0.0002, "epoch": 3.3061699650756693, "step": 4260}, {"loss": 1.3967, "grad_norm": 0.649780809879303, "learning_rate": 0.0002, "epoch": 3.3139309274350017, "step": 4270}, {"loss": 1.327, "grad_norm": 0.6529902815818787, "learning_rate": 0.0002, "epoch": 3.3216918897943346, "step": 4280}, {"loss": 1.4888, "grad_norm": 0.9273163676261902, "learning_rate": 0.0002, "epoch": 3.329452852153667, "step": 4290}, {"loss": 1.4859, "grad_norm": 0.717024028301239, "learning_rate": 0.0002, "epoch": 3.3372138145129995, "step": 4300}, {"loss": 1.4441, "grad_norm": 0.7914950251579285, "learning_rate": 0.0002, "epoch": 3.3449747768723324, "step": 4310}, {"loss": 1.432, "grad_norm": 0.7133203148841858, "learning_rate": 0.0002, "epoch": 3.352735739231665, "step": 4320}, {"loss": 1.4662, "grad_norm": 0.7409568428993225, "learning_rate": 0.0002, "epoch": 3.3604967015909972, "step": 4330}, {"loss": 1.3992, "grad_norm": 0.6993981003761292, "learning_rate": 0.0002, "epoch": 3.3682576639503297, "step": 4340}, {"loss": 1.4261, "grad_norm": 0.7114535570144653, "learning_rate": 0.0002, "epoch": 3.3760186263096625, "step": 4350}, {"loss": 1.4227, "grad_norm": 0.6790860295295715, "learning_rate": 0.0002, "epoch": 3.383779588668995, "step": 4360}, {"loss": 1.4128, "grad_norm": 0.6507849097251892, "learning_rate": 0.0002, "epoch": 3.3915405510283274, "step": 4370}, {"loss": 1.4559, "grad_norm": 0.5967804193496704, "learning_rate": 0.0002, "epoch": 3.39930151338766, "step": 4380}, {"loss": 1.3687, "grad_norm": 0.6625847816467285, "learning_rate": 0.0002, "epoch": 3.4070624757469927, "step": 4390}, {"loss": 1.4193, "grad_norm": 0.6736508011817932, "learning_rate": 0.0002, "epoch": 3.414823438106325, "step": 4400}, {"loss": 1.4363, "grad_norm": 0.7870860695838928, "learning_rate": 0.0002, "epoch": 3.4225844004656576, "step": 4410}, {"loss": 1.4114, "grad_norm": 0.7205295562744141, "learning_rate": 0.0002, "epoch": 3.4303453628249905, "step": 4420}, {"loss": 1.4131, "grad_norm": 0.6634634137153625, "learning_rate": 0.0002, "epoch": 3.438106325184323, "step": 4430}, {"loss": 1.4683, "grad_norm": 0.7562733292579651, "learning_rate": 0.0002, "epoch": 3.4458672875436553, "step": 4440}, {"loss": 1.3486, "grad_norm": 0.6585879921913147, "learning_rate": 0.0002, "epoch": 3.453628249902988, "step": 4450}, {"loss": 1.4283, "grad_norm": 0.6896792054176331, "learning_rate": 0.0002, "epoch": 3.4613892122623207, "step": 4460}, {"loss": 1.4208, "grad_norm": 0.6520342230796814, "learning_rate": 0.0002, "epoch": 3.469150174621653, "step": 4470}, {"loss": 1.3423, "grad_norm": 0.6760806441307068, "learning_rate": 0.0002, "epoch": 3.4769111369809855, "step": 4480}, {"loss": 1.4398, "grad_norm": 0.7539774179458618, "learning_rate": 0.0002, "epoch": 3.484672099340318, "step": 4490}, {"loss": 1.4534, "grad_norm": 0.7409411668777466, "learning_rate": 0.0002, "epoch": 3.492433061699651, "step": 4500}, {"loss": 1.4069, "grad_norm": 0.6876253485679626, "learning_rate": 0.0002, "epoch": 3.5001940240589833, "step": 4510}, {"loss": 1.4228, "grad_norm": 0.7028461694717407, "learning_rate": 0.0002, "epoch": 3.5079549864183157, "step": 4520}, {"loss": 1.4723, "grad_norm": 0.8056529760360718, "learning_rate": 0.0002, "epoch": 3.5157159487776486, "step": 4530}, {"loss": 1.4148, "grad_norm": 0.711338996887207, "learning_rate": 0.0002, "epoch": 3.523476911136981, "step": 4540}, {"loss": 1.5247, "grad_norm": 0.7343552708625793, "learning_rate": 0.0002, "epoch": 3.5312378734963135, "step": 4550}, {"loss": 1.4308, "grad_norm": 0.745479941368103, "learning_rate": 0.0002, "epoch": 3.5389988358556463, "step": 4560}, {"loss": 1.4229, "grad_norm": 0.7582294940948486, "learning_rate": 0.0002, "epoch": 3.5467597982149788, "step": 4570}, {"loss": 1.4127, "grad_norm": 0.6717444658279419, "learning_rate": 0.0002, "epoch": 3.554520760574311, "step": 4580}, {"loss": 1.4368, "grad_norm": 0.7417883276939392, "learning_rate": 0.0002, "epoch": 3.5622817229336436, "step": 4590}, {"loss": 1.4176, "grad_norm": 0.6385737061500549, "learning_rate": 0.0002, "epoch": 3.570042685292976, "step": 4600}, {"loss": 1.3981, "grad_norm": 0.716704249382019, "learning_rate": 0.0002, "epoch": 3.577803647652309, "step": 4610}, {"loss": 1.3889, "grad_norm": 0.6948980093002319, "learning_rate": 0.0002, "epoch": 3.5855646100116414, "step": 4620}, {"loss": 1.5177, "grad_norm": 0.6961140036582947, "learning_rate": 0.0002, "epoch": 3.593325572370974, "step": 4630}, {"loss": 1.4508, "grad_norm": 0.7493122220039368, "learning_rate": 0.0002, "epoch": 3.6010865347303067, "step": 4640}, {"loss": 1.3987, "grad_norm": 0.7431658506393433, "learning_rate": 0.0002, "epoch": 3.608847497089639, "step": 4650}, {"loss": 1.4551, "grad_norm": 0.8353387713432312, "learning_rate": 0.0002, "epoch": 3.6166084594489716, "step": 4660}, {"loss": 1.4533, "grad_norm": 0.7095612287521362, "learning_rate": 0.0002, "epoch": 3.6243694218083045, "step": 4670}, {"loss": 1.4003, "grad_norm": 0.776620090007782, "learning_rate": 0.0002, "epoch": 3.632130384167637, "step": 4680}, {"loss": 1.4361, "grad_norm": 0.7198925018310547, "learning_rate": 0.0002, "epoch": 3.6398913465269693, "step": 4690}, {"loss": 1.4543, "grad_norm": 0.8238834738731384, "learning_rate": 0.0002, "epoch": 3.6476523088863018, "step": 4700}, {"loss": 1.3958, "grad_norm": 0.6804245710372925, "learning_rate": 0.0002, "epoch": 3.655413271245634, "step": 4710}, {"loss": 1.4158, "grad_norm": 0.8444845676422119, "learning_rate": 0.0002, "epoch": 3.663174233604967, "step": 4720}, {"loss": 1.3825, "grad_norm": 0.743797779083252, "learning_rate": 0.0002, "epoch": 3.6709351959642995, "step": 4730}, {"loss": 1.4213, "grad_norm": 0.8994188904762268, "learning_rate": 0.0002, "epoch": 3.678696158323632, "step": 4740}, {"loss": 1.4281, "grad_norm": 0.75416100025177, "learning_rate": 0.0002, "epoch": 3.686457120682965, "step": 4750}, {"loss": 1.4154, "grad_norm": 0.6499266028404236, "learning_rate": 0.0002, "epoch": 3.6942180830422973, "step": 4760}, {"loss": 1.4005, "grad_norm": 0.7246791124343872, "learning_rate": 0.0002, "epoch": 3.7019790454016297, "step": 4770}, {"loss": 1.426, "grad_norm": 0.7831124067306519, "learning_rate": 0.0002, "epoch": 3.7097400077609626, "step": 4780}, {"loss": 1.3933, "grad_norm": 0.7130028009414673, "learning_rate": 0.0002, "epoch": 3.717500970120295, "step": 4790}, {"loss": 1.4632, "grad_norm": 0.7501602172851562, "learning_rate": 0.0002, "epoch": 3.7252619324796274, "step": 4800}, {"loss": 1.4985, "grad_norm": 0.6980932950973511, "learning_rate": 0.0002, "epoch": 3.73302289483896, "step": 4810}, {"loss": 1.4517, "grad_norm": 0.8050530552864075, "learning_rate": 0.0002, "epoch": 3.7407838571982923, "step": 4820}, {"loss": 1.4703, "grad_norm": 0.6385579705238342, "learning_rate": 0.0002, "epoch": 3.748544819557625, "step": 4830}, {"loss": 1.5281, "grad_norm": 0.6664714813232422, "learning_rate": 0.0002, "epoch": 3.7563057819169576, "step": 4840}, {"loss": 1.4443, "grad_norm": 0.7125676274299622, "learning_rate": 0.0002, "epoch": 3.76406674427629, "step": 4850}, {"loss": 1.3958, "grad_norm": 0.7231866717338562, "learning_rate": 0.0002, "epoch": 3.771827706635623, "step": 4860}, {"loss": 1.4446, "grad_norm": 0.6917183995246887, "learning_rate": 0.0002, "epoch": 3.7795886689949554, "step": 4870}, {"loss": 1.4369, "grad_norm": 0.665037989616394, "learning_rate": 0.0002, "epoch": 3.787349631354288, "step": 4880}, {"loss": 1.4193, "grad_norm": 0.5837726593017578, "learning_rate": 0.0002, "epoch": 3.7951105937136207, "step": 4890}, {"loss": 1.4176, "grad_norm": 0.6366701722145081, "learning_rate": 0.0002, "epoch": 3.802871556072953, "step": 4900}, {"loss": 1.46, "grad_norm": 0.7082223892211914, "learning_rate": 0.0002, "epoch": 3.8106325184322856, "step": 4910}, {"loss": 1.5139, "grad_norm": 0.8101672530174255, "learning_rate": 0.0002, "epoch": 3.818393480791618, "step": 4920}, {"loss": 1.3659, "grad_norm": 0.7516148090362549, "learning_rate": 0.0002, "epoch": 3.826154443150951, "step": 4930}, {"loss": 1.3909, "grad_norm": 0.7928489446640015, "learning_rate": 0.0002, "epoch": 3.8339154055102833, "step": 4940}, {"loss": 1.4255, "grad_norm": 0.6892234683036804, "learning_rate": 0.0002, "epoch": 3.8416763678696157, "step": 4950}, {"loss": 1.5024, "grad_norm": 0.6381304264068604, "learning_rate": 0.0002, "epoch": 3.849437330228948, "step": 4960}, {"loss": 1.4873, "grad_norm": 0.8068831562995911, "learning_rate": 0.0002, "epoch": 3.857198292588281, "step": 4970}, {"loss": 1.45, "grad_norm": 0.7289869785308838, "learning_rate": 0.0002, "epoch": 3.8649592549476135, "step": 4980}, {"loss": 1.398, "grad_norm": 0.7278549075126648, "learning_rate": 0.0002, "epoch": 3.872720217306946, "step": 4990}, {"loss": 1.4442, "grad_norm": 0.7324236631393433, "learning_rate": 0.0002, "epoch": 3.880481179666279, "step": 5000}, {"loss": 1.4511, "grad_norm": 0.6759871244430542, "learning_rate": 0.0002, "epoch": 3.8882421420256112, "step": 5010}, {"loss": 1.4705, "grad_norm": 0.8159207701683044, "learning_rate": 0.0002, "epoch": 3.8960031043849437, "step": 5020}, {"loss": 1.4685, "grad_norm": 0.6536211967468262, "learning_rate": 0.0002, "epoch": 3.9037640667442766, "step": 5030}, {"loss": 1.4335, "grad_norm": 0.6827932000160217, "learning_rate": 0.0002, "epoch": 3.911525029103609, "step": 5040}, {"loss": 1.433, "grad_norm": 0.6688340306282043, "learning_rate": 0.0002, "epoch": 3.9192859914629414, "step": 5050}, {"loss": 1.4099, "grad_norm": 0.6385695934295654, "learning_rate": 0.0002, "epoch": 3.927046953822274, "step": 5060}, {"loss": 1.4767, "grad_norm": 0.6975107192993164, "learning_rate": 0.0002, "epoch": 3.9348079161816063, "step": 5070}, {"loss": 1.4893, "grad_norm": 0.6684112548828125, "learning_rate": 0.0002, "epoch": 3.942568878540939, "step": 5080}, {"loss": 1.4732, "grad_norm": 0.8349628448486328, "learning_rate": 0.0002, "epoch": 3.9503298409002716, "step": 5090}, {"loss": 1.5131, "grad_norm": 0.7146425843238831, "learning_rate": 0.0002, "epoch": 3.958090803259604, "step": 5100}, {"loss": 1.4149, "grad_norm": 0.6555036902427673, "learning_rate": 0.0002, "epoch": 3.965851765618937, "step": 5110}, {"loss": 1.4274, "grad_norm": 0.7037415504455566, "learning_rate": 0.0002, "epoch": 3.9736127279782694, "step": 5120}, {"loss": 1.4292, "grad_norm": 0.7235575914382935, "learning_rate": 0.0002, "epoch": 3.981373690337602, "step": 5130}, {"loss": 1.4455, "grad_norm": 0.7092325687408447, "learning_rate": 0.0002, "epoch": 3.9891346526969347, "step": 5140}, {"loss": 1.4512, "grad_norm": 0.7490319609642029, "learning_rate": 0.0002, "epoch": 3.996895615056267, "step": 5150}, {"eval_loss": 1.9131355285644531, "eval_runtime": 105.5778, "eval_samples_per_second": 4.802, "eval_steps_per_second": 0.606, "epoch": 4.0, "step": 5154}, {"loss": 1.2643, "grad_norm": 0.7075854539871216, "learning_rate": 0.0002, "epoch": 4.0046565774155995, "step": 5160}, {"loss": 1.209, "grad_norm": 0.9466007351875305, "learning_rate": 0.0002, "epoch": 4.012417539774932, "step": 5170}, {"loss": 1.2567, "grad_norm": 1.0297044515609741, "learning_rate": 0.0002, "epoch": 4.020178502134264, "step": 5180}, {"loss": 1.1796, "grad_norm": 0.7765059471130371, "learning_rate": 0.0002, "epoch": 4.027939464493597, "step": 5190}, {"loss": 1.2356, "grad_norm": 0.995760977268219, "learning_rate": 0.0002, "epoch": 4.03570042685293, "step": 5200}, {"loss": 1.1792, "grad_norm": 0.8663829565048218, "learning_rate": 0.0002, "epoch": 4.043461389212262, "step": 5210}, {"loss": 1.2471, "grad_norm": 1.0660825967788696, "learning_rate": 0.0002, "epoch": 4.051222351571595, "step": 5220}, {"loss": 1.1676, "grad_norm": 0.9858174920082092, "learning_rate": 0.0002, "epoch": 4.058983313930927, "step": 5230}, {"loss": 1.2448, "grad_norm": 0.8911338448524475, "learning_rate": 0.0002, "epoch": 4.06674427629026, "step": 5240}, {"loss": 1.1858, "grad_norm": 1.0848394632339478, "learning_rate": 0.0002, "epoch": 4.074505238649593, "step": 5250}, {"loss": 1.1684, "grad_norm": 1.0849905014038086, "learning_rate": 0.0002, "epoch": 4.082266201008925, "step": 5260}, {"loss": 1.2007, "grad_norm": 1.0497841835021973, "learning_rate": 0.0002, "epoch": 4.090027163368258, "step": 5270}, {"loss": 1.2552, "grad_norm": 0.8943053483963013, "learning_rate": 0.0002, "epoch": 4.0977881257275905, "step": 5280}, {"loss": 1.1923, "grad_norm": 0.8432527184486389, "learning_rate": 0.0002, "epoch": 4.1055490880869225, "step": 5290}, {"loss": 1.1634, "grad_norm": 0.9690414667129517, "learning_rate": 0.0002, "epoch": 4.113310050446255, "step": 5300}, {"loss": 1.3019, "grad_norm": 0.7790773510932922, "learning_rate": 0.0002, "epoch": 4.121071012805588, "step": 5310}, {"loss": 1.1806, "grad_norm": 0.9289211630821228, "learning_rate": 0.0002, "epoch": 4.12883197516492, "step": 5320}, {"loss": 1.1458, "grad_norm": 1.0785125494003296, "learning_rate": 0.0002, "epoch": 4.136592937524253, "step": 5330}, {"loss": 1.2086, "grad_norm": 0.8559591770172119, "learning_rate": 0.0002, "epoch": 4.144353899883585, "step": 5340}, {"loss": 1.1974, "grad_norm": 0.9405956268310547, "learning_rate": 0.0002, "epoch": 4.152114862242918, "step": 5350}, {"loss": 1.1793, "grad_norm": 0.9942827820777893, "learning_rate": 0.0002, "epoch": 4.159875824602251, "step": 5360}, {"loss": 1.1659, "grad_norm": 0.9141933917999268, "learning_rate": 0.0002, "epoch": 4.167636786961583, "step": 5370}, {"loss": 1.1647, "grad_norm": 0.8206015229225159, "learning_rate": 0.0002, "epoch": 4.175397749320916, "step": 5380}, {"loss": 1.2778, "grad_norm": 0.9340888857841492, "learning_rate": 0.0002, "epoch": 4.183158711680249, "step": 5390}, {"loss": 1.2459, "grad_norm": 1.2122114896774292, "learning_rate": 0.0002, "epoch": 4.190919674039581, "step": 5400}, {"loss": 1.2371, "grad_norm": 1.0661298036575317, "learning_rate": 0.0002, "epoch": 4.1986806363989135, "step": 5410}, {"loss": 1.1978, "grad_norm": 0.9372861385345459, "learning_rate": 0.0002, "epoch": 4.206441598758246, "step": 5420}, {"loss": 1.2653, "grad_norm": 0.894012987613678, "learning_rate": 0.0002, "epoch": 4.214202561117578, "step": 5430}, {"loss": 1.387, "grad_norm": 1.0647753477096558, "learning_rate": 0.0002, "epoch": 4.221963523476911, "step": 5440}, {"loss": 1.2231, "grad_norm": 0.989179790019989, "learning_rate": 0.0002, "epoch": 4.229724485836243, "step": 5450}, {"loss": 1.2715, "grad_norm": 1.1601181030273438, "learning_rate": 0.0002, "epoch": 4.237485448195576, "step": 5460}, {"loss": 1.2406, "grad_norm": 0.9395585656166077, "learning_rate": 0.0002, "epoch": 4.245246410554909, "step": 5470}, {"loss": 1.2779, "grad_norm": 0.9527766108512878, "learning_rate": 0.0002, "epoch": 4.253007372914241, "step": 5480}, {"loss": 1.267, "grad_norm": 1.0319520235061646, "learning_rate": 0.0002, "epoch": 4.260768335273574, "step": 5490}, {"loss": 1.2633, "grad_norm": 0.8659824728965759, "learning_rate": 0.0002, "epoch": 4.268529297632907, "step": 5500}, {"loss": 1.1475, "grad_norm": 1.099211573600769, "learning_rate": 0.0002, "epoch": 4.276290259992239, "step": 5510}, {"loss": 1.2508, "grad_norm": 0.9363361597061157, "learning_rate": 0.0002, "epoch": 4.284051222351572, "step": 5520}, {"loss": 1.189, "grad_norm": 0.8437647223472595, "learning_rate": 0.0002, "epoch": 4.2918121847109045, "step": 5530}, {"loss": 1.2212, "grad_norm": 0.9181258678436279, "learning_rate": 0.0002, "epoch": 4.2995731470702365, "step": 5540}, {"loss": 1.2092, "grad_norm": 0.9059357643127441, "learning_rate": 0.0002, "epoch": 4.307334109429569, "step": 5550}, {"loss": 1.2189, "grad_norm": 0.9337241649627686, "learning_rate": 0.0002, "epoch": 4.315095071788901, "step": 5560}, {"loss": 1.2462, "grad_norm": 0.9428889155387878, "learning_rate": 0.0002, "epoch": 4.322856034148234, "step": 5570}, {"loss": 1.2675, "grad_norm": 1.003589153289795, "learning_rate": 0.0002, "epoch": 4.330616996507567, "step": 5580}, {"loss": 1.2703, "grad_norm": 1.1249268054962158, "learning_rate": 0.0002, "epoch": 4.338377958866899, "step": 5590}, {"loss": 1.2501, "grad_norm": 0.8623469471931458, "learning_rate": 0.0002, "epoch": 4.346138921226232, "step": 5600}, {"loss": 1.2404, "grad_norm": 1.1389174461364746, "learning_rate": 0.0002, "epoch": 4.353899883585565, "step": 5610}, {"loss": 1.2245, "grad_norm": 1.0136264562606812, "learning_rate": 0.0002, "epoch": 4.361660845944897, "step": 5620}, {"loss": 1.3473, "grad_norm": 0.9567070603370667, "learning_rate": 0.0002, "epoch": 4.36942180830423, "step": 5630}, {"loss": 1.2988, "grad_norm": 1.0592148303985596, "learning_rate": 0.0002, "epoch": 4.377182770663563, "step": 5640}, {"loss": 1.212, "grad_norm": 1.0110485553741455, "learning_rate": 0.0002, "epoch": 4.384943733022895, "step": 5650}, {"loss": 1.2086, "grad_norm": 0.9914907217025757, "learning_rate": 0.0002, "epoch": 4.3927046953822275, "step": 5660}, {"loss": 1.2363, "grad_norm": 0.9447247982025146, "learning_rate": 0.0002, "epoch": 4.4004656577415595, "step": 5670}, {"loss": 1.2617, "grad_norm": 0.9644378423690796, "learning_rate": 0.0002, "epoch": 4.408226620100892, "step": 5680}, {"loss": 1.2773, "grad_norm": 0.920676589012146, "learning_rate": 0.0002, "epoch": 4.415987582460225, "step": 5690}, {"loss": 1.2792, "grad_norm": 1.060570478439331, "learning_rate": 0.0002, "epoch": 4.423748544819557, "step": 5700}, {"loss": 1.2374, "grad_norm": 0.8857738971710205, "learning_rate": 0.0002, "epoch": 4.43150950717889, "step": 5710}, {"loss": 1.2588, "grad_norm": 1.0536398887634277, "learning_rate": 0.0002, "epoch": 4.439270469538223, "step": 5720}, {"loss": 1.2051, "grad_norm": 0.990847110748291, "learning_rate": 0.0002, "epoch": 4.447031431897555, "step": 5730}, {"loss": 1.2469, "grad_norm": 0.9692499041557312, "learning_rate": 0.0002, "epoch": 4.454792394256888, "step": 5740}, {"loss": 1.2269, "grad_norm": 1.0376402139663696, "learning_rate": 0.0002, "epoch": 4.462553356616221, "step": 5750}, {"loss": 1.1701, "grad_norm": 1.3863259553909302, "learning_rate": 0.0002, "epoch": 4.470314318975553, "step": 5760}, {"loss": 1.2591, "grad_norm": 0.978379487991333, "learning_rate": 0.0002, "epoch": 4.478075281334886, "step": 5770}, {"loss": 1.2729, "grad_norm": 1.0973085165023804, "learning_rate": 0.0002, "epoch": 4.485836243694218, "step": 5780}, {"loss": 1.2404, "grad_norm": 1.057006597518921, "learning_rate": 0.0002, "epoch": 4.4935972060535505, "step": 5790}, {"loss": 1.2476, "grad_norm": 0.9247729182243347, "learning_rate": 0.0002, "epoch": 4.501358168412883, "step": 5800}, {"loss": 1.2369, "grad_norm": 1.0447787046432495, "learning_rate": 0.0002, "epoch": 4.509119130772215, "step": 5810}, {"loss": 1.211, "grad_norm": 1.1930429935455322, "learning_rate": 0.0002, "epoch": 4.516880093131548, "step": 5820}, {"loss": 1.2596, "grad_norm": 0.9867590069770813, "learning_rate": 0.0002, "epoch": 4.524641055490881, "step": 5830}, {"loss": 1.2766, "grad_norm": 0.9591100215911865, "learning_rate": 0.0002, "epoch": 4.532402017850213, "step": 5840}, {"loss": 1.2154, "grad_norm": 0.9950753450393677, "learning_rate": 0.0002, "epoch": 4.540162980209546, "step": 5850}, {"loss": 1.2149, "grad_norm": 1.0087506771087646, "learning_rate": 0.0002, "epoch": 4.547923942568879, "step": 5860}, {"loss": 1.3165, "grad_norm": 1.0934417247772217, "learning_rate": 0.0002, "epoch": 4.555684904928211, "step": 5870}, {"loss": 1.3059, "grad_norm": 1.107987403869629, "learning_rate": 0.0002, "epoch": 4.563445867287544, "step": 5880}, {"loss": 1.2184, "grad_norm": 0.9147276878356934, "learning_rate": 0.0002, "epoch": 4.571206829646876, "step": 5890}, {"loss": 1.24, "grad_norm": 1.036780595779419, "learning_rate": 0.0002, "epoch": 4.578967792006209, "step": 5900}, {"loss": 1.2209, "grad_norm": 0.9284719824790955, "learning_rate": 0.0002, "epoch": 4.5867287543655415, "step": 5910}, {"loss": 1.3693, "grad_norm": 0.9141898155212402, "learning_rate": 0.0002, "epoch": 4.5944897167248735, "step": 5920}, {"loss": 1.2319, "grad_norm": 1.0447357892990112, "learning_rate": 0.0002, "epoch": 4.602250679084206, "step": 5930}, {"loss": 1.2667, "grad_norm": 0.9309114217758179, "learning_rate": 0.0002, "epoch": 4.610011641443539, "step": 5940}, {"loss": 1.2827, "grad_norm": 1.2986129522323608, "learning_rate": 0.0002, "epoch": 4.617772603802871, "step": 5950}, {"loss": 1.312, "grad_norm": 0.9221704602241516, "learning_rate": 0.0002, "epoch": 4.625533566162204, "step": 5960}, {"loss": 1.2769, "grad_norm": 0.9228187799453735, "learning_rate": 0.0002, "epoch": 4.633294528521537, "step": 5970}, {"loss": 1.2953, "grad_norm": 0.9483116269111633, "learning_rate": 0.0002, "epoch": 4.641055490880869, "step": 5980}, {"loss": 1.3437, "grad_norm": 1.0218974351882935, "learning_rate": 0.0002, "epoch": 4.648816453240202, "step": 5990}, {"loss": 1.3085, "grad_norm": 0.9764600396156311, "learning_rate": 0.0002, "epoch": 4.656577415599534, "step": 6000}, {"loss": 1.197, "grad_norm": 0.9115710258483887, "learning_rate": 0.0002, "epoch": 4.664338377958867, "step": 6010}, {"loss": 1.1917, "grad_norm": 0.9245651364326477, "learning_rate": 0.0002, "epoch": 4.6720993403182, "step": 6020}, {"loss": 1.2969, "grad_norm": 0.9686311483383179, "learning_rate": 0.0002, "epoch": 4.6798603026775325, "step": 6030}, {"loss": 1.2702, "grad_norm": 1.1807392835617065, "learning_rate": 0.0002, "epoch": 4.6876212650368645, "step": 6040}, {"loss": 1.328, "grad_norm": 1.0358641147613525, "learning_rate": 0.0002, "epoch": 4.695382227396197, "step": 6050}, {"loss": 1.3281, "grad_norm": 0.987332284450531, "learning_rate": 0.0002, "epoch": 4.703143189755529, "step": 6060}, {"loss": 1.2514, "grad_norm": 1.0526494979858398, "learning_rate": 0.0002, "epoch": 4.710904152114862, "step": 6070}, {"loss": 1.2246, "grad_norm": 1.0276758670806885, "learning_rate": 0.0002, "epoch": 4.718665114474195, "step": 6080}, {"loss": 1.3367, "grad_norm": 0.9904406666755676, "learning_rate": 0.0002, "epoch": 4.726426076833527, "step": 6090}, {"loss": 1.2797, "grad_norm": 1.0084882974624634, "learning_rate": 0.0002, "epoch": 4.73418703919286, "step": 6100}, {"loss": 1.2656, "grad_norm": 0.8646450638771057, "learning_rate": 0.0002, "epoch": 4.741948001552192, "step": 6110}, {"loss": 1.3063, "grad_norm": 0.9233377575874329, "learning_rate": 0.0002, "epoch": 4.749708963911525, "step": 6120}, {"loss": 1.2642, "grad_norm": 0.9675140976905823, "learning_rate": 0.0002, "epoch": 4.757469926270858, "step": 6130}, {"loss": 1.3367, "grad_norm": 0.9639796018600464, "learning_rate": 0.0002, "epoch": 4.765230888630191, "step": 6140}, {"loss": 1.276, "grad_norm": 0.925199568271637, "learning_rate": 0.0002, "epoch": 4.772991850989523, "step": 6150}, {"loss": 1.2441, "grad_norm": 1.050901174545288, "learning_rate": 0.0002, "epoch": 4.7807528133488555, "step": 6160}, {"loss": 1.301, "grad_norm": 0.8920623660087585, "learning_rate": 0.0002, "epoch": 4.7885137757081875, "step": 6170}, {"loss": 1.263, "grad_norm": 0.8964757919311523, "learning_rate": 0.0002, "epoch": 4.79627473806752, "step": 6180}, {"loss": 1.2787, "grad_norm": 1.0839070081710815, "learning_rate": 0.0002, "epoch": 4.804035700426853, "step": 6190}, {"loss": 1.2664, "grad_norm": 0.8809942007064819, "learning_rate": 0.0002, "epoch": 4.811796662786185, "step": 6200}, {"loss": 1.321, "grad_norm": 1.0216195583343506, "learning_rate": 0.0002, "epoch": 4.819557625145518, "step": 6210}, {"loss": 1.3033, "grad_norm": 0.892005980014801, "learning_rate": 0.0002, "epoch": 4.827318587504851, "step": 6220}, {"loss": 1.2602, "grad_norm": 0.9957166910171509, "learning_rate": 0.0002, "epoch": 4.835079549864183, "step": 6230}, {"loss": 1.3562, "grad_norm": 0.9720533490180969, "learning_rate": 0.0002, "epoch": 4.842840512223516, "step": 6240}, {"loss": 1.2651, "grad_norm": 0.9336182475090027, "learning_rate": 0.0002, "epoch": 4.850601474582849, "step": 6250}, {"loss": 1.3136, "grad_norm": 1.2611457109451294, "learning_rate": 0.0002, "epoch": 4.858362436942181, "step": 6260}, {"loss": 1.2234, "grad_norm": 0.8927203416824341, "learning_rate": 0.0002, "epoch": 4.866123399301514, "step": 6270}, {"loss": 1.3463, "grad_norm": 0.9706710577011108, "learning_rate": 0.0002, "epoch": 4.873884361660846, "step": 6280}, {"loss": 1.3209, "grad_norm": 1.1461690664291382, "learning_rate": 0.0002, "epoch": 4.8816453240201785, "step": 6290}, {"loss": 1.2566, "grad_norm": 0.9930381178855896, "learning_rate": 0.0002, "epoch": 4.889406286379511, "step": 6300}, {"loss": 1.2568, "grad_norm": 0.91451096534729, "learning_rate": 0.0002, "epoch": 4.897167248738843, "step": 6310}, {"loss": 1.2836, "grad_norm": 1.0319571495056152, "learning_rate": 0.0002, "epoch": 4.904928211098176, "step": 6320}, {"loss": 1.2908, "grad_norm": 0.990140438079834, "learning_rate": 0.0002, "epoch": 4.912689173457509, "step": 6330}, {"loss": 1.3299, "grad_norm": 1.2466117143630981, "learning_rate": 0.0002, "epoch": 4.920450135816841, "step": 6340}, {"loss": 1.2659, "grad_norm": 1.0316979885101318, "learning_rate": 0.0002, "epoch": 4.928211098176174, "step": 6350}, {"loss": 1.3292, "grad_norm": 1.0643759965896606, "learning_rate": 0.0002, "epoch": 4.935972060535507, "step": 6360}, {"loss": 1.2559, "grad_norm": 0.9703279733657837, "learning_rate": 0.0002, "epoch": 4.943733022894839, "step": 6370}, {"loss": 1.2155, "grad_norm": 0.9767927527427673, "learning_rate": 0.0002, "epoch": 4.951493985254172, "step": 6380}, {"loss": 1.2437, "grad_norm": 0.960854172706604, "learning_rate": 0.0002, "epoch": 4.959254947613504, "step": 6390}, {"loss": 1.3314, "grad_norm": 0.9922910332679749, "learning_rate": 0.0002, "epoch": 4.967015909972837, "step": 6400}, {"loss": 1.3018, "grad_norm": 0.956470787525177, "learning_rate": 0.0002, "epoch": 4.9747768723321695, "step": 6410}, {"loss": 1.2794, "grad_norm": 0.9637242555618286, "learning_rate": 0.0002, "epoch": 4.9825378346915015, "step": 6420}, {"loss": 1.3236, "grad_norm": 1.0855202674865723, "learning_rate": 0.0002, "epoch": 4.990298797050834, "step": 6430}, {"loss": 1.3015, "grad_norm": 0.9655316472053528, "learning_rate": 0.0002, "epoch": 4.998059759410167, "step": 6440}, {"eval_loss": 2.0410802364349365, "eval_runtime": 113.04, "eval_samples_per_second": 4.485, "eval_steps_per_second": 0.566, "epoch": 4.9996119518820334, "step": 6442}, {"loss": 1.0846, "grad_norm": 1.1676199436187744, "learning_rate": 0.0002, "epoch": 5.005820721769499, "step": 6450}, {"loss": 1.041, "grad_norm": 1.4317965507507324, "learning_rate": 0.0002, "epoch": 5.013581684128832, "step": 6460}, {"loss": 0.9546, "grad_norm": 1.460443377494812, "learning_rate": 0.0002, "epoch": 5.021342646488165, "step": 6470}, {"loss": 1.0014, "grad_norm": 1.2299214601516724, "learning_rate": 0.0002, "epoch": 5.029103608847497, "step": 6480}, {"loss": 1.0397, "grad_norm": 1.3125724792480469, "learning_rate": 0.0002, "epoch": 5.03686457120683, "step": 6490}, {"loss": 1.0134, "grad_norm": 1.1252319812774658, "learning_rate": 0.0002, "epoch": 5.044625533566162, "step": 6500}, {"loss": 0.976, "grad_norm": 0.9970866441726685, "learning_rate": 0.0002, "epoch": 5.052386495925495, "step": 6510}, {"loss": 0.9731, "grad_norm": 1.229069709777832, "learning_rate": 0.0002, "epoch": 5.060147458284828, "step": 6520}, {"loss": 1.0498, "grad_norm": 1.2430938482284546, "learning_rate": 0.0002, "epoch": 5.06790842064416, "step": 6530}, {"loss": 1.0236, "grad_norm": 1.0522737503051758, "learning_rate": 0.0002, "epoch": 5.0756693830034925, "step": 6540}, {"loss": 1.0221, "grad_norm": 1.108890175819397, "learning_rate": 0.0002, "epoch": 5.083430345362825, "step": 6550}, {"loss": 1.0177, "grad_norm": 1.156912922859192, "learning_rate": 0.0002, "epoch": 5.091191307722157, "step": 6560}, {"loss": 1.0415, "grad_norm": 1.405895709991455, "learning_rate": 0.0002, "epoch": 5.09895227008149, "step": 6570}, {"loss": 0.9811, "grad_norm": 1.2005155086517334, "learning_rate": 0.0002, "epoch": 5.106713232440823, "step": 6580}, {"loss": 0.9862, "grad_norm": 1.181443452835083, "learning_rate": 0.0002, "epoch": 5.114474194800155, "step": 6590}, {"loss": 1.0291, "grad_norm": 2.3444771766662598, "learning_rate": 0.0002, "epoch": 5.122235157159488, "step": 6600}, {"loss": 1.0455, "grad_norm": 1.216988444328308, "learning_rate": 0.0002, "epoch": 5.12999611951882, "step": 6610}, {"loss": 1.0549, "grad_norm": 1.369553565979004, "learning_rate": 0.0002, "epoch": 5.137757081878153, "step": 6620}, {"loss": 1.0056, "grad_norm": 1.177964687347412, "learning_rate": 0.0002, "epoch": 5.145518044237486, "step": 6630}, {"loss": 1.1025, "grad_norm": 1.1397041082382202, "learning_rate": 0.0002, "epoch": 5.153279006596818, "step": 6640}, {"loss": 1.0437, "grad_norm": 1.3976861238479614, "learning_rate": 0.0002, "epoch": 5.161039968956151, "step": 6650}, {"loss": 1.0454, "grad_norm": 1.4824495315551758, "learning_rate": 0.0002, "epoch": 5.1688009313154835, "step": 6660}, {"loss": 1.0356, "grad_norm": 1.2653018236160278, "learning_rate": 0.0002, "epoch": 5.1765618936748155, "step": 6670}, {"loss": 0.9971, "grad_norm": 1.3106069564819336, "learning_rate": 0.0002, "epoch": 5.184322856034148, "step": 6680}, {"loss": 1.0561, "grad_norm": 1.3140279054641724, "learning_rate": 0.0002, "epoch": 5.192083818393481, "step": 6690}, {"loss": 1.0618, "grad_norm": 1.3900256156921387, "learning_rate": 0.0002, "epoch": 5.199844780752813, "step": 6700}, {"loss": 1.0285, "grad_norm": 1.3191124200820923, "learning_rate": 0.0002, "epoch": 5.207605743112146, "step": 6710}, {"loss": 0.9921, "grad_norm": 1.176107406616211, "learning_rate": 0.0002, "epoch": 5.215366705471478, "step": 6720}, {"loss": 1.064, "grad_norm": 1.2364883422851562, "learning_rate": 0.0002, "epoch": 5.223127667830811, "step": 6730}, {"loss": 0.9599, "grad_norm": 1.343022108078003, "learning_rate": 0.0002, "epoch": 5.230888630190144, "step": 6740}, {"loss": 1.0342, "grad_norm": 1.2826898097991943, "learning_rate": 0.0002, "epoch": 5.238649592549476, "step": 6750}, {"loss": 1.0703, "grad_norm": 1.500257134437561, "learning_rate": 0.0002, "epoch": 5.246410554908809, "step": 6760}, {"loss": 1.0114, "grad_norm": 1.2605743408203125, "learning_rate": 0.0002, "epoch": 5.254171517268142, "step": 6770}, {"loss": 1.0825, "grad_norm": 1.2355525493621826, "learning_rate": 0.0002, "epoch": 5.261932479627474, "step": 6780}, {"loss": 1.0436, "grad_norm": 1.2845789194107056, "learning_rate": 0.0002, "epoch": 5.2696934419868064, "step": 6790}, {"loss": 0.989, "grad_norm": 1.3696625232696533, "learning_rate": 0.0002, "epoch": 5.277454404346139, "step": 6800}, {"loss": 1.0991, "grad_norm": 1.4051260948181152, "learning_rate": 0.0002, "epoch": 5.285215366705471, "step": 6810}, {"loss": 1.0987, "grad_norm": 1.266725778579712, "learning_rate": 0.0002, "epoch": 5.292976329064804, "step": 6820}, {"loss": 1.0489, "grad_norm": 1.3475236892700195, "learning_rate": 0.0002, "epoch": 5.300737291424136, "step": 6830}, {"loss": 1.0264, "grad_norm": 1.54409921169281, "learning_rate": 0.0002, "epoch": 5.308498253783469, "step": 6840}, {"loss": 1.033, "grad_norm": 1.2391985654830933, "learning_rate": 0.0002, "epoch": 5.316259216142802, "step": 6850}, {"loss": 1.1058, "grad_norm": 1.2435699701309204, "learning_rate": 0.0002, "epoch": 5.324020178502134, "step": 6860}, {"loss": 1.0179, "grad_norm": 1.8803037405014038, "learning_rate": 0.0002, "epoch": 5.331781140861467, "step": 6870}, {"loss": 0.997, "grad_norm": 1.4195542335510254, "learning_rate": 0.0002, "epoch": 5.3395421032208, "step": 6880}, {"loss": 1.0273, "grad_norm": 1.1853394508361816, "learning_rate": 0.0002, "epoch": 5.347303065580132, "step": 6890}, {"loss": 1.0668, "grad_norm": 1.4016530513763428, "learning_rate": 0.0002, "epoch": 5.355064027939465, "step": 6900}, {"loss": 1.1099, "grad_norm": 1.294339895248413, "learning_rate": 0.0002, "epoch": 5.3628249902987974, "step": 6910}, {"loss": 1.0724, "grad_norm": 1.2952708005905151, "learning_rate": 0.0002, "epoch": 5.370585952658129, "step": 6920}, {"loss": 1.0098, "grad_norm": 1.1361510753631592, "learning_rate": 0.0002, "epoch": 5.378346915017462, "step": 6930}, {"loss": 1.0796, "grad_norm": 1.125805377960205, "learning_rate": 0.0002, "epoch": 5.386107877376794, "step": 6940}, {"loss": 1.122, "grad_norm": 1.1453300714492798, "learning_rate": 0.0002, "epoch": 5.393868839736127, "step": 6950}, {"loss": 1.0977, "grad_norm": 1.4542768001556396, "learning_rate": 0.0002, "epoch": 5.40162980209546, "step": 6960}, {"loss": 1.0825, "grad_norm": 1.2360988855361938, "learning_rate": 0.0002, "epoch": 5.409390764454792, "step": 6970}, {"loss": 1.0631, "grad_norm": 1.2182754278182983, "learning_rate": 0.0002, "epoch": 5.417151726814125, "step": 6980}, {"loss": 1.0471, "grad_norm": 1.2018693685531616, "learning_rate": 0.0002, "epoch": 5.424912689173458, "step": 6990}, {"loss": 1.108, "grad_norm": 1.346124291419983, "learning_rate": 0.0002, "epoch": 5.43267365153279, "step": 7000}, {"loss": 1.0534, "grad_norm": 1.2534189224243164, "learning_rate": 0.0002, "epoch": 5.440434613892123, "step": 7010}, {"loss": 1.0696, "grad_norm": 1.2033339738845825, "learning_rate": 0.0002, "epoch": 5.448195576251456, "step": 7020}, {"loss": 1.0714, "grad_norm": 1.2788134813308716, "learning_rate": 0.0002, "epoch": 5.4559565386107876, "step": 7030}, {"loss": 1.1274, "grad_norm": 1.2751542329788208, "learning_rate": 0.0002, "epoch": 5.46371750097012, "step": 7040}, {"loss": 1.0767, "grad_norm": 1.3237019777297974, "learning_rate": 0.0002, "epoch": 5.471478463329452, "step": 7050}, {"loss": 1.1081, "grad_norm": 1.4932852983474731, "learning_rate": 0.0002, "epoch": 5.479239425688785, "step": 7060}, {"loss": 1.0197, "grad_norm": 1.4003876447677612, "learning_rate": 0.0002, "epoch": 5.487000388048118, "step": 7070}, {"loss": 1.0662, "grad_norm": 1.404799461364746, "learning_rate": 0.0002, "epoch": 5.49476135040745, "step": 7080}, {"loss": 1.0354, "grad_norm": 1.4486982822418213, "learning_rate": 0.0002, "epoch": 5.502522312766783, "step": 7090}, {"loss": 1.0645, "grad_norm": 1.1713480949401855, "learning_rate": 0.0002, "epoch": 5.510283275126116, "step": 7100}, {"loss": 1.006, "grad_norm": 1.4062601327896118, "learning_rate": 0.0002, "epoch": 5.518044237485448, "step": 7110}, {"loss": 1.0459, "grad_norm": 1.211629867553711, "learning_rate": 0.0002, "epoch": 5.525805199844781, "step": 7120}, {"loss": 1.102, "grad_norm": 1.2523176670074463, "learning_rate": 0.0002, "epoch": 5.533566162204114, "step": 7130}, {"loss": 1.1132, "grad_norm": 1.4467198848724365, "learning_rate": 0.0002, "epoch": 5.541327124563446, "step": 7140}, {"loss": 1.1557, "grad_norm": 1.5961614847183228, "learning_rate": 0.0002, "epoch": 5.5490880869227786, "step": 7150}, {"loss": 1.0859, "grad_norm": 1.320656418800354, "learning_rate": 0.0002, "epoch": 5.5568490492821105, "step": 7160}, {"loss": 1.109, "grad_norm": 1.2423332929611206, "learning_rate": 0.0002, "epoch": 5.564610011641443, "step": 7170}, {"loss": 1.0046, "grad_norm": 1.2919669151306152, "learning_rate": 0.0002, "epoch": 5.572370974000776, "step": 7180}, {"loss": 1.046, "grad_norm": 1.1678385734558105, "learning_rate": 0.0002, "epoch": 5.580131936360108, "step": 7190}, {"loss": 1.1011, "grad_norm": 1.4250764846801758, "learning_rate": 0.0002, "epoch": 5.587892898719441, "step": 7200}, {"loss": 1.1254, "grad_norm": 1.5308716297149658, "learning_rate": 0.0002, "epoch": 5.595653861078774, "step": 7210}, {"loss": 1.121, "grad_norm": 1.2678815126419067, "learning_rate": 0.0002, "epoch": 5.603414823438106, "step": 7220}, {"loss": 1.0846, "grad_norm": 1.127856969833374, "learning_rate": 0.0002, "epoch": 5.611175785797439, "step": 7230}, {"loss": 1.0647, "grad_norm": 1.3832560777664185, "learning_rate": 0.0002, "epoch": 5.618936748156772, "step": 7240}, {"loss": 1.0658, "grad_norm": 1.3226919174194336, "learning_rate": 0.0002, "epoch": 5.626697710516104, "step": 7250}, {"loss": 1.1175, "grad_norm": 1.3418006896972656, "learning_rate": 0.0002, "epoch": 5.634458672875437, "step": 7260}, {"loss": 1.0956, "grad_norm": 1.2625300884246826, "learning_rate": 0.0002, "epoch": 5.642219635234769, "step": 7270}, {"loss": 1.067, "grad_norm": 1.1579464673995972, "learning_rate": 0.0002, "epoch": 5.6499805975941015, "step": 7280}, {"loss": 1.0447, "grad_norm": 1.4998650550842285, "learning_rate": 0.0002, "epoch": 5.657741559953434, "step": 7290}, {"loss": 1.1256, "grad_norm": 1.2670758962631226, "learning_rate": 0.0002, "epoch": 5.665502522312766, "step": 7300}, {"loss": 1.1267, "grad_norm": 1.2959760427474976, "learning_rate": 0.0002, "epoch": 5.673263484672099, "step": 7310}, {"loss": 1.1387, "grad_norm": 1.2460671663284302, "learning_rate": 0.0002, "epoch": 5.681024447031432, "step": 7320}, {"loss": 1.0756, "grad_norm": 1.1313989162445068, "learning_rate": 0.0002, "epoch": 5.688785409390764, "step": 7330}, {"loss": 1.0618, "grad_norm": 1.282527208328247, "learning_rate": 0.0002, "epoch": 5.696546371750097, "step": 7340}, {"loss": 1.1315, "grad_norm": 1.3380206823349, "learning_rate": 0.0002, "epoch": 5.70430733410943, "step": 7350}, {"loss": 1.0949, "grad_norm": 1.1648279428482056, "learning_rate": 0.0002, "epoch": 5.712068296468762, "step": 7360}, {"loss": 1.1705, "grad_norm": 1.3059816360473633, "learning_rate": 0.0002, "epoch": 5.719829258828095, "step": 7370}, {"loss": 1.1496, "grad_norm": 1.1905046701431274, "learning_rate": 0.0002, "epoch": 5.727590221187427, "step": 7380}, {"loss": 1.1356, "grad_norm": 1.4089630842208862, "learning_rate": 0.0002, "epoch": 5.73535118354676, "step": 7390}, {"loss": 1.1349, "grad_norm": 1.256721019744873, "learning_rate": 0.0002, "epoch": 5.7431121459060925, "step": 7400}, {"loss": 1.0682, "grad_norm": 1.1915162801742554, "learning_rate": 0.0002, "epoch": 5.7508731082654245, "step": 7410}, {"loss": 1.1257, "grad_norm": 1.1935480833053589, "learning_rate": 0.0002, "epoch": 5.758634070624757, "step": 7420}, {"loss": 1.1348, "grad_norm": 1.1761008501052856, "learning_rate": 0.0002, "epoch": 5.76639503298409, "step": 7430}, {"loss": 1.0837, "grad_norm": 1.2540549039840698, "learning_rate": 0.0002, "epoch": 5.774155995343422, "step": 7440}, {"loss": 1.1527, "grad_norm": 1.5295120477676392, "learning_rate": 0.0002, "epoch": 5.781916957702755, "step": 7450}, {"loss": 1.1146, "grad_norm": 1.1081160306930542, "learning_rate": 0.0002, "epoch": 5.789677920062088, "step": 7460}, {"loss": 1.1304, "grad_norm": 1.4381253719329834, "learning_rate": 0.0002, "epoch": 5.79743888242142, "step": 7470}, {"loss": 1.0684, "grad_norm": 1.3079341650009155, "learning_rate": 0.0002, "epoch": 5.805199844780753, "step": 7480}, {"loss": 1.0544, "grad_norm": 1.1372792720794678, "learning_rate": 0.0002, "epoch": 5.812960807140085, "step": 7490}, {"loss": 1.1622, "grad_norm": 1.3221744298934937, "learning_rate": 0.0002, "epoch": 5.820721769499418, "step": 7500}, {"loss": 1.1515, "grad_norm": 1.3436939716339111, "learning_rate": 0.0002, "epoch": 5.828482731858751, "step": 7510}, {"loss": 1.1154, "grad_norm": 1.3916879892349243, "learning_rate": 0.0002, "epoch": 5.8362436942180835, "step": 7520}, {"loss": 1.0816, "grad_norm": 1.2463704347610474, "learning_rate": 0.0002, "epoch": 5.8440046565774155, "step": 7530}, {"loss": 1.0745, "grad_norm": 1.097051739692688, "learning_rate": 0.0002, "epoch": 5.851765618936748, "step": 7540}, {"loss": 1.1454, "grad_norm": 1.1554739475250244, "learning_rate": 0.0002, "epoch": 5.85952658129608, "step": 7550}, {"loss": 1.0953, "grad_norm": 1.2384694814682007, "learning_rate": 0.0002, "epoch": 5.867287543655413, "step": 7560}, {"loss": 1.1734, "grad_norm": 1.142815351486206, "learning_rate": 0.0002, "epoch": 5.875048506014746, "step": 7570}, {"loss": 1.162, "grad_norm": 1.3637062311172485, "learning_rate": 0.0002, "epoch": 5.882809468374078, "step": 7580}, {"loss": 1.0781, "grad_norm": 1.2449073791503906, "learning_rate": 0.0002, "epoch": 5.890570430733411, "step": 7590}, {"loss": 1.1191, "grad_norm": 1.358058214187622, "learning_rate": 0.0002, "epoch": 5.898331393092743, "step": 7600}, {"loss": 1.0779, "grad_norm": 1.264655351638794, "learning_rate": 0.0002, "epoch": 5.906092355452076, "step": 7610}, {"loss": 1.1538, "grad_norm": 1.3186019659042358, "learning_rate": 0.0002, "epoch": 5.913853317811409, "step": 7620}, {"loss": 1.1076, "grad_norm": 1.4111460447311401, "learning_rate": 0.0002, "epoch": 5.921614280170742, "step": 7630}, {"loss": 1.1765, "grad_norm": 1.1078972816467285, "learning_rate": 0.0002, "epoch": 5.929375242530074, "step": 7640}, {"loss": 1.1305, "grad_norm": 1.2742213010787964, "learning_rate": 0.0002, "epoch": 5.9371362048894065, "step": 7650}, {"loss": 1.144, "grad_norm": 1.3412781953811646, "learning_rate": 0.0002, "epoch": 5.9448971672487385, "step": 7660}, {"loss": 1.1642, "grad_norm": 1.123005986213684, "learning_rate": 0.0002, "epoch": 5.952658129608071, "step": 7670}, {"loss": 1.0732, "grad_norm": 1.2203444242477417, "learning_rate": 0.0002, "epoch": 5.960419091967404, "step": 7680}, {"loss": 1.158, "grad_norm": 1.341011643409729, "learning_rate": 0.0002, "epoch": 5.968180054326736, "step": 7690}, {"loss": 1.1144, "grad_norm": 1.2689454555511475, "learning_rate": 0.0002, "epoch": 5.975941016686069, "step": 7700}, {"loss": 1.2051, "grad_norm": 1.1518112421035767, "learning_rate": 0.0002, "epoch": 5.983701979045401, "step": 7710}, {"loss": 1.1868, "grad_norm": 1.3698320388793945, "learning_rate": 0.0002, "epoch": 5.991462941404734, "step": 7720}, {"loss": 1.0651, "grad_norm": 1.2812788486480713, "learning_rate": 0.0002, "epoch": 5.999223903764067, "step": 7730}, {"eval_loss": 2.252762794494629, "eval_runtime": 114.8471, "eval_samples_per_second": 4.415, "eval_steps_per_second": 0.557, "epoch": 6.0, "step": 7731}, {"loss": 0.8629, "grad_norm": 1.8642009496688843, "learning_rate": 0.0002, "epoch": 6.006984866123399, "step": 7740}, {"loss": 0.8435, "grad_norm": 1.7081232070922852, "learning_rate": 0.0002, "epoch": 6.014745828482732, "step": 7750}, {"loss": 0.7729, "grad_norm": 1.6233899593353271, "learning_rate": 0.0002, "epoch": 6.022506790842065, "step": 7760}, {"loss": 0.7907, "grad_norm": 1.5111888647079468, "learning_rate": 0.0002, "epoch": 6.030267753201397, "step": 7770}, {"loss": 0.7908, "grad_norm": 1.5278418064117432, "learning_rate": 0.0002, "epoch": 6.0380287155607295, "step": 7780}, {"loss": 0.835, "grad_norm": 1.5932185649871826, "learning_rate": 0.0002, "epoch": 6.045789677920062, "step": 7790}, {"loss": 0.7682, "grad_norm": 1.5990597009658813, "learning_rate": 0.0002, "epoch": 6.053550640279394, "step": 7800}, {"loss": 0.8559, "grad_norm": 1.7498669624328613, "learning_rate": 0.0002, "epoch": 6.061311602638727, "step": 7810}, {"loss": 0.8069, "grad_norm": 1.6105555295944214, "learning_rate": 0.0002, "epoch": 6.06907256499806, "step": 7820}, {"loss": 0.8473, "grad_norm": 1.5214293003082275, "learning_rate": 0.0002, "epoch": 6.076833527357392, "step": 7830}, {"loss": 0.8328, "grad_norm": 1.6586973667144775, "learning_rate": 0.0002, "epoch": 6.084594489716725, "step": 7840}, {"loss": 0.8415, "grad_norm": 1.467391848564148, "learning_rate": 0.0002, "epoch": 6.092355452076057, "step": 7850}, {"loss": 0.8274, "grad_norm": 1.537361741065979, "learning_rate": 0.0002, "epoch": 6.10011641443539, "step": 7860}, {"loss": 0.8011, "grad_norm": 1.621764898300171, "learning_rate": 0.0002, "epoch": 6.107877376794723, "step": 7870}, {"loss": 0.8556, "grad_norm": 1.583751916885376, "learning_rate": 0.0002, "epoch": 6.115638339154055, "step": 7880}, {"loss": 0.8829, "grad_norm": 1.6199619770050049, "learning_rate": 0.0002, "epoch": 6.123399301513388, "step": 7890}, {"loss": 0.8226, "grad_norm": 1.6163095235824585, "learning_rate": 0.0002, "epoch": 6.1311602638727205, "step": 7900}, {"loss": 0.8203, "grad_norm": 1.6120976209640503, "learning_rate": 0.0002, "epoch": 6.1389212262320525, "step": 7910}, {"loss": 0.7915, "grad_norm": 1.7886850833892822, "learning_rate": 0.0002, "epoch": 6.146682188591385, "step": 7920}, {"loss": 0.7808, "grad_norm": 1.408303141593933, "learning_rate": 0.0002, "epoch": 6.154443150950718, "step": 7930}, {"loss": 0.8404, "grad_norm": 1.6048113107681274, "learning_rate": 0.0002, "epoch": 6.16220411331005, "step": 7940}, {"loss": 0.8705, "grad_norm": 1.424306869506836, "learning_rate": 0.0002, "epoch": 6.169965075669383, "step": 7950}, {"loss": 0.8177, "grad_norm": 1.4453672170639038, "learning_rate": 0.0002, "epoch": 6.177726038028716, "step": 7960}, {"loss": 0.8182, "grad_norm": 1.3157061338424683, "learning_rate": 0.0002, "epoch": 6.185487000388048, "step": 7970}, {"loss": 0.891, "grad_norm": 1.330541729927063, "learning_rate": 0.0002, "epoch": 6.193247962747381, "step": 7980}, {"loss": 0.8599, "grad_norm": 1.6306229829788208, "learning_rate": 0.0002, "epoch": 6.201008925106713, "step": 7990}, {"loss": 0.9069, "grad_norm": 1.6332136392593384, "learning_rate": 0.0002, "epoch": 6.208769887466046, "step": 8000}, {"loss": 0.83, "grad_norm": 1.708613395690918, "learning_rate": 0.0002, "epoch": 6.216530849825379, "step": 8010}, {"loss": 0.8509, "grad_norm": 1.6637346744537354, "learning_rate": 0.0002, "epoch": 6.224291812184711, "step": 8020}, {"loss": 0.84, "grad_norm": 1.5675315856933594, "learning_rate": 0.0002, "epoch": 6.2320527745440435, "step": 8030}, {"loss": 0.8491, "grad_norm": 1.5826327800750732, "learning_rate": 0.0002, "epoch": 6.239813736903376, "step": 8040}, {"loss": 0.8374, "grad_norm": 1.7382984161376953, "learning_rate": 0.0002, "epoch": 6.247574699262708, "step": 8050}, {"loss": 0.8795, "grad_norm": 1.5272295475006104, "learning_rate": 0.0002, "epoch": 6.255335661622041, "step": 8060}, {"loss": 0.8745, "grad_norm": 1.8195022344589233, "learning_rate": 0.0002, "epoch": 6.263096623981374, "step": 8070}, {"loss": 0.8743, "grad_norm": 1.679901361465454, "learning_rate": 0.0002, "epoch": 6.270857586340706, "step": 8080}, {"loss": 0.9006, "grad_norm": 1.4921348094940186, "learning_rate": 0.0002, "epoch": 6.278618548700039, "step": 8090}, {"loss": 0.899, "grad_norm": 1.4627857208251953, "learning_rate": 0.0002, "epoch": 6.286379511059371, "step": 8100}, {"loss": 0.8944, "grad_norm": 1.3528631925582886, "learning_rate": 0.0002, "epoch": 6.294140473418704, "step": 8110}, {"loss": 0.9355, "grad_norm": 1.6863102912902832, "learning_rate": 0.0002, "epoch": 6.301901435778037, "step": 8120}, {"loss": 0.8764, "grad_norm": 1.6178052425384521, "learning_rate": 0.0002, "epoch": 6.309662398137369, "step": 8130}, {"loss": 0.9182, "grad_norm": 1.7626280784606934, "learning_rate": 0.0002, "epoch": 6.317423360496702, "step": 8140}, {"loss": 0.8886, "grad_norm": 1.7188845872879028, "learning_rate": 0.0002, "epoch": 6.3251843228560345, "step": 8150}, {"loss": 0.895, "grad_norm": 1.5777133703231812, "learning_rate": 0.0002, "epoch": 6.3329452852153665, "step": 8160}, {"loss": 0.9247, "grad_norm": 1.7653207778930664, "learning_rate": 0.0002, "epoch": 6.340706247574699, "step": 8170}, {"loss": 0.8003, "grad_norm": 1.6861237287521362, "learning_rate": 0.0002, "epoch": 6.348467209934032, "step": 8180}, {"loss": 0.884, "grad_norm": 1.6318124532699585, "learning_rate": 0.0002, "epoch": 6.356228172293364, "step": 8190}, {"loss": 0.8341, "grad_norm": 1.6192939281463623, "learning_rate": 0.0002, "epoch": 6.363989134652697, "step": 8200}, {"loss": 0.8939, "grad_norm": 1.7641773223876953, "learning_rate": 0.0002, "epoch": 6.371750097012029, "step": 8210}, {"loss": 0.8582, "grad_norm": 1.6470493078231812, "learning_rate": 0.0002, "epoch": 6.379511059371362, "step": 8220}, {"loss": 0.9351, "grad_norm": 1.5898468494415283, "learning_rate": 0.0002, "epoch": 6.387272021730695, "step": 8230}, {"loss": 0.9658, "grad_norm": 1.8025981187820435, "learning_rate": 0.0002, "epoch": 6.395032984090027, "step": 8240}, {"loss": 0.8953, "grad_norm": 1.7035106420516968, "learning_rate": 0.0002, "epoch": 6.40279394644936, "step": 8250}, {"loss": 0.9193, "grad_norm": 1.5968799591064453, "learning_rate": 0.0002, "epoch": 6.410554908808693, "step": 8260}, {"loss": 0.929, "grad_norm": 1.7492800951004028, "learning_rate": 0.0002, "epoch": 6.418315871168025, "step": 8270}, {"loss": 0.9297, "grad_norm": 1.6914138793945312, "learning_rate": 0.0002, "epoch": 6.4260768335273575, "step": 8280}, {"loss": 0.8878, "grad_norm": 1.5761380195617676, "learning_rate": 0.0002, "epoch": 6.43383779588669, "step": 8290}, {"loss": 0.8761, "grad_norm": 1.5164411067962646, "learning_rate": 0.0002, "epoch": 6.441598758246022, "step": 8300}, {"loss": 0.88, "grad_norm": 1.6600215435028076, "learning_rate": 0.0002, "epoch": 6.449359720605355, "step": 8310}, {"loss": 0.9113, "grad_norm": 1.2477679252624512, "learning_rate": 0.0002, "epoch": 6.457120682964687, "step": 8320}, {"loss": 0.8822, "grad_norm": 1.3698599338531494, "learning_rate": 0.0002, "epoch": 6.46488164532402, "step": 8330}, {"loss": 0.9295, "grad_norm": 1.4847341775894165, "learning_rate": 0.0002, "epoch": 6.472642607683353, "step": 8340}, {"loss": 0.9243, "grad_norm": 1.4713412523269653, "learning_rate": 0.0002, "epoch": 6.480403570042685, "step": 8350}, {"loss": 0.9102, "grad_norm": 1.334523320198059, "learning_rate": 0.0002, "epoch": 6.488164532402018, "step": 8360}, {"loss": 0.8563, "grad_norm": 2.0054359436035156, "learning_rate": 0.0002, "epoch": 6.495925494761351, "step": 8370}, {"loss": 0.9759, "grad_norm": 1.560014247894287, "learning_rate": 0.0002, "epoch": 6.503686457120683, "step": 8380}, {"loss": 0.8542, "grad_norm": 1.518526554107666, "learning_rate": 0.0002, "epoch": 6.511447419480016, "step": 8390}, {"loss": 0.937, "grad_norm": 1.3841272592544556, "learning_rate": 0.0002, "epoch": 6.5192083818393485, "step": 8400}, {"loss": 0.9576, "grad_norm": 1.5191527605056763, "learning_rate": 0.0002, "epoch": 6.5269693441986805, "step": 8410}, {"loss": 0.8899, "grad_norm": 1.5275579690933228, "learning_rate": 0.0002, "epoch": 6.534730306558013, "step": 8420}, {"loss": 0.9291, "grad_norm": 1.621590256690979, "learning_rate": 0.0002, "epoch": 6.542491268917345, "step": 8430}, {"loss": 0.9011, "grad_norm": 1.7939082384109497, "learning_rate": 0.0002, "epoch": 6.550252231276678, "step": 8440}, {"loss": 0.8896, "grad_norm": 1.4542964696884155, "learning_rate": 0.0002, "epoch": 6.558013193636011, "step": 8450}, {"loss": 0.9393, "grad_norm": 1.5458455085754395, "learning_rate": 0.0002, "epoch": 6.565774155995343, "step": 8460}, {"loss": 0.9028, "grad_norm": 1.550359845161438, "learning_rate": 0.0002, "epoch": 6.573535118354676, "step": 8470}, {"loss": 0.9271, "grad_norm": 1.527757167816162, "learning_rate": 0.0002, "epoch": 6.581296080714009, "step": 8480}, {"loss": 0.966, "grad_norm": 1.4683486223220825, "learning_rate": 0.0002, "epoch": 6.589057043073341, "step": 8490}, {"loss": 0.9079, "grad_norm": 1.5057084560394287, "learning_rate": 0.0002, "epoch": 6.596818005432674, "step": 8500}, {"loss": 0.9235, "grad_norm": 1.648289442062378, "learning_rate": 0.0002, "epoch": 6.604578967792007, "step": 8510}, {"loss": 0.9113, "grad_norm": 1.578914761543274, "learning_rate": 0.0002, "epoch": 6.612339930151339, "step": 8520}, {"loss": 0.8894, "grad_norm": 1.5064080953598022, "learning_rate": 0.0002, "epoch": 6.6201008925106715, "step": 8530}, {"loss": 0.8981, "grad_norm": 1.5717744827270508, "learning_rate": 0.0002, "epoch": 6.6278618548700035, "step": 8540}, {"loss": 0.887, "grad_norm": 1.7954767942428589, "learning_rate": 0.0002, "epoch": 6.635622817229336, "step": 8550}, {"loss": 0.927, "grad_norm": 1.6172343492507935, "learning_rate": 0.0002, "epoch": 6.643383779588669, "step": 8560}, {"loss": 0.9384, "grad_norm": 1.6627886295318604, "learning_rate": 0.0002, "epoch": 6.651144741948001, "step": 8570}, {"loss": 0.959, "grad_norm": 1.5264919996261597, "learning_rate": 0.0002, "epoch": 6.658905704307334, "step": 8580}, {"loss": 0.9103, "grad_norm": 1.609248161315918, "learning_rate": 0.0002, "epoch": 6.666666666666667, "step": 8590}, {"loss": 0.9395, "grad_norm": 1.5474581718444824, "learning_rate": 0.0002, "epoch": 6.674427629025999, "step": 8600}, {"loss": 0.9018, "grad_norm": 1.6294898986816406, "learning_rate": 0.0002, "epoch": 6.682188591385332, "step": 8610}, {"loss": 0.9323, "grad_norm": 1.612615942955017, "learning_rate": 0.0002, "epoch": 6.689949553744665, "step": 8620}, {"loss": 0.9218, "grad_norm": 1.741325855255127, "learning_rate": 0.0002, "epoch": 6.697710516103997, "step": 8630}, {"loss": 1.0475, "grad_norm": 1.5089004039764404, "learning_rate": 0.0002, "epoch": 6.70547147846333, "step": 8640}, {"loss": 1.0009, "grad_norm": 1.4725582599639893, "learning_rate": 0.0002, "epoch": 6.713232440822662, "step": 8650}, {"loss": 0.9818, "grad_norm": 1.6992095708847046, "learning_rate": 0.0002, "epoch": 6.7209934031819945, "step": 8660}, {"loss": 0.9229, "grad_norm": 1.5938470363616943, "learning_rate": 0.0002, "epoch": 6.728754365541327, "step": 8670}, {"loss": 0.9411, "grad_norm": 1.58723783493042, "learning_rate": 0.0002, "epoch": 6.736515327900659, "step": 8680}, {"loss": 0.9738, "grad_norm": 1.514389991760254, "learning_rate": 0.0002, "epoch": 6.744276290259992, "step": 8690}, {"loss": 0.9283, "grad_norm": 1.6799157857894897, "learning_rate": 0.0002, "epoch": 6.752037252619325, "step": 8700}, {"loss": 0.9138, "grad_norm": 1.5436359643936157, "learning_rate": 0.0002, "epoch": 6.759798214978657, "step": 8710}, {"loss": 0.9678, "grad_norm": 1.477137565612793, "learning_rate": 0.0002, "epoch": 6.76755917733799, "step": 8720}, {"loss": 1.0044, "grad_norm": 1.7383503913879395, "learning_rate": 0.0002, "epoch": 6.775320139697323, "step": 8730}, {"loss": 0.9492, "grad_norm": 1.8000324964523315, "learning_rate": 0.0002, "epoch": 6.783081102056655, "step": 8740}, {"loss": 0.8943, "grad_norm": 1.3099453449249268, "learning_rate": 0.0002, "epoch": 6.790842064415988, "step": 8750}, {"loss": 0.9709, "grad_norm": 1.8775172233581543, "learning_rate": 0.0002, "epoch": 6.79860302677532, "step": 8760}, {"loss": 0.9356, "grad_norm": 1.5832085609436035, "learning_rate": 0.0002, "epoch": 6.806363989134653, "step": 8770}, {"loss": 0.9397, "grad_norm": 1.4903252124786377, "learning_rate": 0.0002, "epoch": 6.8141249514939854, "step": 8780}, {"loss": 0.9602, "grad_norm": 1.6360470056533813, "learning_rate": 0.0002, "epoch": 6.821885913853317, "step": 8790}, {"loss": 0.957, "grad_norm": 1.5457707643508911, "learning_rate": 0.0002, "epoch": 6.82964687621265, "step": 8800}, {"loss": 0.943, "grad_norm": 1.5449066162109375, "learning_rate": 0.0002, "epoch": 6.837407838571983, "step": 8810}, {"loss": 1.0007, "grad_norm": 1.618337631225586, "learning_rate": 0.0002, "epoch": 6.845168800931315, "step": 8820}, {"loss": 0.9314, "grad_norm": 1.38296639919281, "learning_rate": 0.0002, "epoch": 6.852929763290648, "step": 8830}, {"loss": 0.9349, "grad_norm": 1.6427991390228271, "learning_rate": 0.0002, "epoch": 6.860690725649981, "step": 8840}, {"loss": 1.0194, "grad_norm": 1.4980270862579346, "learning_rate": 0.0002, "epoch": 6.868451688009313, "step": 8850}, {"loss": 0.9541, "grad_norm": 1.3800020217895508, "learning_rate": 0.0002, "epoch": 6.876212650368646, "step": 8860}, {"loss": 1.0102, "grad_norm": 1.5971838235855103, "learning_rate": 0.0002, "epoch": 6.883973612727978, "step": 8870}, {"loss": 1.0105, "grad_norm": 1.4429489374160767, "learning_rate": 0.0002, "epoch": 6.891734575087311, "step": 8880}, {"loss": 0.9143, "grad_norm": 1.4959166049957275, "learning_rate": 0.0002, "epoch": 6.899495537446644, "step": 8890}, {"loss": 0.9403, "grad_norm": 1.5776222944259644, "learning_rate": 0.0002, "epoch": 6.907256499805976, "step": 8900}, {"loss": 0.9256, "grad_norm": 1.510412573814392, "learning_rate": 0.0002, "epoch": 6.915017462165308, "step": 8910}, {"loss": 1.0095, "grad_norm": 1.7216295003890991, "learning_rate": 0.0002, "epoch": 6.922778424524641, "step": 8920}, {"loss": 0.9464, "grad_norm": 1.830762505531311, "learning_rate": 0.0002, "epoch": 6.930539386883973, "step": 8930}, {"loss": 0.9704, "grad_norm": 1.3472434282302856, "learning_rate": 0.0002, "epoch": 6.938300349243306, "step": 8940}, {"loss": 0.9718, "grad_norm": 1.5748040676116943, "learning_rate": 0.0002, "epoch": 6.946061311602639, "step": 8950}, {"loss": 0.9891, "grad_norm": 1.5317506790161133, "learning_rate": 0.0002, "epoch": 6.953822273961971, "step": 8960}, {"loss": 0.9513, "grad_norm": 1.5565721988677979, "learning_rate": 0.0002, "epoch": 6.961583236321304, "step": 8970}, {"loss": 0.9118, "grad_norm": 1.5288970470428467, "learning_rate": 0.0002, "epoch": 6.969344198680636, "step": 8980}, {"loss": 0.9789, "grad_norm": 1.562624454498291, "learning_rate": 0.0002, "epoch": 6.977105161039969, "step": 8990}, {"loss": 0.9929, "grad_norm": 1.3777633905410767, "learning_rate": 0.0002, "epoch": 6.984866123399302, "step": 9000}, {"loss": 0.9713, "grad_norm": 1.5868972539901733, "learning_rate": 0.0002, "epoch": 6.992627085758635, "step": 9010}]} +{"epoch": 7.996895615056267, "step": 10304, "epoch_duration": 3950.4752156734467, "total_accumulated_duration": 29862.658538103104, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28774.0}, "peak_memory_reserved": {"GPU_0": 28774.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-10000/checkpoint-2577", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0855, "grad_norm": 1.0751162767410278, "learning_rate": 0.0002, "epoch": 0.007760962359332557, "step": 10}, {"loss": 2.4744, "grad_norm": 0.4697345793247223, "learning_rate": 0.0002, "epoch": 0.015521924718665115, "step": 20}, {"loss": 2.193, "grad_norm": 0.5370839238166809, "learning_rate": 0.0002, "epoch": 0.023282887077997673, "step": 30}, {"loss": 2.0599, "grad_norm": 0.46794816851615906, "learning_rate": 0.0002, "epoch": 0.03104384943733023, "step": 40}, {"loss": 1.9354, "grad_norm": 0.44624820351600647, "learning_rate": 0.0002, "epoch": 0.038804811796662786, "step": 50}, {"loss": 1.9319, "grad_norm": 0.3953201472759247, "learning_rate": 0.0002, "epoch": 0.046565774155995346, "step": 60}, {"loss": 1.9099, "grad_norm": 0.3935912549495697, "learning_rate": 0.0002, "epoch": 0.0543267365153279, "step": 70}, {"loss": 1.8795, "grad_norm": 0.4520699381828308, "learning_rate": 0.0002, "epoch": 0.06208769887466046, "step": 80}, {"loss": 1.8354, "grad_norm": 0.3801847994327545, "learning_rate": 0.0002, "epoch": 0.06984866123399301, "step": 90}, {"loss": 1.9053, "grad_norm": 0.4020165205001831, "learning_rate": 0.0002, "epoch": 0.07760962359332557, "step": 100}, {"loss": 1.8779, "grad_norm": 0.3860672116279602, "learning_rate": 0.0002, "epoch": 0.08537058595265813, "step": 110}, {"loss": 1.8731, "grad_norm": 0.3681113123893738, "learning_rate": 0.0002, "epoch": 0.09313154831199069, "step": 120}, {"loss": 1.8157, "grad_norm": 0.3594866991043091, "learning_rate": 0.0002, "epoch": 0.10089251067132324, "step": 130}, {"loss": 1.8266, "grad_norm": 0.3879193663597107, "learning_rate": 0.0002, "epoch": 0.1086534730306558, "step": 140}, {"loss": 1.8818, "grad_norm": 0.3270505666732788, "learning_rate": 0.0002, "epoch": 0.11641443538998836, "step": 150}, {"loss": 1.87, "grad_norm": 0.36824458837509155, "learning_rate": 0.0002, "epoch": 0.12417539774932092, "step": 160}, {"loss": 1.8305, "grad_norm": 0.383882075548172, "learning_rate": 0.0002, "epoch": 0.13193636010865348, "step": 170}, {"loss": 1.8584, "grad_norm": 0.3368665874004364, "learning_rate": 0.0002, "epoch": 0.13969732246798602, "step": 180}, {"loss": 1.7882, "grad_norm": 0.35961097478866577, "learning_rate": 0.0002, "epoch": 0.1474582848273186, "step": 190}, {"loss": 1.8467, "grad_norm": 0.3415963351726532, "learning_rate": 0.0002, "epoch": 0.15521924718665114, "step": 200}, {"loss": 1.8543, "grad_norm": 0.4100632071495056, "learning_rate": 0.0002, "epoch": 0.1629802095459837, "step": 210}, {"loss": 1.8226, "grad_norm": 0.3516307473182678, "learning_rate": 0.0002, "epoch": 0.17074117190531626, "step": 220}, {"loss": 1.7386, "grad_norm": 0.37919050455093384, "learning_rate": 0.0002, "epoch": 0.1785021342646488, "step": 230}, {"loss": 1.7937, "grad_norm": 0.33270683884620667, "learning_rate": 0.0002, "epoch": 0.18626309662398138, "step": 240}, {"loss": 1.7925, "grad_norm": 0.3348783254623413, "learning_rate": 0.0002, "epoch": 0.19402405898331393, "step": 250}, {"loss": 1.7774, "grad_norm": 0.3888475298881531, "learning_rate": 0.0002, "epoch": 0.20178502134264648, "step": 260}, {"loss": 1.8381, "grad_norm": 0.3554602861404419, "learning_rate": 0.0002, "epoch": 0.20954598370197905, "step": 270}, {"loss": 1.8359, "grad_norm": 0.33277708292007446, "learning_rate": 0.0002, "epoch": 0.2173069460613116, "step": 280}, {"loss": 1.7713, "grad_norm": 0.3281584680080414, "learning_rate": 0.0002, "epoch": 0.22506790842064417, "step": 290}, {"loss": 1.8181, "grad_norm": 0.3185969591140747, "learning_rate": 0.0002, "epoch": 0.23282887077997672, "step": 300}, {"loss": 1.8595, "grad_norm": 0.35335442423820496, "learning_rate": 0.0002, "epoch": 0.24058983313930926, "step": 310}, {"loss": 1.87, "grad_norm": 0.3119595944881439, "learning_rate": 0.0002, "epoch": 0.24835079549864184, "step": 320}, {"loss": 1.8357, "grad_norm": 0.36424458026885986, "learning_rate": 0.0002, "epoch": 0.2561117578579744, "step": 330}, {"loss": 1.8003, "grad_norm": 0.3618951141834259, "learning_rate": 0.0002, "epoch": 0.26387272021730696, "step": 340}, {"loss": 1.8221, "grad_norm": 0.312757670879364, "learning_rate": 0.0002, "epoch": 0.2716336825766395, "step": 350}, {"loss": 1.9031, "grad_norm": 0.326016366481781, "learning_rate": 0.0002, "epoch": 0.27939464493597205, "step": 360}, {"loss": 1.8214, "grad_norm": 0.34093883633613586, "learning_rate": 0.0002, "epoch": 0.2871556072953046, "step": 370}, {"loss": 1.7733, "grad_norm": 0.32325029373168945, "learning_rate": 0.0002, "epoch": 0.2949165696546372, "step": 380}, {"loss": 1.842, "grad_norm": 0.34105437994003296, "learning_rate": 0.0002, "epoch": 0.30267753201396974, "step": 390}, {"loss": 1.7926, "grad_norm": 0.32565295696258545, "learning_rate": 0.0002, "epoch": 0.3104384943733023, "step": 400}, {"loss": 1.8031, "grad_norm": 0.32742050290107727, "learning_rate": 0.0002, "epoch": 0.31819945673263483, "step": 410}, {"loss": 1.907, "grad_norm": 0.30233046412467957, "learning_rate": 0.0002, "epoch": 0.3259604190919674, "step": 420}, {"loss": 1.7623, "grad_norm": 0.32419222593307495, "learning_rate": 0.0002, "epoch": 0.3337213814513, "step": 430}, {"loss": 1.865, "grad_norm": 0.3653007745742798, "learning_rate": 0.0002, "epoch": 0.3414823438106325, "step": 440}, {"loss": 1.8044, "grad_norm": 0.31617099046707153, "learning_rate": 0.0002, "epoch": 0.3492433061699651, "step": 450}, {"loss": 1.7677, "grad_norm": 0.3305962085723877, "learning_rate": 0.0002, "epoch": 0.3570042685292976, "step": 460}, {"loss": 1.8155, "grad_norm": 0.3178933262825012, "learning_rate": 0.0002, "epoch": 0.36476523088863017, "step": 470}, {"loss": 1.7485, "grad_norm": 0.37163782119750977, "learning_rate": 0.0002, "epoch": 0.37252619324796277, "step": 480}, {"loss": 1.8804, "grad_norm": 0.469844788312912, "learning_rate": 0.0002, "epoch": 0.3802871556072953, "step": 490}, {"loss": 1.8343, "grad_norm": 0.3409338593482971, "learning_rate": 0.0002, "epoch": 0.38804811796662786, "step": 500}, {"loss": 1.8433, "grad_norm": 0.31943467259407043, "learning_rate": 0.0002, "epoch": 0.3958090803259604, "step": 510}, {"loss": 1.7873, "grad_norm": 0.32293614745140076, "learning_rate": 0.0002, "epoch": 0.40357004268529295, "step": 520}, {"loss": 1.8584, "grad_norm": 0.2994382977485657, "learning_rate": 0.0002, "epoch": 0.41133100504462555, "step": 530}, {"loss": 1.8153, "grad_norm": 0.3273141384124756, "learning_rate": 0.0002, "epoch": 0.4190919674039581, "step": 540}, {"loss": 1.8097, "grad_norm": 0.3020550012588501, "learning_rate": 0.0002, "epoch": 0.42685292976329064, "step": 550}, {"loss": 1.8551, "grad_norm": 0.30113112926483154, "learning_rate": 0.0002, "epoch": 0.4346138921226232, "step": 560}, {"loss": 1.8084, "grad_norm": 0.30274903774261475, "learning_rate": 0.0002, "epoch": 0.44237485448195574, "step": 570}, {"loss": 1.7673, "grad_norm": 0.3231128454208374, "learning_rate": 0.0002, "epoch": 0.45013581684128834, "step": 580}, {"loss": 1.7848, "grad_norm": 0.3255121409893036, "learning_rate": 0.0002, "epoch": 0.4578967792006209, "step": 590}, {"loss": 1.8227, "grad_norm": 0.30147507786750793, "learning_rate": 0.0002, "epoch": 0.46565774155995343, "step": 600}, {"loss": 1.7572, "grad_norm": 0.29781386256217957, "learning_rate": 0.0002, "epoch": 0.473418703919286, "step": 610}, {"loss": 1.8307, "grad_norm": 0.30914685130119324, "learning_rate": 0.0002, "epoch": 0.4811796662786185, "step": 620}, {"loss": 1.805, "grad_norm": 0.3110593855381012, "learning_rate": 0.0002, "epoch": 0.4889406286379511, "step": 630}, {"loss": 1.8228, "grad_norm": 0.3298132121562958, "learning_rate": 0.0002, "epoch": 0.49670159099728367, "step": 640}, {"loss": 1.7816, "grad_norm": 0.322122186422348, "learning_rate": 0.0002, "epoch": 0.5044625533566163, "step": 650}, {"loss": 1.8001, "grad_norm": 0.3504371643066406, "learning_rate": 0.0002, "epoch": 0.5122235157159488, "step": 660}, {"loss": 1.8682, "grad_norm": 0.3102182149887085, "learning_rate": 0.0002, "epoch": 0.5199844780752814, "step": 670}, {"loss": 1.7494, "grad_norm": 0.6113658547401428, "learning_rate": 0.0002, "epoch": 0.5277454404346139, "step": 680}, {"loss": 1.7096, "grad_norm": 0.31841862201690674, "learning_rate": 0.0002, "epoch": 0.5355064027939465, "step": 690}, {"loss": 1.7587, "grad_norm": 0.2830526530742645, "learning_rate": 0.0002, "epoch": 0.543267365153279, "step": 700}, {"loss": 1.7887, "grad_norm": 0.3048769533634186, "learning_rate": 0.0002, "epoch": 0.5510283275126115, "step": 710}, {"loss": 1.8416, "grad_norm": 0.2719033658504486, "learning_rate": 0.0002, "epoch": 0.5587892898719441, "step": 720}, {"loss": 1.786, "grad_norm": 0.3176722526550293, "learning_rate": 0.0002, "epoch": 0.5665502522312766, "step": 730}, {"loss": 1.7127, "grad_norm": 0.32491734623908997, "learning_rate": 0.0002, "epoch": 0.5743112145906092, "step": 740}, {"loss": 1.7892, "grad_norm": 0.32746851444244385, "learning_rate": 0.0002, "epoch": 0.5820721769499418, "step": 750}, {"loss": 1.7811, "grad_norm": 0.3055773973464966, "learning_rate": 0.0002, "epoch": 0.5898331393092744, "step": 760}, {"loss": 1.8597, "grad_norm": 0.30671584606170654, "learning_rate": 0.0002, "epoch": 0.5975941016686069, "step": 770}, {"loss": 1.7728, "grad_norm": 0.28770264983177185, "learning_rate": 0.0002, "epoch": 0.6053550640279395, "step": 780}, {"loss": 1.7025, "grad_norm": 0.2814285457134247, "learning_rate": 0.0002, "epoch": 0.613116026387272, "step": 790}, {"loss": 1.819, "grad_norm": 0.31554412841796875, "learning_rate": 0.0002, "epoch": 0.6208769887466046, "step": 800}, {"loss": 1.8335, "grad_norm": 0.2984226942062378, "learning_rate": 0.0002, "epoch": 0.6286379511059371, "step": 810}, {"loss": 1.7728, "grad_norm": 0.2859906554222107, "learning_rate": 0.0002, "epoch": 0.6363989134652697, "step": 820}, {"loss": 1.7408, "grad_norm": 0.2887928783893585, "learning_rate": 0.0002, "epoch": 0.6441598758246022, "step": 830}, {"loss": 1.8071, "grad_norm": 0.31287339329719543, "learning_rate": 0.0002, "epoch": 0.6519208381839348, "step": 840}, {"loss": 1.7673, "grad_norm": 0.32064181566238403, "learning_rate": 0.0002, "epoch": 0.6596818005432674, "step": 850}, {"loss": 1.7849, "grad_norm": 0.290981650352478, "learning_rate": 0.0002, "epoch": 0.6674427629026, "step": 860}, {"loss": 1.8089, "grad_norm": 0.33060121536254883, "learning_rate": 0.0002, "epoch": 0.6752037252619325, "step": 870}, {"loss": 1.7357, "grad_norm": 0.27032899856567383, "learning_rate": 0.0002, "epoch": 0.682964687621265, "step": 880}, {"loss": 1.8423, "grad_norm": 0.29031234979629517, "learning_rate": 0.0002, "epoch": 0.6907256499805976, "step": 890}, {"loss": 1.835, "grad_norm": 0.2845142185688019, "learning_rate": 0.0002, "epoch": 0.6984866123399301, "step": 900}, {"loss": 1.77, "grad_norm": 0.8638312816619873, "learning_rate": 0.0002, "epoch": 0.7062475746992627, "step": 910}, {"loss": 1.7757, "grad_norm": 0.3086668848991394, "learning_rate": 0.0002, "epoch": 0.7140085370585952, "step": 920}, {"loss": 1.7541, "grad_norm": 0.2724177837371826, "learning_rate": 0.0002, "epoch": 0.7217694994179278, "step": 930}, {"loss": 1.816, "grad_norm": 0.289559006690979, "learning_rate": 0.0002, "epoch": 0.7295304617772603, "step": 940}, {"loss": 1.7654, "grad_norm": 0.3000658452510834, "learning_rate": 0.0002, "epoch": 0.737291424136593, "step": 950}, {"loss": 1.7736, "grad_norm": 0.33544042706489563, "learning_rate": 0.0002, "epoch": 0.7450523864959255, "step": 960}, {"loss": 1.6979, "grad_norm": 0.28593236207962036, "learning_rate": 0.0002, "epoch": 0.7528133488552581, "step": 970}, {"loss": 1.8583, "grad_norm": 0.313634991645813, "learning_rate": 0.0002, "epoch": 0.7605743112145906, "step": 980}, {"loss": 1.7473, "grad_norm": 0.2949385941028595, "learning_rate": 0.0002, "epoch": 0.7683352735739232, "step": 990}, {"loss": 1.8689, "grad_norm": 0.2920108437538147, "learning_rate": 0.0002, "epoch": 0.7760962359332557, "step": 1000}, {"loss": 1.8401, "grad_norm": 0.3245100677013397, "learning_rate": 0.0002, "epoch": 0.7838571982925883, "step": 1010}, {"loss": 1.7109, "grad_norm": 0.3007619380950928, "learning_rate": 0.0002, "epoch": 0.7916181606519208, "step": 1020}, {"loss": 1.7427, "grad_norm": 0.3630852997303009, "learning_rate": 0.0002, "epoch": 0.7993791230112534, "step": 1030}, {"loss": 1.7655, "grad_norm": 0.2856379747390747, "learning_rate": 0.0002, "epoch": 0.8071400853705859, "step": 1040}, {"loss": 1.8371, "grad_norm": 0.32476478815078735, "learning_rate": 0.0002, "epoch": 0.8149010477299186, "step": 1050}, {"loss": 1.8039, "grad_norm": 0.5162565112113953, "learning_rate": 0.0002, "epoch": 0.8226620100892511, "step": 1060}, {"loss": 1.8862, "grad_norm": 0.316496342420578, "learning_rate": 0.0002, "epoch": 0.8304229724485837, "step": 1070}, {"loss": 1.8023, "grad_norm": 0.31977516412734985, "learning_rate": 0.0002, "epoch": 0.8381839348079162, "step": 1080}, {"loss": 1.8547, "grad_norm": 0.269509494304657, "learning_rate": 0.0002, "epoch": 0.8459448971672487, "step": 1090}, {"loss": 1.7811, "grad_norm": 0.31621453166007996, "learning_rate": 0.0002, "epoch": 0.8537058595265813, "step": 1100}, {"loss": 1.739, "grad_norm": 0.2946535050868988, "learning_rate": 0.0002, "epoch": 0.8614668218859138, "step": 1110}, {"loss": 1.7511, "grad_norm": 0.3088909983634949, "learning_rate": 0.0002, "epoch": 0.8692277842452464, "step": 1120}, {"loss": 1.8228, "grad_norm": 0.33033716678619385, "learning_rate": 0.0002, "epoch": 0.8769887466045789, "step": 1130}, {"loss": 1.7912, "grad_norm": 0.2954833507537842, "learning_rate": 0.0002, "epoch": 0.8847497089639115, "step": 1140}, {"loss": 1.8394, "grad_norm": 0.2950248122215271, "learning_rate": 0.0002, "epoch": 0.8925106713232441, "step": 1150}, {"loss": 1.7068, "grad_norm": 0.296661913394928, "learning_rate": 0.0002, "epoch": 0.9002716336825767, "step": 1160}, {"loss": 1.7967, "grad_norm": 0.35451310873031616, "learning_rate": 0.0002, "epoch": 0.9080325960419092, "step": 1170}, {"loss": 1.8202, "grad_norm": 0.32705947756767273, "learning_rate": 0.0002, "epoch": 0.9157935584012418, "step": 1180}, {"loss": 1.7396, "grad_norm": 0.3333960771560669, "learning_rate": 0.0002, "epoch": 0.9235545207605743, "step": 1190}, {"loss": 1.7801, "grad_norm": 0.3042232096195221, "learning_rate": 0.0002, "epoch": 0.9313154831199069, "step": 1200}, {"loss": 1.7586, "grad_norm": 0.281553715467453, "learning_rate": 0.0002, "epoch": 0.9390764454792394, "step": 1210}, {"loss": 1.7953, "grad_norm": 0.3096391558647156, "learning_rate": 0.0002, "epoch": 0.946837407838572, "step": 1220}, {"loss": 1.7401, "grad_norm": 0.2866271734237671, "learning_rate": 0.0002, "epoch": 0.9545983701979045, "step": 1230}, {"loss": 1.7211, "grad_norm": 0.28394097089767456, "learning_rate": 0.0002, "epoch": 0.962359332557237, "step": 1240}, {"loss": 1.7363, "grad_norm": 0.3249266743659973, "learning_rate": 0.0002, "epoch": 0.9701202949165697, "step": 1250}, {"loss": 1.7563, "grad_norm": 0.2896869480609894, "learning_rate": 0.0002, "epoch": 0.9778812572759022, "step": 1260}, {"loss": 1.6389, "grad_norm": 0.29224586486816406, "learning_rate": 0.0002, "epoch": 0.9856422196352348, "step": 1270}, {"loss": 1.7111, "grad_norm": 0.2820223569869995, "learning_rate": 0.0002, "epoch": 0.9934031819945673, "step": 1280}, {"eval_loss": 1.8081045150756836, "eval_runtime": 102.3056, "eval_samples_per_second": 4.956, "eval_steps_per_second": 0.626, "epoch": 0.9996119518820333, "step": 1288}, {"loss": 1.7518, "grad_norm": 0.3282551169395447, "learning_rate": 0.0002, "epoch": 1.0011641443538999, "step": 1290}, {"loss": 1.6806, "grad_norm": 0.30217495560646057, "learning_rate": 0.0002, "epoch": 1.0089251067132325, "step": 1300}, {"loss": 1.6777, "grad_norm": 0.30801767110824585, "learning_rate": 0.0002, "epoch": 1.016686069072565, "step": 1310}, {"loss": 1.7756, "grad_norm": 0.31816792488098145, "learning_rate": 0.0002, "epoch": 1.0244470314318976, "step": 1320}, {"loss": 1.6986, "grad_norm": 0.27794334292411804, "learning_rate": 0.0002, "epoch": 1.03220799379123, "step": 1330}, {"loss": 1.6931, "grad_norm": 0.3018926680088043, "learning_rate": 0.0002, "epoch": 1.0399689561505627, "step": 1340}, {"loss": 1.7033, "grad_norm": 0.3552975356578827, "learning_rate": 0.0002, "epoch": 1.0477299185098952, "step": 1350}, {"loss": 1.6782, "grad_norm": 0.32590144872665405, "learning_rate": 0.0002, "epoch": 1.0554908808692278, "step": 1360}, {"loss": 1.6479, "grad_norm": 0.3435460925102234, "learning_rate": 0.0002, "epoch": 1.0632518432285603, "step": 1370}, {"loss": 1.7451, "grad_norm": 0.35037797689437866, "learning_rate": 0.0002, "epoch": 1.071012805587893, "step": 1380}, {"loss": 1.7868, "grad_norm": 0.31398263573646545, "learning_rate": 0.0002, "epoch": 1.0787737679472253, "step": 1390}, {"loss": 1.6729, "grad_norm": 0.3134010434150696, "learning_rate": 0.0002, "epoch": 1.086534730306558, "step": 1400}, {"loss": 1.751, "grad_norm": 0.4599704444408417, "learning_rate": 0.0002, "epoch": 1.0942956926658907, "step": 1410}, {"loss": 1.6871, "grad_norm": 0.35852891206741333, "learning_rate": 0.0002, "epoch": 1.102056655025223, "step": 1420}, {"loss": 1.7083, "grad_norm": 0.35628634691238403, "learning_rate": 0.0002, "epoch": 1.1098176173845558, "step": 1430}, {"loss": 1.6166, "grad_norm": 0.3769161105155945, "learning_rate": 0.0002, "epoch": 1.1175785797438882, "step": 1440}, {"loss": 1.7344, "grad_norm": 1.3712416887283325, "learning_rate": 0.0002, "epoch": 1.1253395421032208, "step": 1450}, {"loss": 1.6542, "grad_norm": 0.38406670093536377, "learning_rate": 0.0002, "epoch": 1.1331005044625533, "step": 1460}, {"loss": 1.7104, "grad_norm": 0.3402116000652313, "learning_rate": 0.0002, "epoch": 1.140861466821886, "step": 1470}, {"loss": 1.7074, "grad_norm": 0.341189444065094, "learning_rate": 0.0002, "epoch": 1.1486224291812184, "step": 1480}, {"loss": 1.6468, "grad_norm": 0.36629995703697205, "learning_rate": 0.0002, "epoch": 1.156383391540551, "step": 1490}, {"loss": 1.6952, "grad_norm": 0.3499569296836853, "learning_rate": 0.0002, "epoch": 1.1641443538998835, "step": 1500}, {"loss": 1.6625, "grad_norm": 0.3663063943386078, "learning_rate": 0.0002, "epoch": 1.1719053162592161, "step": 1510}, {"loss": 1.7533, "grad_norm": 0.34851500391960144, "learning_rate": 0.0002, "epoch": 1.1796662786185488, "step": 1520}, {"loss": 1.6092, "grad_norm": 0.35071656107902527, "learning_rate": 0.0002, "epoch": 1.1874272409778812, "step": 1530}, {"loss": 1.7206, "grad_norm": 0.42783796787261963, "learning_rate": 0.0002, "epoch": 1.1951882033372139, "step": 1540}, {"loss": 1.7499, "grad_norm": 0.31830692291259766, "learning_rate": 0.0002, "epoch": 1.2029491656965463, "step": 1550}, {"loss": 1.7372, "grad_norm": 0.3597424626350403, "learning_rate": 0.0002, "epoch": 1.210710128055879, "step": 1560}, {"loss": 1.6386, "grad_norm": 0.35233765840530396, "learning_rate": 0.0002, "epoch": 1.2184710904152114, "step": 1570}, {"loss": 1.6766, "grad_norm": 0.35942912101745605, "learning_rate": 0.0002, "epoch": 1.226232052774544, "step": 1580}, {"loss": 1.6598, "grad_norm": 0.36159393191337585, "learning_rate": 0.0002, "epoch": 1.2339930151338767, "step": 1590}, {"loss": 1.6697, "grad_norm": 0.3328469693660736, "learning_rate": 0.0002, "epoch": 1.2417539774932091, "step": 1600}, {"loss": 1.7594, "grad_norm": 0.3089476525783539, "learning_rate": 0.0002, "epoch": 1.2495149398525418, "step": 1610}, {"loss": 1.6805, "grad_norm": 0.30947765707969666, "learning_rate": 0.0002, "epoch": 1.2572759022118742, "step": 1620}, {"loss": 1.6899, "grad_norm": 0.32154011726379395, "learning_rate": 0.0002, "epoch": 1.265036864571207, "step": 1630}, {"loss": 1.6621, "grad_norm": 0.3480297923088074, "learning_rate": 0.0002, "epoch": 1.2727978269305393, "step": 1640}, {"loss": 1.7087, "grad_norm": 0.39471694827079773, "learning_rate": 0.0002, "epoch": 1.280558789289872, "step": 1650}, {"loss": 1.7608, "grad_norm": 0.35728853940963745, "learning_rate": 0.0002, "epoch": 1.2883197516492044, "step": 1660}, {"loss": 1.7008, "grad_norm": 0.35223081707954407, "learning_rate": 0.0002, "epoch": 1.296080714008537, "step": 1670}, {"loss": 1.7253, "grad_norm": 0.3588867485523224, "learning_rate": 0.0002, "epoch": 1.3038416763678695, "step": 1680}, {"loss": 1.6505, "grad_norm": 0.3528042733669281, "learning_rate": 0.0002, "epoch": 1.3116026387272022, "step": 1690}, {"loss": 1.6945, "grad_norm": 0.35975801944732666, "learning_rate": 0.0002, "epoch": 1.3193636010865348, "step": 1700}, {"loss": 1.6631, "grad_norm": 0.36691880226135254, "learning_rate": 0.0002, "epoch": 1.3271245634458673, "step": 1710}, {"loss": 1.7593, "grad_norm": 0.3787977695465088, "learning_rate": 0.0002, "epoch": 1.3348855258052, "step": 1720}, {"loss": 1.7697, "grad_norm": 0.36614933609962463, "learning_rate": 0.0002, "epoch": 1.3426464881645324, "step": 1730}, {"loss": 1.6487, "grad_norm": 0.3484745919704437, "learning_rate": 0.0002, "epoch": 1.350407450523865, "step": 1740}, {"loss": 1.7054, "grad_norm": 0.36905673146247864, "learning_rate": 0.0002, "epoch": 1.3581684128831975, "step": 1750}, {"loss": 1.7679, "grad_norm": 0.41564738750457764, "learning_rate": 0.0002, "epoch": 1.36592937524253, "step": 1760}, {"loss": 1.6634, "grad_norm": 0.3345205783843994, "learning_rate": 0.0002, "epoch": 1.3736903376018628, "step": 1770}, {"loss": 1.7275, "grad_norm": 0.34926071763038635, "learning_rate": 0.0002, "epoch": 1.3814512999611952, "step": 1780}, {"loss": 1.685, "grad_norm": 0.42004233598709106, "learning_rate": 0.0002, "epoch": 1.3892122623205276, "step": 1790}, {"loss": 1.666, "grad_norm": 0.3576236963272095, "learning_rate": 0.0002, "epoch": 1.3969732246798603, "step": 1800}, {"loss": 1.8516, "grad_norm": 0.3586704432964325, "learning_rate": 0.0002, "epoch": 1.404734187039193, "step": 1810}, {"loss": 1.6171, "grad_norm": 0.3943439722061157, "learning_rate": 0.0002, "epoch": 1.4124951493985254, "step": 1820}, {"loss": 1.6865, "grad_norm": 0.3484877049922943, "learning_rate": 0.0002, "epoch": 1.420256111757858, "step": 1830}, {"loss": 1.7205, "grad_norm": 0.3344518840312958, "learning_rate": 0.0002, "epoch": 1.4280170741171905, "step": 1840}, {"loss": 1.6999, "grad_norm": 0.4345698356628418, "learning_rate": 0.0002, "epoch": 1.4357780364765231, "step": 1850}, {"loss": 1.6855, "grad_norm": 0.5525162220001221, "learning_rate": 0.0002, "epoch": 1.4435389988358556, "step": 1860}, {"loss": 1.7143, "grad_norm": 0.37194496393203735, "learning_rate": 0.0002, "epoch": 1.4512999611951882, "step": 1870}, {"loss": 1.7623, "grad_norm": 0.34570157527923584, "learning_rate": 0.0002, "epoch": 1.4590609235545209, "step": 1880}, {"loss": 1.7, "grad_norm": 0.3512282073497772, "learning_rate": 0.0002, "epoch": 1.4668218859138533, "step": 1890}, {"loss": 1.7225, "grad_norm": 0.3443922996520996, "learning_rate": 0.0002, "epoch": 1.4745828482731858, "step": 1900}, {"loss": 1.7393, "grad_norm": 0.3812018036842346, "learning_rate": 0.0002, "epoch": 1.4823438106325184, "step": 1910}, {"loss": 1.7277, "grad_norm": 0.39263492822647095, "learning_rate": 0.0002, "epoch": 1.490104772991851, "step": 1920}, {"loss": 1.6829, "grad_norm": 0.3146156072616577, "learning_rate": 0.0002, "epoch": 1.4978657353511835, "step": 1930}, {"loss": 1.6881, "grad_norm": 0.3653988540172577, "learning_rate": 0.0002, "epoch": 1.505626697710516, "step": 1940}, {"loss": 1.7064, "grad_norm": 0.3966596722602844, "learning_rate": 0.0002, "epoch": 1.5133876600698488, "step": 1950}, {"loss": 1.6942, "grad_norm": 0.3441697359085083, "learning_rate": 0.0002, "epoch": 1.5211486224291813, "step": 1960}, {"loss": 1.7175, "grad_norm": 0.3328564465045929, "learning_rate": 0.0002, "epoch": 1.5289095847885137, "step": 1970}, {"loss": 1.7394, "grad_norm": 0.34068772196769714, "learning_rate": 0.0002, "epoch": 1.5366705471478463, "step": 1980}, {"loss": 1.7016, "grad_norm": 0.3559795916080475, "learning_rate": 0.0002, "epoch": 1.544431509507179, "step": 1990}, {"loss": 1.7102, "grad_norm": 0.37888768315315247, "learning_rate": 0.0002, "epoch": 1.5521924718665114, "step": 2000}, {"loss": 1.7094, "grad_norm": 0.36128363013267517, "learning_rate": 0.0002, "epoch": 1.5599534342258439, "step": 2010}, {"loss": 1.6407, "grad_norm": 0.3643714487552643, "learning_rate": 0.0002, "epoch": 1.5677143965851765, "step": 2020}, {"loss": 1.6777, "grad_norm": 0.3863612115383148, "learning_rate": 0.0002, "epoch": 1.5754753589445092, "step": 2030}, {"loss": 1.6575, "grad_norm": 0.32831457257270813, "learning_rate": 0.0002, "epoch": 1.5832363213038416, "step": 2040}, {"loss": 1.7404, "grad_norm": 0.36098113656044006, "learning_rate": 0.0002, "epoch": 1.5909972836631743, "step": 2050}, {"loss": 1.7065, "grad_norm": 1.1079334020614624, "learning_rate": 0.0002, "epoch": 1.598758246022507, "step": 2060}, {"loss": 1.6824, "grad_norm": 0.35615381598472595, "learning_rate": 0.0002, "epoch": 1.6065192083818394, "step": 2070}, {"loss": 1.7262, "grad_norm": 0.369711309671402, "learning_rate": 0.0002, "epoch": 1.6142801707411718, "step": 2080}, {"loss": 1.6995, "grad_norm": 0.390658438205719, "learning_rate": 0.0002, "epoch": 1.6220411331005045, "step": 2090}, {"loss": 1.6996, "grad_norm": 0.3422999382019043, "learning_rate": 0.0002, "epoch": 1.6298020954598371, "step": 2100}, {"loss": 1.7135, "grad_norm": 0.372475266456604, "learning_rate": 0.0002, "epoch": 1.6375630578191696, "step": 2110}, {"loss": 1.7216, "grad_norm": 0.35660576820373535, "learning_rate": 0.0002, "epoch": 1.645324020178502, "step": 2120}, {"loss": 1.6991, "grad_norm": 0.35754942893981934, "learning_rate": 0.0002, "epoch": 1.6530849825378346, "step": 2130}, {"loss": 1.6779, "grad_norm": 0.34572410583496094, "learning_rate": 0.0002, "epoch": 1.6608459448971673, "step": 2140}, {"loss": 1.6707, "grad_norm": 0.42059701681137085, "learning_rate": 0.0002, "epoch": 1.6686069072564997, "step": 2150}, {"loss": 1.6782, "grad_norm": 0.35200759768486023, "learning_rate": 0.0002, "epoch": 1.6763678696158324, "step": 2160}, {"loss": 1.6869, "grad_norm": 0.3704029321670532, "learning_rate": 0.0002, "epoch": 1.684128831975165, "step": 2170}, {"loss": 1.7192, "grad_norm": 0.40450501441955566, "learning_rate": 0.0002, "epoch": 1.6918897943344975, "step": 2180}, {"loss": 1.6228, "grad_norm": 0.362966924905777, "learning_rate": 0.0002, "epoch": 1.69965075669383, "step": 2190}, {"loss": 1.6935, "grad_norm": 0.36586204171180725, "learning_rate": 0.0002, "epoch": 1.7074117190531626, "step": 2200}, {"loss": 1.6088, "grad_norm": 0.3295372426509857, "learning_rate": 0.0002, "epoch": 1.7151726814124952, "step": 2210}, {"loss": 1.7844, "grad_norm": 0.3892575800418854, "learning_rate": 0.0002, "epoch": 1.7229336437718277, "step": 2220}, {"loss": 1.7805, "grad_norm": 0.34712135791778564, "learning_rate": 0.0002, "epoch": 1.73069460613116, "step": 2230}, {"loss": 1.7353, "grad_norm": 0.34801796078681946, "learning_rate": 0.0002, "epoch": 1.738455568490493, "step": 2240}, {"loss": 1.7009, "grad_norm": 0.3822397291660309, "learning_rate": 0.0002, "epoch": 1.7462165308498254, "step": 2250}, {"loss": 1.6546, "grad_norm": 0.38933250308036804, "learning_rate": 0.0002, "epoch": 1.7539774932091579, "step": 2260}, {"loss": 1.7245, "grad_norm": 0.3798373341560364, "learning_rate": 0.0002, "epoch": 1.7617384555684905, "step": 2270}, {"loss": 1.6508, "grad_norm": 0.35151317715644836, "learning_rate": 0.0002, "epoch": 1.7694994179278232, "step": 2280}, {"loss": 1.6894, "grad_norm": 0.44981494545936584, "learning_rate": 0.0002, "epoch": 1.7772603802871556, "step": 2290}, {"loss": 1.7271, "grad_norm": 0.3992624580860138, "learning_rate": 0.0002, "epoch": 1.785021342646488, "step": 2300}, {"loss": 1.7252, "grad_norm": 0.3772512376308441, "learning_rate": 0.0002, "epoch": 1.7927823050058207, "step": 2310}, {"loss": 1.7057, "grad_norm": 0.3511589467525482, "learning_rate": 0.0002, "epoch": 1.8005432673651534, "step": 2320}, {"loss": 1.764, "grad_norm": 0.3805285394191742, "learning_rate": 0.0002, "epoch": 1.8083042297244858, "step": 2330}, {"loss": 1.6986, "grad_norm": 0.3792071044445038, "learning_rate": 0.0002, "epoch": 1.8160651920838184, "step": 2340}, {"loss": 1.7759, "grad_norm": 0.36430829763412476, "learning_rate": 0.0002, "epoch": 1.823826154443151, "step": 2350}, {"loss": 1.6773, "grad_norm": 0.36502477526664734, "learning_rate": 0.0002, "epoch": 1.8315871168024835, "step": 2360}, {"loss": 1.8072, "grad_norm": 0.35015153884887695, "learning_rate": 0.0002, "epoch": 1.839348079161816, "step": 2370}, {"loss": 1.7734, "grad_norm": 0.3710903823375702, "learning_rate": 0.0002, "epoch": 1.8471090415211486, "step": 2380}, {"loss": 1.6737, "grad_norm": 0.3542828857898712, "learning_rate": 0.0002, "epoch": 1.8548700038804813, "step": 2390}, {"loss": 1.6783, "grad_norm": 0.35467568039894104, "learning_rate": 0.0002, "epoch": 1.8626309662398137, "step": 2400}, {"loss": 1.7773, "grad_norm": 0.3638560473918915, "learning_rate": 0.0002, "epoch": 1.8703919285991462, "step": 2410}, {"loss": 1.7019, "grad_norm": 0.3823298215866089, "learning_rate": 0.0002, "epoch": 1.8781528909584788, "step": 2420}, {"loss": 1.6935, "grad_norm": 0.3926416337490082, "learning_rate": 0.0002, "epoch": 1.8859138533178115, "step": 2430}, {"loss": 1.71, "grad_norm": 0.3608079254627228, "learning_rate": 0.0002, "epoch": 1.893674815677144, "step": 2440}, {"loss": 1.6654, "grad_norm": 0.3426613509654999, "learning_rate": 0.0002, "epoch": 1.9014357780364766, "step": 2450}, {"loss": 1.6892, "grad_norm": 0.3522338569164276, "learning_rate": 0.0002, "epoch": 1.9091967403958092, "step": 2460}, {"loss": 1.7307, "grad_norm": 0.3608049154281616, "learning_rate": 0.0002, "epoch": 1.9169577027551417, "step": 2470}, {"loss": 1.6823, "grad_norm": 0.3849755525588989, "learning_rate": 0.0002, "epoch": 1.924718665114474, "step": 2480}, {"loss": 1.7518, "grad_norm": 0.4154011011123657, "learning_rate": 0.0002, "epoch": 1.9324796274738067, "step": 2490}, {"loss": 1.7381, "grad_norm": 0.3602796792984009, "learning_rate": 0.0002, "epoch": 1.9402405898331394, "step": 2500}, {"loss": 1.7843, "grad_norm": 0.3702992796897888, "learning_rate": 0.0002, "epoch": 1.9480015521924718, "step": 2510}, {"loss": 1.6669, "grad_norm": 0.3657735288143158, "learning_rate": 0.0002, "epoch": 1.9557625145518043, "step": 2520}, {"loss": 1.5964, "grad_norm": 0.41031739115715027, "learning_rate": 0.0002, "epoch": 1.963523476911137, "step": 2530}, {"loss": 1.6745, "grad_norm": 0.34578680992126465, "learning_rate": 0.0002, "epoch": 1.9712844392704696, "step": 2540}, {"loss": 1.723, "grad_norm": 0.3361521065235138, "learning_rate": 0.0002, "epoch": 1.979045401629802, "step": 2550}, {"loss": 1.6868, "grad_norm": 0.34342363476753235, "learning_rate": 0.0002, "epoch": 1.9868063639891347, "step": 2560}, {"loss": 1.6577, "grad_norm": 0.32954007387161255, "learning_rate": 0.0002, "epoch": 1.9945673263484673, "step": 2570}, {"eval_loss": 1.8068748712539673, "eval_runtime": 105.5885, "eval_samples_per_second": 4.802, "eval_steps_per_second": 0.606, "epoch": 2.0, "step": 2577}, {"loss": 1.634, "grad_norm": 0.336302250623703, "learning_rate": 0.0002, "epoch": 2.0023282887077998, "step": 2580}, {"loss": 1.612, "grad_norm": 0.3627048432826996, "learning_rate": 0.0002, "epoch": 2.010089251067132, "step": 2590}, {"loss": 1.4908, "grad_norm": 0.38406702876091003, "learning_rate": 0.0002, "epoch": 2.017850213426465, "step": 2600}, {"loss": 1.5368, "grad_norm": 0.5326781272888184, "learning_rate": 0.0002, "epoch": 2.0256111757857975, "step": 2610}, {"loss": 1.5727, "grad_norm": 0.4774554967880249, "learning_rate": 0.0002, "epoch": 2.03337213814513, "step": 2620}, {"loss": 1.5422, "grad_norm": 0.4251810312271118, "learning_rate": 0.0002, "epoch": 2.0411331005044624, "step": 2630}, {"loss": 1.5152, "grad_norm": 0.4693007171154022, "learning_rate": 0.0002, "epoch": 2.0488940628637953, "step": 2640}, {"loss": 1.6137, "grad_norm": 0.46371519565582275, "learning_rate": 0.0002, "epoch": 2.0566550252231277, "step": 2650}, {"loss": 1.6304, "grad_norm": 0.46652570366859436, "learning_rate": 0.0002, "epoch": 2.06441598758246, "step": 2660}, {"loss": 1.6022, "grad_norm": 0.45200315117836, "learning_rate": 0.0002, "epoch": 2.0721769499417926, "step": 2670}, {"loss": 1.5358, "grad_norm": 0.42905205488204956, "learning_rate": 0.0002, "epoch": 2.0799379123011255, "step": 2680}, {"loss": 1.5401, "grad_norm": 0.44509148597717285, "learning_rate": 0.0002, "epoch": 2.087698874660458, "step": 2690}, {"loss": 1.5303, "grad_norm": 0.4445319175720215, "learning_rate": 0.0002, "epoch": 2.0954598370197903, "step": 2700}, {"loss": 1.5701, "grad_norm": 0.46825504302978516, "learning_rate": 0.0002, "epoch": 2.103220799379123, "step": 2710}, {"loss": 1.5751, "grad_norm": 0.4623856842517853, "learning_rate": 0.0002, "epoch": 2.1109817617384556, "step": 2720}, {"loss": 1.5601, "grad_norm": 0.4833452105522156, "learning_rate": 0.0002, "epoch": 2.118742724097788, "step": 2730}, {"loss": 1.5997, "grad_norm": 0.4582686722278595, "learning_rate": 0.0002, "epoch": 2.1265036864571205, "step": 2740}, {"loss": 1.5801, "grad_norm": 0.47587934136390686, "learning_rate": 0.0002, "epoch": 2.1342646488164534, "step": 2750}, {"loss": 1.594, "grad_norm": 0.4602217972278595, "learning_rate": 0.0002, "epoch": 2.142025611175786, "step": 2760}, {"loss": 1.5271, "grad_norm": 0.47501352429389954, "learning_rate": 0.0002, "epoch": 2.1497865735351183, "step": 2770}, {"loss": 1.4862, "grad_norm": 0.5078499913215637, "learning_rate": 0.0002, "epoch": 2.1575475358944507, "step": 2780}, {"loss": 1.6236, "grad_norm": 0.497704416513443, "learning_rate": 0.0002, "epoch": 2.1653084982537836, "step": 2790}, {"loss": 1.5597, "grad_norm": 0.5435971617698669, "learning_rate": 0.0002, "epoch": 2.173069460613116, "step": 2800}, {"loss": 1.5926, "grad_norm": 0.5172356367111206, "learning_rate": 0.0002, "epoch": 2.1808304229724484, "step": 2810}, {"loss": 1.5202, "grad_norm": 0.44063422083854675, "learning_rate": 0.0002, "epoch": 2.1885913853317813, "step": 2820}, {"loss": 1.6041, "grad_norm": 0.5079569220542908, "learning_rate": 0.0002, "epoch": 2.1963523476911138, "step": 2830}, {"loss": 1.5915, "grad_norm": 0.45658132433891296, "learning_rate": 0.0002, "epoch": 2.204113310050446, "step": 2840}, {"loss": 1.5546, "grad_norm": 0.5103023648262024, "learning_rate": 0.0002, "epoch": 2.2118742724097786, "step": 2850}, {"loss": 1.6197, "grad_norm": 0.4882226288318634, "learning_rate": 0.0002, "epoch": 2.2196352347691115, "step": 2860}, {"loss": 1.5996, "grad_norm": 0.5087296962738037, "learning_rate": 0.0002, "epoch": 2.227396197128444, "step": 2870}, {"loss": 1.5451, "grad_norm": 0.45293712615966797, "learning_rate": 0.0002, "epoch": 2.2351571594877764, "step": 2880}, {"loss": 1.6214, "grad_norm": 0.5120379328727722, "learning_rate": 0.0002, "epoch": 2.242918121847109, "step": 2890}, {"loss": 1.5273, "grad_norm": 0.47126415371894836, "learning_rate": 0.0002, "epoch": 2.2506790842064417, "step": 2900}, {"loss": 1.612, "grad_norm": 0.44005846977233887, "learning_rate": 0.0002, "epoch": 2.258440046565774, "step": 2910}, {"loss": 1.6023, "grad_norm": 0.46476176381111145, "learning_rate": 0.0002, "epoch": 2.2662010089251066, "step": 2920}, {"loss": 1.6417, "grad_norm": 0.48051515221595764, "learning_rate": 0.0002, "epoch": 2.2739619712844394, "step": 2930}, {"loss": 1.587, "grad_norm": 0.480069637298584, "learning_rate": 0.0002, "epoch": 2.281722933643772, "step": 2940}, {"loss": 1.5747, "grad_norm": 0.5122102499008179, "learning_rate": 0.0002, "epoch": 2.2894838960031043, "step": 2950}, {"loss": 1.5183, "grad_norm": 0.48879891633987427, "learning_rate": 0.0002, "epoch": 2.2972448583624367, "step": 2960}, {"loss": 1.5483, "grad_norm": 0.4973136782646179, "learning_rate": 0.0002, "epoch": 2.3050058207217696, "step": 2970}, {"loss": 1.677, "grad_norm": 0.5522695183753967, "learning_rate": 0.0002, "epoch": 2.312766783081102, "step": 2980}, {"loss": 1.5946, "grad_norm": 0.5220217704772949, "learning_rate": 0.0002, "epoch": 2.3205277454404345, "step": 2990}, {"loss": 1.6299, "grad_norm": 0.4978662431240082, "learning_rate": 0.0002, "epoch": 2.328288707799767, "step": 3000}, {"loss": 1.5498, "grad_norm": 0.554053544998169, "learning_rate": 0.0002, "epoch": 2.3360496701591, "step": 3010}, {"loss": 1.5356, "grad_norm": 0.4703886806964874, "learning_rate": 0.0002, "epoch": 2.3438106325184322, "step": 3020}, {"loss": 1.5418, "grad_norm": 0.5074123740196228, "learning_rate": 0.0002, "epoch": 2.3515715948777647, "step": 3030}, {"loss": 1.6873, "grad_norm": 0.5088278651237488, "learning_rate": 0.0002, "epoch": 2.3593325572370976, "step": 3040}, {"loss": 1.5249, "grad_norm": 0.4752114415168762, "learning_rate": 0.0002, "epoch": 2.36709351959643, "step": 3050}, {"loss": 1.5353, "grad_norm": 0.5121659636497498, "learning_rate": 0.0002, "epoch": 2.3748544819557624, "step": 3060}, {"loss": 1.6426, "grad_norm": 0.48649218678474426, "learning_rate": 0.0002, "epoch": 2.3826154443150953, "step": 3070}, {"loss": 1.6136, "grad_norm": 0.5209488868713379, "learning_rate": 0.0002, "epoch": 2.3903764066744277, "step": 3080}, {"loss": 1.597, "grad_norm": 0.5110517740249634, "learning_rate": 0.0002, "epoch": 2.39813736903376, "step": 3090}, {"loss": 1.5773, "grad_norm": 0.5609337091445923, "learning_rate": 0.0002, "epoch": 2.4058983313930926, "step": 3100}, {"loss": 1.5438, "grad_norm": 0.5191826224327087, "learning_rate": 0.0002, "epoch": 2.4136592937524255, "step": 3110}, {"loss": 1.6347, "grad_norm": 0.4876069724559784, "learning_rate": 0.0002, "epoch": 2.421420256111758, "step": 3120}, {"loss": 1.5565, "grad_norm": 0.4713933765888214, "learning_rate": 0.0002, "epoch": 2.4291812184710904, "step": 3130}, {"loss": 1.6388, "grad_norm": 0.5102227330207825, "learning_rate": 0.0002, "epoch": 2.436942180830423, "step": 3140}, {"loss": 1.5667, "grad_norm": 0.44546666741371155, "learning_rate": 0.0002, "epoch": 2.4447031431897557, "step": 3150}, {"loss": 1.5973, "grad_norm": 0.5167558193206787, "learning_rate": 0.0002, "epoch": 2.452464105549088, "step": 3160}, {"loss": 1.5673, "grad_norm": 0.5226958990097046, "learning_rate": 0.0002, "epoch": 2.4602250679084205, "step": 3170}, {"loss": 1.5758, "grad_norm": 0.4751799702644348, "learning_rate": 0.0002, "epoch": 2.4679860302677534, "step": 3180}, {"loss": 1.6234, "grad_norm": 0.4744729697704315, "learning_rate": 0.0002, "epoch": 2.475746992627086, "step": 3190}, {"loss": 1.5661, "grad_norm": 0.5203230381011963, "learning_rate": 0.0002, "epoch": 2.4835079549864183, "step": 3200}, {"loss": 1.493, "grad_norm": 0.47209781408309937, "learning_rate": 0.0002, "epoch": 2.4912689173457507, "step": 3210}, {"loss": 1.6415, "grad_norm": 0.5241674780845642, "learning_rate": 0.0002, "epoch": 2.4990298797050836, "step": 3220}, {"loss": 1.6324, "grad_norm": 0.5152244567871094, "learning_rate": 0.0002, "epoch": 2.506790842064416, "step": 3230}, {"loss": 1.6248, "grad_norm": 0.5216741561889648, "learning_rate": 0.0002, "epoch": 2.5145518044237485, "step": 3240}, {"loss": 1.5668, "grad_norm": 0.4953259527683258, "learning_rate": 0.0002, "epoch": 2.522312766783081, "step": 3250}, {"loss": 1.666, "grad_norm": 0.5973829030990601, "learning_rate": 0.0002, "epoch": 2.530073729142414, "step": 3260}, {"loss": 1.5295, "grad_norm": 0.48804202675819397, "learning_rate": 0.0002, "epoch": 2.5378346915017462, "step": 3270}, {"loss": 1.4954, "grad_norm": 0.5334644317626953, "learning_rate": 0.0002, "epoch": 2.5455956538610787, "step": 3280}, {"loss": 1.5814, "grad_norm": 0.46873313188552856, "learning_rate": 0.0002, "epoch": 2.5533566162204115, "step": 3290}, {"loss": 1.5362, "grad_norm": 0.4282589554786682, "learning_rate": 0.0002, "epoch": 2.561117578579744, "step": 3300}, {"loss": 1.6278, "grad_norm": 0.4848293960094452, "learning_rate": 0.0002, "epoch": 2.5688785409390764, "step": 3310}, {"loss": 1.6308, "grad_norm": 0.5093745589256287, "learning_rate": 0.0002, "epoch": 2.576639503298409, "step": 3320}, {"loss": 1.6375, "grad_norm": 0.5084842443466187, "learning_rate": 0.0002, "epoch": 2.5844004656577413, "step": 3330}, {"loss": 1.6168, "grad_norm": 0.4696281850337982, "learning_rate": 0.0002, "epoch": 2.592161428017074, "step": 3340}, {"loss": 1.5359, "grad_norm": 0.5767765641212463, "learning_rate": 0.0002, "epoch": 2.5999223903764066, "step": 3350}, {"loss": 1.6097, "grad_norm": 0.47300875186920166, "learning_rate": 0.0002, "epoch": 2.607683352735739, "step": 3360}, {"loss": 1.6138, "grad_norm": 0.4809158146381378, "learning_rate": 0.0002, "epoch": 2.615444315095072, "step": 3370}, {"loss": 1.4952, "grad_norm": 0.5141063928604126, "learning_rate": 0.0002, "epoch": 2.6232052774544043, "step": 3380}, {"loss": 1.5784, "grad_norm": 0.4832935035228729, "learning_rate": 0.0002, "epoch": 2.630966239813737, "step": 3390}, {"loss": 1.5796, "grad_norm": 0.5044625401496887, "learning_rate": 0.0002, "epoch": 2.6387272021730697, "step": 3400}, {"loss": 1.6202, "grad_norm": 0.5287680625915527, "learning_rate": 0.0002, "epoch": 2.646488164532402, "step": 3410}, {"loss": 1.5423, "grad_norm": 0.5306379795074463, "learning_rate": 0.0002, "epoch": 2.6542491268917345, "step": 3420}, {"loss": 1.5264, "grad_norm": 0.5849291682243347, "learning_rate": 0.0002, "epoch": 2.662010089251067, "step": 3430}, {"loss": 1.5937, "grad_norm": 0.7951080799102783, "learning_rate": 0.0002, "epoch": 2.6697710516104, "step": 3440}, {"loss": 1.5791, "grad_norm": 0.48087653517723083, "learning_rate": 0.0002, "epoch": 2.6775320139697323, "step": 3450}, {"loss": 1.6769, "grad_norm": 0.5396431684494019, "learning_rate": 0.0002, "epoch": 2.6852929763290647, "step": 3460}, {"loss": 1.606, "grad_norm": 0.5481634736061096, "learning_rate": 0.0002, "epoch": 2.693053938688397, "step": 3470}, {"loss": 1.6436, "grad_norm": 0.5068731307983398, "learning_rate": 0.0002, "epoch": 2.70081490104773, "step": 3480}, {"loss": 1.5738, "grad_norm": 0.5759826898574829, "learning_rate": 0.0002, "epoch": 2.7085758634070625, "step": 3490}, {"loss": 1.596, "grad_norm": 0.7253932952880859, "learning_rate": 0.0002, "epoch": 2.716336825766395, "step": 3500}, {"loss": 1.5791, "grad_norm": 0.527745246887207, "learning_rate": 0.0002, "epoch": 2.724097788125728, "step": 3510}, {"loss": 1.5874, "grad_norm": 0.5279242396354675, "learning_rate": 0.0002, "epoch": 2.73185875048506, "step": 3520}, {"loss": 1.6768, "grad_norm": 0.5047839283943176, "learning_rate": 0.0002, "epoch": 2.7396197128443927, "step": 3530}, {"loss": 1.5517, "grad_norm": 0.5430883169174194, "learning_rate": 0.0002, "epoch": 2.7473806752037255, "step": 3540}, {"loss": 1.5624, "grad_norm": 0.4496723711490631, "learning_rate": 0.0002, "epoch": 2.755141637563058, "step": 3550}, {"loss": 1.5789, "grad_norm": 0.5063338875770569, "learning_rate": 0.0002, "epoch": 2.7629025999223904, "step": 3560}, {"loss": 1.52, "grad_norm": 0.4619026780128479, "learning_rate": 0.0002, "epoch": 2.770663562281723, "step": 3570}, {"loss": 1.5793, "grad_norm": 0.4753304123878479, "learning_rate": 0.0002, "epoch": 2.7784245246410553, "step": 3580}, {"loss": 1.5715, "grad_norm": 0.5422708988189697, "learning_rate": 0.0002, "epoch": 2.786185487000388, "step": 3590}, {"loss": 1.5926, "grad_norm": 0.4756578803062439, "learning_rate": 0.0002, "epoch": 2.7939464493597206, "step": 3600}, {"loss": 1.5358, "grad_norm": 0.5057567358016968, "learning_rate": 0.0002, "epoch": 2.801707411719053, "step": 3610}, {"loss": 1.6131, "grad_norm": 0.5410919785499573, "learning_rate": 0.0002, "epoch": 2.809468374078386, "step": 3620}, {"loss": 1.5573, "grad_norm": 0.4958136975765228, "learning_rate": 0.0002, "epoch": 2.8172293364377183, "step": 3630}, {"loss": 1.6324, "grad_norm": 0.454527348279953, "learning_rate": 0.0002, "epoch": 2.8249902987970508, "step": 3640}, {"loss": 1.5582, "grad_norm": 0.5092706084251404, "learning_rate": 0.0002, "epoch": 2.8327512611563836, "step": 3650}, {"loss": 1.5893, "grad_norm": 0.5314022302627563, "learning_rate": 0.0002, "epoch": 2.840512223515716, "step": 3660}, {"loss": 1.588, "grad_norm": 0.5028239488601685, "learning_rate": 0.0002, "epoch": 2.8482731858750485, "step": 3670}, {"loss": 1.5751, "grad_norm": 0.5127444863319397, "learning_rate": 0.0002, "epoch": 2.856034148234381, "step": 3680}, {"loss": 1.6018, "grad_norm": 0.5045645236968994, "learning_rate": 0.0002, "epoch": 2.8637951105937134, "step": 3690}, {"loss": 1.5788, "grad_norm": 0.5560781955718994, "learning_rate": 0.0002, "epoch": 2.8715560729530463, "step": 3700}, {"loss": 1.5988, "grad_norm": 0.5177600383758545, "learning_rate": 0.0002, "epoch": 2.8793170353123787, "step": 3710}, {"loss": 1.6009, "grad_norm": 0.45830899477005005, "learning_rate": 0.0002, "epoch": 2.887077997671711, "step": 3720}, {"loss": 1.6344, "grad_norm": 0.4828629195690155, "learning_rate": 0.0002, "epoch": 2.894838960031044, "step": 3730}, {"loss": 1.6758, "grad_norm": 0.48241183161735535, "learning_rate": 0.0002, "epoch": 2.9025999223903765, "step": 3740}, {"loss": 1.5649, "grad_norm": 0.4909592568874359, "learning_rate": 0.0002, "epoch": 2.910360884749709, "step": 3750}, {"loss": 1.4927, "grad_norm": 0.44677025079727173, "learning_rate": 0.0002, "epoch": 2.9181218471090418, "step": 3760}, {"loss": 1.5067, "grad_norm": 0.4928834140300751, "learning_rate": 0.0002, "epoch": 2.925882809468374, "step": 3770}, {"loss": 1.5843, "grad_norm": 0.5673553347587585, "learning_rate": 0.0002, "epoch": 2.9336437718277066, "step": 3780}, {"loss": 1.5566, "grad_norm": 0.548190712928772, "learning_rate": 0.0002, "epoch": 2.941404734187039, "step": 3790}, {"loss": 1.5892, "grad_norm": 0.48979803919792175, "learning_rate": 0.0002, "epoch": 2.9491656965463715, "step": 3800}, {"loss": 1.5589, "grad_norm": 0.533191978931427, "learning_rate": 0.0002, "epoch": 2.9569266589057044, "step": 3810}, {"loss": 1.584, "grad_norm": 0.5362946391105652, "learning_rate": 0.0002, "epoch": 2.964687621265037, "step": 3820}, {"loss": 1.6602, "grad_norm": 0.4724906384944916, "learning_rate": 0.0002, "epoch": 2.9724485836243693, "step": 3830}, {"loss": 1.5834, "grad_norm": 0.5468461513519287, "learning_rate": 0.0002, "epoch": 2.980209545983702, "step": 3840}, {"loss": 1.6316, "grad_norm": 0.4697108864784241, "learning_rate": 0.0002, "epoch": 2.9879705083430346, "step": 3850}, {"loss": 1.6312, "grad_norm": 0.4780906140804291, "learning_rate": 0.0002, "epoch": 2.995731470702367, "step": 3860}, {"eval_loss": 1.8472607135772705, "eval_runtime": 106.5541, "eval_samples_per_second": 4.758, "eval_steps_per_second": 0.601, "epoch": 2.9996119518820334, "step": 3865}, {"loss": 1.4983, "grad_norm": 0.5645653605461121, "learning_rate": 0.0002, "epoch": 3.0034924330616994, "step": 3870}, {"loss": 1.4334, "grad_norm": 0.6457151174545288, "learning_rate": 0.0002, "epoch": 3.0112533954210323, "step": 3880}, {"loss": 1.3899, "grad_norm": 0.583838164806366, "learning_rate": 0.0002, "epoch": 3.0190143577803648, "step": 3890}, {"loss": 1.3258, "grad_norm": 0.6819260120391846, "learning_rate": 0.0002, "epoch": 3.026775320139697, "step": 3900}, {"loss": 1.3458, "grad_norm": 0.6692903637886047, "learning_rate": 0.0002, "epoch": 3.03453628249903, "step": 3910}, {"loss": 1.4356, "grad_norm": 0.6101024746894836, "learning_rate": 0.0002, "epoch": 3.0422972448583625, "step": 3920}, {"loss": 1.394, "grad_norm": 0.7014093399047852, "learning_rate": 0.0002, "epoch": 3.050058207217695, "step": 3930}, {"loss": 1.3885, "grad_norm": 0.7380381226539612, "learning_rate": 0.0002, "epoch": 3.0578191695770274, "step": 3940}, {"loss": 1.4206, "grad_norm": 0.6607900857925415, "learning_rate": 0.0002, "epoch": 3.0655801319363603, "step": 3950}, {"loss": 1.4293, "grad_norm": 0.735263466835022, "learning_rate": 0.0002, "epoch": 3.0733410942956927, "step": 3960}, {"loss": 1.3966, "grad_norm": 0.6788513660430908, "learning_rate": 0.0002, "epoch": 3.081102056655025, "step": 3970}, {"loss": 1.3435, "grad_norm": 0.6347652673721313, "learning_rate": 0.0002, "epoch": 3.088863019014358, "step": 3980}, {"loss": 1.4518, "grad_norm": 0.7056642770767212, "learning_rate": 0.0002, "epoch": 3.0966239813736904, "step": 3990}, {"loss": 1.4474, "grad_norm": 0.6387075185775757, "learning_rate": 0.0002, "epoch": 3.104384943733023, "step": 4000}, {"loss": 1.3833, "grad_norm": 0.6701116561889648, "learning_rate": 0.0002, "epoch": 3.1121459060923553, "step": 4010}, {"loss": 1.404, "grad_norm": 0.7558449506759644, "learning_rate": 0.0002, "epoch": 3.119906868451688, "step": 4020}, {"loss": 1.3294, "grad_norm": 0.6612881422042847, "learning_rate": 0.0002, "epoch": 3.1276678308110206, "step": 4030}, {"loss": 1.439, "grad_norm": 0.7474587559700012, "learning_rate": 0.0002, "epoch": 3.135428793170353, "step": 4040}, {"loss": 1.4616, "grad_norm": 0.7292373776435852, "learning_rate": 0.0002, "epoch": 3.1431897555296855, "step": 4050}, {"loss": 1.3908, "grad_norm": 0.7432886958122253, "learning_rate": 0.0002, "epoch": 3.1509507178890184, "step": 4060}, {"loss": 1.4214, "grad_norm": 0.6366098523139954, "learning_rate": 0.0002, "epoch": 3.158711680248351, "step": 4070}, {"loss": 1.5044, "grad_norm": 0.6837611794471741, "learning_rate": 0.0002, "epoch": 3.1664726426076832, "step": 4080}, {"loss": 1.4332, "grad_norm": 0.7194393277168274, "learning_rate": 0.0002, "epoch": 3.174233604967016, "step": 4090}, {"loss": 1.3628, "grad_norm": 0.6963607668876648, "learning_rate": 0.0002, "epoch": 3.1819945673263486, "step": 4100}, {"loss": 1.4127, "grad_norm": 0.6404902935028076, "learning_rate": 0.0002, "epoch": 3.189755529685681, "step": 4110}, {"loss": 1.4394, "grad_norm": 0.7172070741653442, "learning_rate": 0.0002, "epoch": 3.1975164920450134, "step": 4120}, {"loss": 1.4658, "grad_norm": 0.6577759385108948, "learning_rate": 0.0002, "epoch": 3.2052774544043463, "step": 4130}, {"loss": 1.4019, "grad_norm": 0.6658480167388916, "learning_rate": 0.0002, "epoch": 3.2130384167636787, "step": 4140}, {"loss": 1.4348, "grad_norm": 0.6771699786186218, "learning_rate": 0.0002, "epoch": 3.220799379123011, "step": 4150}, {"loss": 1.4736, "grad_norm": 0.699035108089447, "learning_rate": 0.0002, "epoch": 3.2285603414823436, "step": 4160}, {"loss": 1.4096, "grad_norm": 0.7218514680862427, "learning_rate": 0.0002, "epoch": 3.2363213038416765, "step": 4170}, {"loss": 1.3637, "grad_norm": 0.6270631551742554, "learning_rate": 0.0002, "epoch": 3.244082266201009, "step": 4180}, {"loss": 1.4076, "grad_norm": 0.6828921437263489, "learning_rate": 0.0002, "epoch": 3.2518432285603414, "step": 4190}, {"loss": 1.4663, "grad_norm": 0.6005498170852661, "learning_rate": 0.0002, "epoch": 3.2596041909196742, "step": 4200}, {"loss": 1.4798, "grad_norm": 0.6974790692329407, "learning_rate": 0.0002, "epoch": 3.2673651532790067, "step": 4210}, {"loss": 1.5012, "grad_norm": 0.7269543409347534, "learning_rate": 0.0002, "epoch": 3.275126115638339, "step": 4220}, {"loss": 1.3848, "grad_norm": 0.6728787422180176, "learning_rate": 0.0002, "epoch": 3.2828870779976715, "step": 4230}, {"loss": 1.4112, "grad_norm": 0.676972508430481, "learning_rate": 0.0002, "epoch": 3.2906480403570044, "step": 4240}, {"loss": 1.4206, "grad_norm": 0.748309314250946, "learning_rate": 0.0002, "epoch": 3.298409002716337, "step": 4250}, {"loss": 1.4973, "grad_norm": 0.6976589560508728, "learning_rate": 0.0002, "epoch": 3.3061699650756693, "step": 4260}, {"loss": 1.3967, "grad_norm": 0.649780809879303, "learning_rate": 0.0002, "epoch": 3.3139309274350017, "step": 4270}, {"loss": 1.327, "grad_norm": 0.6529902815818787, "learning_rate": 0.0002, "epoch": 3.3216918897943346, "step": 4280}, {"loss": 1.4888, "grad_norm": 0.9273163676261902, "learning_rate": 0.0002, "epoch": 3.329452852153667, "step": 4290}, {"loss": 1.4859, "grad_norm": 0.717024028301239, "learning_rate": 0.0002, "epoch": 3.3372138145129995, "step": 4300}, {"loss": 1.4441, "grad_norm": 0.7914950251579285, "learning_rate": 0.0002, "epoch": 3.3449747768723324, "step": 4310}, {"loss": 1.432, "grad_norm": 0.7133203148841858, "learning_rate": 0.0002, "epoch": 3.352735739231665, "step": 4320}, {"loss": 1.4662, "grad_norm": 0.7409568428993225, "learning_rate": 0.0002, "epoch": 3.3604967015909972, "step": 4330}, {"loss": 1.3992, "grad_norm": 0.6993981003761292, "learning_rate": 0.0002, "epoch": 3.3682576639503297, "step": 4340}, {"loss": 1.4261, "grad_norm": 0.7114535570144653, "learning_rate": 0.0002, "epoch": 3.3760186263096625, "step": 4350}, {"loss": 1.4227, "grad_norm": 0.6790860295295715, "learning_rate": 0.0002, "epoch": 3.383779588668995, "step": 4360}, {"loss": 1.4128, "grad_norm": 0.6507849097251892, "learning_rate": 0.0002, "epoch": 3.3915405510283274, "step": 4370}, {"loss": 1.4559, "grad_norm": 0.5967804193496704, "learning_rate": 0.0002, "epoch": 3.39930151338766, "step": 4380}, {"loss": 1.3687, "grad_norm": 0.6625847816467285, "learning_rate": 0.0002, "epoch": 3.4070624757469927, "step": 4390}, {"loss": 1.4193, "grad_norm": 0.6736508011817932, "learning_rate": 0.0002, "epoch": 3.414823438106325, "step": 4400}, {"loss": 1.4363, "grad_norm": 0.7870860695838928, "learning_rate": 0.0002, "epoch": 3.4225844004656576, "step": 4410}, {"loss": 1.4114, "grad_norm": 0.7205295562744141, "learning_rate": 0.0002, "epoch": 3.4303453628249905, "step": 4420}, {"loss": 1.4131, "grad_norm": 0.6634634137153625, "learning_rate": 0.0002, "epoch": 3.438106325184323, "step": 4430}, {"loss": 1.4683, "grad_norm": 0.7562733292579651, "learning_rate": 0.0002, "epoch": 3.4458672875436553, "step": 4440}, {"loss": 1.3486, "grad_norm": 0.6585879921913147, "learning_rate": 0.0002, "epoch": 3.453628249902988, "step": 4450}, {"loss": 1.4283, "grad_norm": 0.6896792054176331, "learning_rate": 0.0002, "epoch": 3.4613892122623207, "step": 4460}, {"loss": 1.4208, "grad_norm": 0.6520342230796814, "learning_rate": 0.0002, "epoch": 3.469150174621653, "step": 4470}, {"loss": 1.3423, "grad_norm": 0.6760806441307068, "learning_rate": 0.0002, "epoch": 3.4769111369809855, "step": 4480}, {"loss": 1.4398, "grad_norm": 0.7539774179458618, "learning_rate": 0.0002, "epoch": 3.484672099340318, "step": 4490}, {"loss": 1.4534, "grad_norm": 0.7409411668777466, "learning_rate": 0.0002, "epoch": 3.492433061699651, "step": 4500}, {"loss": 1.4069, "grad_norm": 0.6876253485679626, "learning_rate": 0.0002, "epoch": 3.5001940240589833, "step": 4510}, {"loss": 1.4228, "grad_norm": 0.7028461694717407, "learning_rate": 0.0002, "epoch": 3.5079549864183157, "step": 4520}, {"loss": 1.4723, "grad_norm": 0.8056529760360718, "learning_rate": 0.0002, "epoch": 3.5157159487776486, "step": 4530}, {"loss": 1.4148, "grad_norm": 0.711338996887207, "learning_rate": 0.0002, "epoch": 3.523476911136981, "step": 4540}, {"loss": 1.5247, "grad_norm": 0.7343552708625793, "learning_rate": 0.0002, "epoch": 3.5312378734963135, "step": 4550}, {"loss": 1.4308, "grad_norm": 0.745479941368103, "learning_rate": 0.0002, "epoch": 3.5389988358556463, "step": 4560}, {"loss": 1.4229, "grad_norm": 0.7582294940948486, "learning_rate": 0.0002, "epoch": 3.5467597982149788, "step": 4570}, {"loss": 1.4127, "grad_norm": 0.6717444658279419, "learning_rate": 0.0002, "epoch": 3.554520760574311, "step": 4580}, {"loss": 1.4368, "grad_norm": 0.7417883276939392, "learning_rate": 0.0002, "epoch": 3.5622817229336436, "step": 4590}, {"loss": 1.4176, "grad_norm": 0.6385737061500549, "learning_rate": 0.0002, "epoch": 3.570042685292976, "step": 4600}, {"loss": 1.3981, "grad_norm": 0.716704249382019, "learning_rate": 0.0002, "epoch": 3.577803647652309, "step": 4610}, {"loss": 1.3889, "grad_norm": 0.6948980093002319, "learning_rate": 0.0002, "epoch": 3.5855646100116414, "step": 4620}, {"loss": 1.5177, "grad_norm": 0.6961140036582947, "learning_rate": 0.0002, "epoch": 3.593325572370974, "step": 4630}, {"loss": 1.4508, "grad_norm": 0.7493122220039368, "learning_rate": 0.0002, "epoch": 3.6010865347303067, "step": 4640}, {"loss": 1.3987, "grad_norm": 0.7431658506393433, "learning_rate": 0.0002, "epoch": 3.608847497089639, "step": 4650}, {"loss": 1.4551, "grad_norm": 0.8353387713432312, "learning_rate": 0.0002, "epoch": 3.6166084594489716, "step": 4660}, {"loss": 1.4533, "grad_norm": 0.7095612287521362, "learning_rate": 0.0002, "epoch": 3.6243694218083045, "step": 4670}, {"loss": 1.4003, "grad_norm": 0.776620090007782, "learning_rate": 0.0002, "epoch": 3.632130384167637, "step": 4680}, {"loss": 1.4361, "grad_norm": 0.7198925018310547, "learning_rate": 0.0002, "epoch": 3.6398913465269693, "step": 4690}, {"loss": 1.4543, "grad_norm": 0.8238834738731384, "learning_rate": 0.0002, "epoch": 3.6476523088863018, "step": 4700}, {"loss": 1.3958, "grad_norm": 0.6804245710372925, "learning_rate": 0.0002, "epoch": 3.655413271245634, "step": 4710}, {"loss": 1.4158, "grad_norm": 0.8444845676422119, "learning_rate": 0.0002, "epoch": 3.663174233604967, "step": 4720}, {"loss": 1.3825, "grad_norm": 0.743797779083252, "learning_rate": 0.0002, "epoch": 3.6709351959642995, "step": 4730}, {"loss": 1.4213, "grad_norm": 0.8994188904762268, "learning_rate": 0.0002, "epoch": 3.678696158323632, "step": 4740}, {"loss": 1.4281, "grad_norm": 0.75416100025177, "learning_rate": 0.0002, "epoch": 3.686457120682965, "step": 4750}, {"loss": 1.4154, "grad_norm": 0.6499266028404236, "learning_rate": 0.0002, "epoch": 3.6942180830422973, "step": 4760}, {"loss": 1.4005, "grad_norm": 0.7246791124343872, "learning_rate": 0.0002, "epoch": 3.7019790454016297, "step": 4770}, {"loss": 1.426, "grad_norm": 0.7831124067306519, "learning_rate": 0.0002, "epoch": 3.7097400077609626, "step": 4780}, {"loss": 1.3933, "grad_norm": 0.7130028009414673, "learning_rate": 0.0002, "epoch": 3.717500970120295, "step": 4790}, {"loss": 1.4632, "grad_norm": 0.7501602172851562, "learning_rate": 0.0002, "epoch": 3.7252619324796274, "step": 4800}, {"loss": 1.4985, "grad_norm": 0.6980932950973511, "learning_rate": 0.0002, "epoch": 3.73302289483896, "step": 4810}, {"loss": 1.4517, "grad_norm": 0.8050530552864075, "learning_rate": 0.0002, "epoch": 3.7407838571982923, "step": 4820}, {"loss": 1.4703, "grad_norm": 0.6385579705238342, "learning_rate": 0.0002, "epoch": 3.748544819557625, "step": 4830}, {"loss": 1.5281, "grad_norm": 0.6664714813232422, "learning_rate": 0.0002, "epoch": 3.7563057819169576, "step": 4840}, {"loss": 1.4443, "grad_norm": 0.7125676274299622, "learning_rate": 0.0002, "epoch": 3.76406674427629, "step": 4850}, {"loss": 1.3958, "grad_norm": 0.7231866717338562, "learning_rate": 0.0002, "epoch": 3.771827706635623, "step": 4860}, {"loss": 1.4446, "grad_norm": 0.6917183995246887, "learning_rate": 0.0002, "epoch": 3.7795886689949554, "step": 4870}, {"loss": 1.4369, "grad_norm": 0.665037989616394, "learning_rate": 0.0002, "epoch": 3.787349631354288, "step": 4880}, {"loss": 1.4193, "grad_norm": 0.5837726593017578, "learning_rate": 0.0002, "epoch": 3.7951105937136207, "step": 4890}, {"loss": 1.4176, "grad_norm": 0.6366701722145081, "learning_rate": 0.0002, "epoch": 3.802871556072953, "step": 4900}, {"loss": 1.46, "grad_norm": 0.7082223892211914, "learning_rate": 0.0002, "epoch": 3.8106325184322856, "step": 4910}, {"loss": 1.5139, "grad_norm": 0.8101672530174255, "learning_rate": 0.0002, "epoch": 3.818393480791618, "step": 4920}, {"loss": 1.3659, "grad_norm": 0.7516148090362549, "learning_rate": 0.0002, "epoch": 3.826154443150951, "step": 4930}, {"loss": 1.3909, "grad_norm": 0.7928489446640015, "learning_rate": 0.0002, "epoch": 3.8339154055102833, "step": 4940}, {"loss": 1.4255, "grad_norm": 0.6892234683036804, "learning_rate": 0.0002, "epoch": 3.8416763678696157, "step": 4950}, {"loss": 1.5024, "grad_norm": 0.6381304264068604, "learning_rate": 0.0002, "epoch": 3.849437330228948, "step": 4960}, {"loss": 1.4873, "grad_norm": 0.8068831562995911, "learning_rate": 0.0002, "epoch": 3.857198292588281, "step": 4970}, {"loss": 1.45, "grad_norm": 0.7289869785308838, "learning_rate": 0.0002, "epoch": 3.8649592549476135, "step": 4980}, {"loss": 1.398, "grad_norm": 0.7278549075126648, "learning_rate": 0.0002, "epoch": 3.872720217306946, "step": 4990}, {"loss": 1.4442, "grad_norm": 0.7324236631393433, "learning_rate": 0.0002, "epoch": 3.880481179666279, "step": 5000}, {"loss": 1.4511, "grad_norm": 0.6759871244430542, "learning_rate": 0.0002, "epoch": 3.8882421420256112, "step": 5010}, {"loss": 1.4705, "grad_norm": 0.8159207701683044, "learning_rate": 0.0002, "epoch": 3.8960031043849437, "step": 5020}, {"loss": 1.4685, "grad_norm": 0.6536211967468262, "learning_rate": 0.0002, "epoch": 3.9037640667442766, "step": 5030}, {"loss": 1.4335, "grad_norm": 0.6827932000160217, "learning_rate": 0.0002, "epoch": 3.911525029103609, "step": 5040}, {"loss": 1.433, "grad_norm": 0.6688340306282043, "learning_rate": 0.0002, "epoch": 3.9192859914629414, "step": 5050}, {"loss": 1.4099, "grad_norm": 0.6385695934295654, "learning_rate": 0.0002, "epoch": 3.927046953822274, "step": 5060}, {"loss": 1.4767, "grad_norm": 0.6975107192993164, "learning_rate": 0.0002, "epoch": 3.9348079161816063, "step": 5070}, {"loss": 1.4893, "grad_norm": 0.6684112548828125, "learning_rate": 0.0002, "epoch": 3.942568878540939, "step": 5080}, {"loss": 1.4732, "grad_norm": 0.8349628448486328, "learning_rate": 0.0002, "epoch": 3.9503298409002716, "step": 5090}, {"loss": 1.5131, "grad_norm": 0.7146425843238831, "learning_rate": 0.0002, "epoch": 3.958090803259604, "step": 5100}, {"loss": 1.4149, "grad_norm": 0.6555036902427673, "learning_rate": 0.0002, "epoch": 3.965851765618937, "step": 5110}, {"loss": 1.4274, "grad_norm": 0.7037415504455566, "learning_rate": 0.0002, "epoch": 3.9736127279782694, "step": 5120}, {"loss": 1.4292, "grad_norm": 0.7235575914382935, "learning_rate": 0.0002, "epoch": 3.981373690337602, "step": 5130}, {"loss": 1.4455, "grad_norm": 0.7092325687408447, "learning_rate": 0.0002, "epoch": 3.9891346526969347, "step": 5140}, {"loss": 1.4512, "grad_norm": 0.7490319609642029, "learning_rate": 0.0002, "epoch": 3.996895615056267, "step": 5150}, {"eval_loss": 1.9131355285644531, "eval_runtime": 105.5778, "eval_samples_per_second": 4.802, "eval_steps_per_second": 0.606, "epoch": 4.0, "step": 5154}, {"loss": 1.2643, "grad_norm": 0.7075854539871216, "learning_rate": 0.0002, "epoch": 4.0046565774155995, "step": 5160}, {"loss": 1.209, "grad_norm": 0.9466007351875305, "learning_rate": 0.0002, "epoch": 4.012417539774932, "step": 5170}, {"loss": 1.2567, "grad_norm": 1.0297044515609741, "learning_rate": 0.0002, "epoch": 4.020178502134264, "step": 5180}, {"loss": 1.1796, "grad_norm": 0.7765059471130371, "learning_rate": 0.0002, "epoch": 4.027939464493597, "step": 5190}, {"loss": 1.2356, "grad_norm": 0.995760977268219, "learning_rate": 0.0002, "epoch": 4.03570042685293, "step": 5200}, {"loss": 1.1792, "grad_norm": 0.8663829565048218, "learning_rate": 0.0002, "epoch": 4.043461389212262, "step": 5210}, {"loss": 1.2471, "grad_norm": 1.0660825967788696, "learning_rate": 0.0002, "epoch": 4.051222351571595, "step": 5220}, {"loss": 1.1676, "grad_norm": 0.9858174920082092, "learning_rate": 0.0002, "epoch": 4.058983313930927, "step": 5230}, {"loss": 1.2448, "grad_norm": 0.8911338448524475, "learning_rate": 0.0002, "epoch": 4.06674427629026, "step": 5240}, {"loss": 1.1858, "grad_norm": 1.0848394632339478, "learning_rate": 0.0002, "epoch": 4.074505238649593, "step": 5250}, {"loss": 1.1684, "grad_norm": 1.0849905014038086, "learning_rate": 0.0002, "epoch": 4.082266201008925, "step": 5260}, {"loss": 1.2007, "grad_norm": 1.0497841835021973, "learning_rate": 0.0002, "epoch": 4.090027163368258, "step": 5270}, {"loss": 1.2552, "grad_norm": 0.8943053483963013, "learning_rate": 0.0002, "epoch": 4.0977881257275905, "step": 5280}, {"loss": 1.1923, "grad_norm": 0.8432527184486389, "learning_rate": 0.0002, "epoch": 4.1055490880869225, "step": 5290}, {"loss": 1.1634, "grad_norm": 0.9690414667129517, "learning_rate": 0.0002, "epoch": 4.113310050446255, "step": 5300}, {"loss": 1.3019, "grad_norm": 0.7790773510932922, "learning_rate": 0.0002, "epoch": 4.121071012805588, "step": 5310}, {"loss": 1.1806, "grad_norm": 0.9289211630821228, "learning_rate": 0.0002, "epoch": 4.12883197516492, "step": 5320}, {"loss": 1.1458, "grad_norm": 1.0785125494003296, "learning_rate": 0.0002, "epoch": 4.136592937524253, "step": 5330}, {"loss": 1.2086, "grad_norm": 0.8559591770172119, "learning_rate": 0.0002, "epoch": 4.144353899883585, "step": 5340}, {"loss": 1.1974, "grad_norm": 0.9405956268310547, "learning_rate": 0.0002, "epoch": 4.152114862242918, "step": 5350}, {"loss": 1.1793, "grad_norm": 0.9942827820777893, "learning_rate": 0.0002, "epoch": 4.159875824602251, "step": 5360}, {"loss": 1.1659, "grad_norm": 0.9141933917999268, "learning_rate": 0.0002, "epoch": 4.167636786961583, "step": 5370}, {"loss": 1.1647, "grad_norm": 0.8206015229225159, "learning_rate": 0.0002, "epoch": 4.175397749320916, "step": 5380}, {"loss": 1.2778, "grad_norm": 0.9340888857841492, "learning_rate": 0.0002, "epoch": 4.183158711680249, "step": 5390}, {"loss": 1.2459, "grad_norm": 1.2122114896774292, "learning_rate": 0.0002, "epoch": 4.190919674039581, "step": 5400}, {"loss": 1.2371, "grad_norm": 1.0661298036575317, "learning_rate": 0.0002, "epoch": 4.1986806363989135, "step": 5410}, {"loss": 1.1978, "grad_norm": 0.9372861385345459, "learning_rate": 0.0002, "epoch": 4.206441598758246, "step": 5420}, {"loss": 1.2653, "grad_norm": 0.894012987613678, "learning_rate": 0.0002, "epoch": 4.214202561117578, "step": 5430}, {"loss": 1.387, "grad_norm": 1.0647753477096558, "learning_rate": 0.0002, "epoch": 4.221963523476911, "step": 5440}, {"loss": 1.2231, "grad_norm": 0.989179790019989, "learning_rate": 0.0002, "epoch": 4.229724485836243, "step": 5450}, {"loss": 1.2715, "grad_norm": 1.1601181030273438, "learning_rate": 0.0002, "epoch": 4.237485448195576, "step": 5460}, {"loss": 1.2406, "grad_norm": 0.9395585656166077, "learning_rate": 0.0002, "epoch": 4.245246410554909, "step": 5470}, {"loss": 1.2779, "grad_norm": 0.9527766108512878, "learning_rate": 0.0002, "epoch": 4.253007372914241, "step": 5480}, {"loss": 1.267, "grad_norm": 1.0319520235061646, "learning_rate": 0.0002, "epoch": 4.260768335273574, "step": 5490}, {"loss": 1.2633, "grad_norm": 0.8659824728965759, "learning_rate": 0.0002, "epoch": 4.268529297632907, "step": 5500}, {"loss": 1.1475, "grad_norm": 1.099211573600769, "learning_rate": 0.0002, "epoch": 4.276290259992239, "step": 5510}, {"loss": 1.2508, "grad_norm": 0.9363361597061157, "learning_rate": 0.0002, "epoch": 4.284051222351572, "step": 5520}, {"loss": 1.189, "grad_norm": 0.8437647223472595, "learning_rate": 0.0002, "epoch": 4.2918121847109045, "step": 5530}, {"loss": 1.2212, "grad_norm": 0.9181258678436279, "learning_rate": 0.0002, "epoch": 4.2995731470702365, "step": 5540}, {"loss": 1.2092, "grad_norm": 0.9059357643127441, "learning_rate": 0.0002, "epoch": 4.307334109429569, "step": 5550}, {"loss": 1.2189, "grad_norm": 0.9337241649627686, "learning_rate": 0.0002, "epoch": 4.315095071788901, "step": 5560}, {"loss": 1.2462, "grad_norm": 0.9428889155387878, "learning_rate": 0.0002, "epoch": 4.322856034148234, "step": 5570}, {"loss": 1.2675, "grad_norm": 1.003589153289795, "learning_rate": 0.0002, "epoch": 4.330616996507567, "step": 5580}, {"loss": 1.2703, "grad_norm": 1.1249268054962158, "learning_rate": 0.0002, "epoch": 4.338377958866899, "step": 5590}, {"loss": 1.2501, "grad_norm": 0.8623469471931458, "learning_rate": 0.0002, "epoch": 4.346138921226232, "step": 5600}, {"loss": 1.2404, "grad_norm": 1.1389174461364746, "learning_rate": 0.0002, "epoch": 4.353899883585565, "step": 5610}, {"loss": 1.2245, "grad_norm": 1.0136264562606812, "learning_rate": 0.0002, "epoch": 4.361660845944897, "step": 5620}, {"loss": 1.3473, "grad_norm": 0.9567070603370667, "learning_rate": 0.0002, "epoch": 4.36942180830423, "step": 5630}, {"loss": 1.2988, "grad_norm": 1.0592148303985596, "learning_rate": 0.0002, "epoch": 4.377182770663563, "step": 5640}, {"loss": 1.212, "grad_norm": 1.0110485553741455, "learning_rate": 0.0002, "epoch": 4.384943733022895, "step": 5650}, {"loss": 1.2086, "grad_norm": 0.9914907217025757, "learning_rate": 0.0002, "epoch": 4.3927046953822275, "step": 5660}, {"loss": 1.2363, "grad_norm": 0.9447247982025146, "learning_rate": 0.0002, "epoch": 4.4004656577415595, "step": 5670}, {"loss": 1.2617, "grad_norm": 0.9644378423690796, "learning_rate": 0.0002, "epoch": 4.408226620100892, "step": 5680}, {"loss": 1.2773, "grad_norm": 0.920676589012146, "learning_rate": 0.0002, "epoch": 4.415987582460225, "step": 5690}, {"loss": 1.2792, "grad_norm": 1.060570478439331, "learning_rate": 0.0002, "epoch": 4.423748544819557, "step": 5700}, {"loss": 1.2374, "grad_norm": 0.8857738971710205, "learning_rate": 0.0002, "epoch": 4.43150950717889, "step": 5710}, {"loss": 1.2588, "grad_norm": 1.0536398887634277, "learning_rate": 0.0002, "epoch": 4.439270469538223, "step": 5720}, {"loss": 1.2051, "grad_norm": 0.990847110748291, "learning_rate": 0.0002, "epoch": 4.447031431897555, "step": 5730}, {"loss": 1.2469, "grad_norm": 0.9692499041557312, "learning_rate": 0.0002, "epoch": 4.454792394256888, "step": 5740}, {"loss": 1.2269, "grad_norm": 1.0376402139663696, "learning_rate": 0.0002, "epoch": 4.462553356616221, "step": 5750}, {"loss": 1.1701, "grad_norm": 1.3863259553909302, "learning_rate": 0.0002, "epoch": 4.470314318975553, "step": 5760}, {"loss": 1.2591, "grad_norm": 0.978379487991333, "learning_rate": 0.0002, "epoch": 4.478075281334886, "step": 5770}, {"loss": 1.2729, "grad_norm": 1.0973085165023804, "learning_rate": 0.0002, "epoch": 4.485836243694218, "step": 5780}, {"loss": 1.2404, "grad_norm": 1.057006597518921, "learning_rate": 0.0002, "epoch": 4.4935972060535505, "step": 5790}, {"loss": 1.2476, "grad_norm": 0.9247729182243347, "learning_rate": 0.0002, "epoch": 4.501358168412883, "step": 5800}, {"loss": 1.2369, "grad_norm": 1.0447787046432495, "learning_rate": 0.0002, "epoch": 4.509119130772215, "step": 5810}, {"loss": 1.211, "grad_norm": 1.1930429935455322, "learning_rate": 0.0002, "epoch": 4.516880093131548, "step": 5820}, {"loss": 1.2596, "grad_norm": 0.9867590069770813, "learning_rate": 0.0002, "epoch": 4.524641055490881, "step": 5830}, {"loss": 1.2766, "grad_norm": 0.9591100215911865, "learning_rate": 0.0002, "epoch": 4.532402017850213, "step": 5840}, {"loss": 1.2154, "grad_norm": 0.9950753450393677, "learning_rate": 0.0002, "epoch": 4.540162980209546, "step": 5850}, {"loss": 1.2149, "grad_norm": 1.0087506771087646, "learning_rate": 0.0002, "epoch": 4.547923942568879, "step": 5860}, {"loss": 1.3165, "grad_norm": 1.0934417247772217, "learning_rate": 0.0002, "epoch": 4.555684904928211, "step": 5870}, {"loss": 1.3059, "grad_norm": 1.107987403869629, "learning_rate": 0.0002, "epoch": 4.563445867287544, "step": 5880}, {"loss": 1.2184, "grad_norm": 0.9147276878356934, "learning_rate": 0.0002, "epoch": 4.571206829646876, "step": 5890}, {"loss": 1.24, "grad_norm": 1.036780595779419, "learning_rate": 0.0002, "epoch": 4.578967792006209, "step": 5900}, {"loss": 1.2209, "grad_norm": 0.9284719824790955, "learning_rate": 0.0002, "epoch": 4.5867287543655415, "step": 5910}, {"loss": 1.3693, "grad_norm": 0.9141898155212402, "learning_rate": 0.0002, "epoch": 4.5944897167248735, "step": 5920}, {"loss": 1.2319, "grad_norm": 1.0447357892990112, "learning_rate": 0.0002, "epoch": 4.602250679084206, "step": 5930}, {"loss": 1.2667, "grad_norm": 0.9309114217758179, "learning_rate": 0.0002, "epoch": 4.610011641443539, "step": 5940}, {"loss": 1.2827, "grad_norm": 1.2986129522323608, "learning_rate": 0.0002, "epoch": 4.617772603802871, "step": 5950}, {"loss": 1.312, "grad_norm": 0.9221704602241516, "learning_rate": 0.0002, "epoch": 4.625533566162204, "step": 5960}, {"loss": 1.2769, "grad_norm": 0.9228187799453735, "learning_rate": 0.0002, "epoch": 4.633294528521537, "step": 5970}, {"loss": 1.2953, "grad_norm": 0.9483116269111633, "learning_rate": 0.0002, "epoch": 4.641055490880869, "step": 5980}, {"loss": 1.3437, "grad_norm": 1.0218974351882935, "learning_rate": 0.0002, "epoch": 4.648816453240202, "step": 5990}, {"loss": 1.3085, "grad_norm": 0.9764600396156311, "learning_rate": 0.0002, "epoch": 4.656577415599534, "step": 6000}, {"loss": 1.197, "grad_norm": 0.9115710258483887, "learning_rate": 0.0002, "epoch": 4.664338377958867, "step": 6010}, {"loss": 1.1917, "grad_norm": 0.9245651364326477, "learning_rate": 0.0002, "epoch": 4.6720993403182, "step": 6020}, {"loss": 1.2969, "grad_norm": 0.9686311483383179, "learning_rate": 0.0002, "epoch": 4.6798603026775325, "step": 6030}, {"loss": 1.2702, "grad_norm": 1.1807392835617065, "learning_rate": 0.0002, "epoch": 4.6876212650368645, "step": 6040}, {"loss": 1.328, "grad_norm": 1.0358641147613525, "learning_rate": 0.0002, "epoch": 4.695382227396197, "step": 6050}, {"loss": 1.3281, "grad_norm": 0.987332284450531, "learning_rate": 0.0002, "epoch": 4.703143189755529, "step": 6060}, {"loss": 1.2514, "grad_norm": 1.0526494979858398, "learning_rate": 0.0002, "epoch": 4.710904152114862, "step": 6070}, {"loss": 1.2246, "grad_norm": 1.0276758670806885, "learning_rate": 0.0002, "epoch": 4.718665114474195, "step": 6080}, {"loss": 1.3367, "grad_norm": 0.9904406666755676, "learning_rate": 0.0002, "epoch": 4.726426076833527, "step": 6090}, {"loss": 1.2797, "grad_norm": 1.0084882974624634, "learning_rate": 0.0002, "epoch": 4.73418703919286, "step": 6100}, {"loss": 1.2656, "grad_norm": 0.8646450638771057, "learning_rate": 0.0002, "epoch": 4.741948001552192, "step": 6110}, {"loss": 1.3063, "grad_norm": 0.9233377575874329, "learning_rate": 0.0002, "epoch": 4.749708963911525, "step": 6120}, {"loss": 1.2642, "grad_norm": 0.9675140976905823, "learning_rate": 0.0002, "epoch": 4.757469926270858, "step": 6130}, {"loss": 1.3367, "grad_norm": 0.9639796018600464, "learning_rate": 0.0002, "epoch": 4.765230888630191, "step": 6140}, {"loss": 1.276, "grad_norm": 0.925199568271637, "learning_rate": 0.0002, "epoch": 4.772991850989523, "step": 6150}, {"loss": 1.2441, "grad_norm": 1.050901174545288, "learning_rate": 0.0002, "epoch": 4.7807528133488555, "step": 6160}, {"loss": 1.301, "grad_norm": 0.8920623660087585, "learning_rate": 0.0002, "epoch": 4.7885137757081875, "step": 6170}, {"loss": 1.263, "grad_norm": 0.8964757919311523, "learning_rate": 0.0002, "epoch": 4.79627473806752, "step": 6180}, {"loss": 1.2787, "grad_norm": 1.0839070081710815, "learning_rate": 0.0002, "epoch": 4.804035700426853, "step": 6190}, {"loss": 1.2664, "grad_norm": 0.8809942007064819, "learning_rate": 0.0002, "epoch": 4.811796662786185, "step": 6200}, {"loss": 1.321, "grad_norm": 1.0216195583343506, "learning_rate": 0.0002, "epoch": 4.819557625145518, "step": 6210}, {"loss": 1.3033, "grad_norm": 0.892005980014801, "learning_rate": 0.0002, "epoch": 4.827318587504851, "step": 6220}, {"loss": 1.2602, "grad_norm": 0.9957166910171509, "learning_rate": 0.0002, "epoch": 4.835079549864183, "step": 6230}, {"loss": 1.3562, "grad_norm": 0.9720533490180969, "learning_rate": 0.0002, "epoch": 4.842840512223516, "step": 6240}, {"loss": 1.2651, "grad_norm": 0.9336182475090027, "learning_rate": 0.0002, "epoch": 4.850601474582849, "step": 6250}, {"loss": 1.3136, "grad_norm": 1.2611457109451294, "learning_rate": 0.0002, "epoch": 4.858362436942181, "step": 6260}, {"loss": 1.2234, "grad_norm": 0.8927203416824341, "learning_rate": 0.0002, "epoch": 4.866123399301514, "step": 6270}, {"loss": 1.3463, "grad_norm": 0.9706710577011108, "learning_rate": 0.0002, "epoch": 4.873884361660846, "step": 6280}, {"loss": 1.3209, "grad_norm": 1.1461690664291382, "learning_rate": 0.0002, "epoch": 4.8816453240201785, "step": 6290}, {"loss": 1.2566, "grad_norm": 0.9930381178855896, "learning_rate": 0.0002, "epoch": 4.889406286379511, "step": 6300}, {"loss": 1.2568, "grad_norm": 0.91451096534729, "learning_rate": 0.0002, "epoch": 4.897167248738843, "step": 6310}, {"loss": 1.2836, "grad_norm": 1.0319571495056152, "learning_rate": 0.0002, "epoch": 4.904928211098176, "step": 6320}, {"loss": 1.2908, "grad_norm": 0.990140438079834, "learning_rate": 0.0002, "epoch": 4.912689173457509, "step": 6330}, {"loss": 1.3299, "grad_norm": 1.2466117143630981, "learning_rate": 0.0002, "epoch": 4.920450135816841, "step": 6340}, {"loss": 1.2659, "grad_norm": 1.0316979885101318, "learning_rate": 0.0002, "epoch": 4.928211098176174, "step": 6350}, {"loss": 1.3292, "grad_norm": 1.0643759965896606, "learning_rate": 0.0002, "epoch": 4.935972060535507, "step": 6360}, {"loss": 1.2559, "grad_norm": 0.9703279733657837, "learning_rate": 0.0002, "epoch": 4.943733022894839, "step": 6370}, {"loss": 1.2155, "grad_norm": 0.9767927527427673, "learning_rate": 0.0002, "epoch": 4.951493985254172, "step": 6380}, {"loss": 1.2437, "grad_norm": 0.960854172706604, "learning_rate": 0.0002, "epoch": 4.959254947613504, "step": 6390}, {"loss": 1.3314, "grad_norm": 0.9922910332679749, "learning_rate": 0.0002, "epoch": 4.967015909972837, "step": 6400}, {"loss": 1.3018, "grad_norm": 0.956470787525177, "learning_rate": 0.0002, "epoch": 4.9747768723321695, "step": 6410}, {"loss": 1.2794, "grad_norm": 0.9637242555618286, "learning_rate": 0.0002, "epoch": 4.9825378346915015, "step": 6420}, {"loss": 1.3236, "grad_norm": 1.0855202674865723, "learning_rate": 0.0002, "epoch": 4.990298797050834, "step": 6430}, {"loss": 1.3015, "grad_norm": 0.9655316472053528, "learning_rate": 0.0002, "epoch": 4.998059759410167, "step": 6440}, {"eval_loss": 2.0410802364349365, "eval_runtime": 113.04, "eval_samples_per_second": 4.485, "eval_steps_per_second": 0.566, "epoch": 4.9996119518820334, "step": 6442}, {"loss": 1.0846, "grad_norm": 1.1676199436187744, "learning_rate": 0.0002, "epoch": 5.005820721769499, "step": 6450}, {"loss": 1.041, "grad_norm": 1.4317965507507324, "learning_rate": 0.0002, "epoch": 5.013581684128832, "step": 6460}, {"loss": 0.9546, "grad_norm": 1.460443377494812, "learning_rate": 0.0002, "epoch": 5.021342646488165, "step": 6470}, {"loss": 1.0014, "grad_norm": 1.2299214601516724, "learning_rate": 0.0002, "epoch": 5.029103608847497, "step": 6480}, {"loss": 1.0397, "grad_norm": 1.3125724792480469, "learning_rate": 0.0002, "epoch": 5.03686457120683, "step": 6490}, {"loss": 1.0134, "grad_norm": 1.1252319812774658, "learning_rate": 0.0002, "epoch": 5.044625533566162, "step": 6500}, {"loss": 0.976, "grad_norm": 0.9970866441726685, "learning_rate": 0.0002, "epoch": 5.052386495925495, "step": 6510}, {"loss": 0.9731, "grad_norm": 1.229069709777832, "learning_rate": 0.0002, "epoch": 5.060147458284828, "step": 6520}, {"loss": 1.0498, "grad_norm": 1.2430938482284546, "learning_rate": 0.0002, "epoch": 5.06790842064416, "step": 6530}, {"loss": 1.0236, "grad_norm": 1.0522737503051758, "learning_rate": 0.0002, "epoch": 5.0756693830034925, "step": 6540}, {"loss": 1.0221, "grad_norm": 1.108890175819397, "learning_rate": 0.0002, "epoch": 5.083430345362825, "step": 6550}, {"loss": 1.0177, "grad_norm": 1.156912922859192, "learning_rate": 0.0002, "epoch": 5.091191307722157, "step": 6560}, {"loss": 1.0415, "grad_norm": 1.405895709991455, "learning_rate": 0.0002, "epoch": 5.09895227008149, "step": 6570}, {"loss": 0.9811, "grad_norm": 1.2005155086517334, "learning_rate": 0.0002, "epoch": 5.106713232440823, "step": 6580}, {"loss": 0.9862, "grad_norm": 1.181443452835083, "learning_rate": 0.0002, "epoch": 5.114474194800155, "step": 6590}, {"loss": 1.0291, "grad_norm": 2.3444771766662598, "learning_rate": 0.0002, "epoch": 5.122235157159488, "step": 6600}, {"loss": 1.0455, "grad_norm": 1.216988444328308, "learning_rate": 0.0002, "epoch": 5.12999611951882, "step": 6610}, {"loss": 1.0549, "grad_norm": 1.369553565979004, "learning_rate": 0.0002, "epoch": 5.137757081878153, "step": 6620}, {"loss": 1.0056, "grad_norm": 1.177964687347412, "learning_rate": 0.0002, "epoch": 5.145518044237486, "step": 6630}, {"loss": 1.1025, "grad_norm": 1.1397041082382202, "learning_rate": 0.0002, "epoch": 5.153279006596818, "step": 6640}, {"loss": 1.0437, "grad_norm": 1.3976861238479614, "learning_rate": 0.0002, "epoch": 5.161039968956151, "step": 6650}, {"loss": 1.0454, "grad_norm": 1.4824495315551758, "learning_rate": 0.0002, "epoch": 5.1688009313154835, "step": 6660}, {"loss": 1.0356, "grad_norm": 1.2653018236160278, "learning_rate": 0.0002, "epoch": 5.1765618936748155, "step": 6670}, {"loss": 0.9971, "grad_norm": 1.3106069564819336, "learning_rate": 0.0002, "epoch": 5.184322856034148, "step": 6680}, {"loss": 1.0561, "grad_norm": 1.3140279054641724, "learning_rate": 0.0002, "epoch": 5.192083818393481, "step": 6690}, {"loss": 1.0618, "grad_norm": 1.3900256156921387, "learning_rate": 0.0002, "epoch": 5.199844780752813, "step": 6700}, {"loss": 1.0285, "grad_norm": 1.3191124200820923, "learning_rate": 0.0002, "epoch": 5.207605743112146, "step": 6710}, {"loss": 0.9921, "grad_norm": 1.176107406616211, "learning_rate": 0.0002, "epoch": 5.215366705471478, "step": 6720}, {"loss": 1.064, "grad_norm": 1.2364883422851562, "learning_rate": 0.0002, "epoch": 5.223127667830811, "step": 6730}, {"loss": 0.9599, "grad_norm": 1.343022108078003, "learning_rate": 0.0002, "epoch": 5.230888630190144, "step": 6740}, {"loss": 1.0342, "grad_norm": 1.2826898097991943, "learning_rate": 0.0002, "epoch": 5.238649592549476, "step": 6750}, {"loss": 1.0703, "grad_norm": 1.500257134437561, "learning_rate": 0.0002, "epoch": 5.246410554908809, "step": 6760}, {"loss": 1.0114, "grad_norm": 1.2605743408203125, "learning_rate": 0.0002, "epoch": 5.254171517268142, "step": 6770}, {"loss": 1.0825, "grad_norm": 1.2355525493621826, "learning_rate": 0.0002, "epoch": 5.261932479627474, "step": 6780}, {"loss": 1.0436, "grad_norm": 1.2845789194107056, "learning_rate": 0.0002, "epoch": 5.2696934419868064, "step": 6790}, {"loss": 0.989, "grad_norm": 1.3696625232696533, "learning_rate": 0.0002, "epoch": 5.277454404346139, "step": 6800}, {"loss": 1.0991, "grad_norm": 1.4051260948181152, "learning_rate": 0.0002, "epoch": 5.285215366705471, "step": 6810}, {"loss": 1.0987, "grad_norm": 1.266725778579712, "learning_rate": 0.0002, "epoch": 5.292976329064804, "step": 6820}, {"loss": 1.0489, "grad_norm": 1.3475236892700195, "learning_rate": 0.0002, "epoch": 5.300737291424136, "step": 6830}, {"loss": 1.0264, "grad_norm": 1.54409921169281, "learning_rate": 0.0002, "epoch": 5.308498253783469, "step": 6840}, {"loss": 1.033, "grad_norm": 1.2391985654830933, "learning_rate": 0.0002, "epoch": 5.316259216142802, "step": 6850}, {"loss": 1.1058, "grad_norm": 1.2435699701309204, "learning_rate": 0.0002, "epoch": 5.324020178502134, "step": 6860}, {"loss": 1.0179, "grad_norm": 1.8803037405014038, "learning_rate": 0.0002, "epoch": 5.331781140861467, "step": 6870}, {"loss": 0.997, "grad_norm": 1.4195542335510254, "learning_rate": 0.0002, "epoch": 5.3395421032208, "step": 6880}, {"loss": 1.0273, "grad_norm": 1.1853394508361816, "learning_rate": 0.0002, "epoch": 5.347303065580132, "step": 6890}, {"loss": 1.0668, "grad_norm": 1.4016530513763428, "learning_rate": 0.0002, "epoch": 5.355064027939465, "step": 6900}, {"loss": 1.1099, "grad_norm": 1.294339895248413, "learning_rate": 0.0002, "epoch": 5.3628249902987974, "step": 6910}, {"loss": 1.0724, "grad_norm": 1.2952708005905151, "learning_rate": 0.0002, "epoch": 5.370585952658129, "step": 6920}, {"loss": 1.0098, "grad_norm": 1.1361510753631592, "learning_rate": 0.0002, "epoch": 5.378346915017462, "step": 6930}, {"loss": 1.0796, "grad_norm": 1.125805377960205, "learning_rate": 0.0002, "epoch": 5.386107877376794, "step": 6940}, {"loss": 1.122, "grad_norm": 1.1453300714492798, "learning_rate": 0.0002, "epoch": 5.393868839736127, "step": 6950}, {"loss": 1.0977, "grad_norm": 1.4542768001556396, "learning_rate": 0.0002, "epoch": 5.40162980209546, "step": 6960}, {"loss": 1.0825, "grad_norm": 1.2360988855361938, "learning_rate": 0.0002, "epoch": 5.409390764454792, "step": 6970}, {"loss": 1.0631, "grad_norm": 1.2182754278182983, "learning_rate": 0.0002, "epoch": 5.417151726814125, "step": 6980}, {"loss": 1.0471, "grad_norm": 1.2018693685531616, "learning_rate": 0.0002, "epoch": 5.424912689173458, "step": 6990}, {"loss": 1.108, "grad_norm": 1.346124291419983, "learning_rate": 0.0002, "epoch": 5.43267365153279, "step": 7000}, {"loss": 1.0534, "grad_norm": 1.2534189224243164, "learning_rate": 0.0002, "epoch": 5.440434613892123, "step": 7010}, {"loss": 1.0696, "grad_norm": 1.2033339738845825, "learning_rate": 0.0002, "epoch": 5.448195576251456, "step": 7020}, {"loss": 1.0714, "grad_norm": 1.2788134813308716, "learning_rate": 0.0002, "epoch": 5.4559565386107876, "step": 7030}, {"loss": 1.1274, "grad_norm": 1.2751542329788208, "learning_rate": 0.0002, "epoch": 5.46371750097012, "step": 7040}, {"loss": 1.0767, "grad_norm": 1.3237019777297974, "learning_rate": 0.0002, "epoch": 5.471478463329452, "step": 7050}, {"loss": 1.1081, "grad_norm": 1.4932852983474731, "learning_rate": 0.0002, "epoch": 5.479239425688785, "step": 7060}, {"loss": 1.0197, "grad_norm": 1.4003876447677612, "learning_rate": 0.0002, "epoch": 5.487000388048118, "step": 7070}, {"loss": 1.0662, "grad_norm": 1.404799461364746, "learning_rate": 0.0002, "epoch": 5.49476135040745, "step": 7080}, {"loss": 1.0354, "grad_norm": 1.4486982822418213, "learning_rate": 0.0002, "epoch": 5.502522312766783, "step": 7090}, {"loss": 1.0645, "grad_norm": 1.1713480949401855, "learning_rate": 0.0002, "epoch": 5.510283275126116, "step": 7100}, {"loss": 1.006, "grad_norm": 1.4062601327896118, "learning_rate": 0.0002, "epoch": 5.518044237485448, "step": 7110}, {"loss": 1.0459, "grad_norm": 1.211629867553711, "learning_rate": 0.0002, "epoch": 5.525805199844781, "step": 7120}, {"loss": 1.102, "grad_norm": 1.2523176670074463, "learning_rate": 0.0002, "epoch": 5.533566162204114, "step": 7130}, {"loss": 1.1132, "grad_norm": 1.4467198848724365, "learning_rate": 0.0002, "epoch": 5.541327124563446, "step": 7140}, {"loss": 1.1557, "grad_norm": 1.5961614847183228, "learning_rate": 0.0002, "epoch": 5.5490880869227786, "step": 7150}, {"loss": 1.0859, "grad_norm": 1.320656418800354, "learning_rate": 0.0002, "epoch": 5.5568490492821105, "step": 7160}, {"loss": 1.109, "grad_norm": 1.2423332929611206, "learning_rate": 0.0002, "epoch": 5.564610011641443, "step": 7170}, {"loss": 1.0046, "grad_norm": 1.2919669151306152, "learning_rate": 0.0002, "epoch": 5.572370974000776, "step": 7180}, {"loss": 1.046, "grad_norm": 1.1678385734558105, "learning_rate": 0.0002, "epoch": 5.580131936360108, "step": 7190}, {"loss": 1.1011, "grad_norm": 1.4250764846801758, "learning_rate": 0.0002, "epoch": 5.587892898719441, "step": 7200}, {"loss": 1.1254, "grad_norm": 1.5308716297149658, "learning_rate": 0.0002, "epoch": 5.595653861078774, "step": 7210}, {"loss": 1.121, "grad_norm": 1.2678815126419067, "learning_rate": 0.0002, "epoch": 5.603414823438106, "step": 7220}, {"loss": 1.0846, "grad_norm": 1.127856969833374, "learning_rate": 0.0002, "epoch": 5.611175785797439, "step": 7230}, {"loss": 1.0647, "grad_norm": 1.3832560777664185, "learning_rate": 0.0002, "epoch": 5.618936748156772, "step": 7240}, {"loss": 1.0658, "grad_norm": 1.3226919174194336, "learning_rate": 0.0002, "epoch": 5.626697710516104, "step": 7250}, {"loss": 1.1175, "grad_norm": 1.3418006896972656, "learning_rate": 0.0002, "epoch": 5.634458672875437, "step": 7260}, {"loss": 1.0956, "grad_norm": 1.2625300884246826, "learning_rate": 0.0002, "epoch": 5.642219635234769, "step": 7270}, {"loss": 1.067, "grad_norm": 1.1579464673995972, "learning_rate": 0.0002, "epoch": 5.6499805975941015, "step": 7280}, {"loss": 1.0447, "grad_norm": 1.4998650550842285, "learning_rate": 0.0002, "epoch": 5.657741559953434, "step": 7290}, {"loss": 1.1256, "grad_norm": 1.2670758962631226, "learning_rate": 0.0002, "epoch": 5.665502522312766, "step": 7300}, {"loss": 1.1267, "grad_norm": 1.2959760427474976, "learning_rate": 0.0002, "epoch": 5.673263484672099, "step": 7310}, {"loss": 1.1387, "grad_norm": 1.2460671663284302, "learning_rate": 0.0002, "epoch": 5.681024447031432, "step": 7320}, {"loss": 1.0756, "grad_norm": 1.1313989162445068, "learning_rate": 0.0002, "epoch": 5.688785409390764, "step": 7330}, {"loss": 1.0618, "grad_norm": 1.282527208328247, "learning_rate": 0.0002, "epoch": 5.696546371750097, "step": 7340}, {"loss": 1.1315, "grad_norm": 1.3380206823349, "learning_rate": 0.0002, "epoch": 5.70430733410943, "step": 7350}, {"loss": 1.0949, "grad_norm": 1.1648279428482056, "learning_rate": 0.0002, "epoch": 5.712068296468762, "step": 7360}, {"loss": 1.1705, "grad_norm": 1.3059816360473633, "learning_rate": 0.0002, "epoch": 5.719829258828095, "step": 7370}, {"loss": 1.1496, "grad_norm": 1.1905046701431274, "learning_rate": 0.0002, "epoch": 5.727590221187427, "step": 7380}, {"loss": 1.1356, "grad_norm": 1.4089630842208862, "learning_rate": 0.0002, "epoch": 5.73535118354676, "step": 7390}, {"loss": 1.1349, "grad_norm": 1.256721019744873, "learning_rate": 0.0002, "epoch": 5.7431121459060925, "step": 7400}, {"loss": 1.0682, "grad_norm": 1.1915162801742554, "learning_rate": 0.0002, "epoch": 5.7508731082654245, "step": 7410}, {"loss": 1.1257, "grad_norm": 1.1935480833053589, "learning_rate": 0.0002, "epoch": 5.758634070624757, "step": 7420}, {"loss": 1.1348, "grad_norm": 1.1761008501052856, "learning_rate": 0.0002, "epoch": 5.76639503298409, "step": 7430}, {"loss": 1.0837, "grad_norm": 1.2540549039840698, "learning_rate": 0.0002, "epoch": 5.774155995343422, "step": 7440}, {"loss": 1.1527, "grad_norm": 1.5295120477676392, "learning_rate": 0.0002, "epoch": 5.781916957702755, "step": 7450}, {"loss": 1.1146, "grad_norm": 1.1081160306930542, "learning_rate": 0.0002, "epoch": 5.789677920062088, "step": 7460}, {"loss": 1.1304, "grad_norm": 1.4381253719329834, "learning_rate": 0.0002, "epoch": 5.79743888242142, "step": 7470}, {"loss": 1.0684, "grad_norm": 1.3079341650009155, "learning_rate": 0.0002, "epoch": 5.805199844780753, "step": 7480}, {"loss": 1.0544, "grad_norm": 1.1372792720794678, "learning_rate": 0.0002, "epoch": 5.812960807140085, "step": 7490}, {"loss": 1.1622, "grad_norm": 1.3221744298934937, "learning_rate": 0.0002, "epoch": 5.820721769499418, "step": 7500}, {"loss": 1.1515, "grad_norm": 1.3436939716339111, "learning_rate": 0.0002, "epoch": 5.828482731858751, "step": 7510}, {"loss": 1.1154, "grad_norm": 1.3916879892349243, "learning_rate": 0.0002, "epoch": 5.8362436942180835, "step": 7520}, {"loss": 1.0816, "grad_norm": 1.2463704347610474, "learning_rate": 0.0002, "epoch": 5.8440046565774155, "step": 7530}, {"loss": 1.0745, "grad_norm": 1.097051739692688, "learning_rate": 0.0002, "epoch": 5.851765618936748, "step": 7540}, {"loss": 1.1454, "grad_norm": 1.1554739475250244, "learning_rate": 0.0002, "epoch": 5.85952658129608, "step": 7550}, {"loss": 1.0953, "grad_norm": 1.2384694814682007, "learning_rate": 0.0002, "epoch": 5.867287543655413, "step": 7560}, {"loss": 1.1734, "grad_norm": 1.142815351486206, "learning_rate": 0.0002, "epoch": 5.875048506014746, "step": 7570}, {"loss": 1.162, "grad_norm": 1.3637062311172485, "learning_rate": 0.0002, "epoch": 5.882809468374078, "step": 7580}, {"loss": 1.0781, "grad_norm": 1.2449073791503906, "learning_rate": 0.0002, "epoch": 5.890570430733411, "step": 7590}, {"loss": 1.1191, "grad_norm": 1.358058214187622, "learning_rate": 0.0002, "epoch": 5.898331393092743, "step": 7600}, {"loss": 1.0779, "grad_norm": 1.264655351638794, "learning_rate": 0.0002, "epoch": 5.906092355452076, "step": 7610}, {"loss": 1.1538, "grad_norm": 1.3186019659042358, "learning_rate": 0.0002, "epoch": 5.913853317811409, "step": 7620}, {"loss": 1.1076, "grad_norm": 1.4111460447311401, "learning_rate": 0.0002, "epoch": 5.921614280170742, "step": 7630}, {"loss": 1.1765, "grad_norm": 1.1078972816467285, "learning_rate": 0.0002, "epoch": 5.929375242530074, "step": 7640}, {"loss": 1.1305, "grad_norm": 1.2742213010787964, "learning_rate": 0.0002, "epoch": 5.9371362048894065, "step": 7650}, {"loss": 1.144, "grad_norm": 1.3412781953811646, "learning_rate": 0.0002, "epoch": 5.9448971672487385, "step": 7660}, {"loss": 1.1642, "grad_norm": 1.123005986213684, "learning_rate": 0.0002, "epoch": 5.952658129608071, "step": 7670}, {"loss": 1.0732, "grad_norm": 1.2203444242477417, "learning_rate": 0.0002, "epoch": 5.960419091967404, "step": 7680}, {"loss": 1.158, "grad_norm": 1.341011643409729, "learning_rate": 0.0002, "epoch": 5.968180054326736, "step": 7690}, {"loss": 1.1144, "grad_norm": 1.2689454555511475, "learning_rate": 0.0002, "epoch": 5.975941016686069, "step": 7700}, {"loss": 1.2051, "grad_norm": 1.1518112421035767, "learning_rate": 0.0002, "epoch": 5.983701979045401, "step": 7710}, {"loss": 1.1868, "grad_norm": 1.3698320388793945, "learning_rate": 0.0002, "epoch": 5.991462941404734, "step": 7720}, {"loss": 1.0651, "grad_norm": 1.2812788486480713, "learning_rate": 0.0002, "epoch": 5.999223903764067, "step": 7730}, {"eval_loss": 2.252762794494629, "eval_runtime": 114.8471, "eval_samples_per_second": 4.415, "eval_steps_per_second": 0.557, "epoch": 6.0, "step": 7731}, {"loss": 0.8629, "grad_norm": 1.8642009496688843, "learning_rate": 0.0002, "epoch": 6.006984866123399, "step": 7740}, {"loss": 0.8435, "grad_norm": 1.7081232070922852, "learning_rate": 0.0002, "epoch": 6.014745828482732, "step": 7750}, {"loss": 0.7729, "grad_norm": 1.6233899593353271, "learning_rate": 0.0002, "epoch": 6.022506790842065, "step": 7760}, {"loss": 0.7907, "grad_norm": 1.5111888647079468, "learning_rate": 0.0002, "epoch": 6.030267753201397, "step": 7770}, {"loss": 0.7908, "grad_norm": 1.5278418064117432, "learning_rate": 0.0002, "epoch": 6.0380287155607295, "step": 7780}, {"loss": 0.835, "grad_norm": 1.5932185649871826, "learning_rate": 0.0002, "epoch": 6.045789677920062, "step": 7790}, {"loss": 0.7682, "grad_norm": 1.5990597009658813, "learning_rate": 0.0002, "epoch": 6.053550640279394, "step": 7800}, {"loss": 0.8559, "grad_norm": 1.7498669624328613, "learning_rate": 0.0002, "epoch": 6.061311602638727, "step": 7810}, {"loss": 0.8069, "grad_norm": 1.6105555295944214, "learning_rate": 0.0002, "epoch": 6.06907256499806, "step": 7820}, {"loss": 0.8473, "grad_norm": 1.5214293003082275, "learning_rate": 0.0002, "epoch": 6.076833527357392, "step": 7830}, {"loss": 0.8328, "grad_norm": 1.6586973667144775, "learning_rate": 0.0002, "epoch": 6.084594489716725, "step": 7840}, {"loss": 0.8415, "grad_norm": 1.467391848564148, "learning_rate": 0.0002, "epoch": 6.092355452076057, "step": 7850}, {"loss": 0.8274, "grad_norm": 1.537361741065979, "learning_rate": 0.0002, "epoch": 6.10011641443539, "step": 7860}, {"loss": 0.8011, "grad_norm": 1.621764898300171, "learning_rate": 0.0002, "epoch": 6.107877376794723, "step": 7870}, {"loss": 0.8556, "grad_norm": 1.583751916885376, "learning_rate": 0.0002, "epoch": 6.115638339154055, "step": 7880}, {"loss": 0.8829, "grad_norm": 1.6199619770050049, "learning_rate": 0.0002, "epoch": 6.123399301513388, "step": 7890}, {"loss": 0.8226, "grad_norm": 1.6163095235824585, "learning_rate": 0.0002, "epoch": 6.1311602638727205, "step": 7900}, {"loss": 0.8203, "grad_norm": 1.6120976209640503, "learning_rate": 0.0002, "epoch": 6.1389212262320525, "step": 7910}, {"loss": 0.7915, "grad_norm": 1.7886850833892822, "learning_rate": 0.0002, "epoch": 6.146682188591385, "step": 7920}, {"loss": 0.7808, "grad_norm": 1.408303141593933, "learning_rate": 0.0002, "epoch": 6.154443150950718, "step": 7930}, {"loss": 0.8404, "grad_norm": 1.6048113107681274, "learning_rate": 0.0002, "epoch": 6.16220411331005, "step": 7940}, {"loss": 0.8705, "grad_norm": 1.424306869506836, "learning_rate": 0.0002, "epoch": 6.169965075669383, "step": 7950}, {"loss": 0.8177, "grad_norm": 1.4453672170639038, "learning_rate": 0.0002, "epoch": 6.177726038028716, "step": 7960}, {"loss": 0.8182, "grad_norm": 1.3157061338424683, "learning_rate": 0.0002, "epoch": 6.185487000388048, "step": 7970}, {"loss": 0.891, "grad_norm": 1.330541729927063, "learning_rate": 0.0002, "epoch": 6.193247962747381, "step": 7980}, {"loss": 0.8599, "grad_norm": 1.6306229829788208, "learning_rate": 0.0002, "epoch": 6.201008925106713, "step": 7990}, {"loss": 0.9069, "grad_norm": 1.6332136392593384, "learning_rate": 0.0002, "epoch": 6.208769887466046, "step": 8000}, {"loss": 0.83, "grad_norm": 1.708613395690918, "learning_rate": 0.0002, "epoch": 6.216530849825379, "step": 8010}, {"loss": 0.8509, "grad_norm": 1.6637346744537354, "learning_rate": 0.0002, "epoch": 6.224291812184711, "step": 8020}, {"loss": 0.84, "grad_norm": 1.5675315856933594, "learning_rate": 0.0002, "epoch": 6.2320527745440435, "step": 8030}, {"loss": 0.8491, "grad_norm": 1.5826327800750732, "learning_rate": 0.0002, "epoch": 6.239813736903376, "step": 8040}, {"loss": 0.8374, "grad_norm": 1.7382984161376953, "learning_rate": 0.0002, "epoch": 6.247574699262708, "step": 8050}, {"loss": 0.8795, "grad_norm": 1.5272295475006104, "learning_rate": 0.0002, "epoch": 6.255335661622041, "step": 8060}, {"loss": 0.8745, "grad_norm": 1.8195022344589233, "learning_rate": 0.0002, "epoch": 6.263096623981374, "step": 8070}, {"loss": 0.8743, "grad_norm": 1.679901361465454, "learning_rate": 0.0002, "epoch": 6.270857586340706, "step": 8080}, {"loss": 0.9006, "grad_norm": 1.4921348094940186, "learning_rate": 0.0002, "epoch": 6.278618548700039, "step": 8090}, {"loss": 0.899, "grad_norm": 1.4627857208251953, "learning_rate": 0.0002, "epoch": 6.286379511059371, "step": 8100}, {"loss": 0.8944, "grad_norm": 1.3528631925582886, "learning_rate": 0.0002, "epoch": 6.294140473418704, "step": 8110}, {"loss": 0.9355, "grad_norm": 1.6863102912902832, "learning_rate": 0.0002, "epoch": 6.301901435778037, "step": 8120}, {"loss": 0.8764, "grad_norm": 1.6178052425384521, "learning_rate": 0.0002, "epoch": 6.309662398137369, "step": 8130}, {"loss": 0.9182, "grad_norm": 1.7626280784606934, "learning_rate": 0.0002, "epoch": 6.317423360496702, "step": 8140}, {"loss": 0.8886, "grad_norm": 1.7188845872879028, "learning_rate": 0.0002, "epoch": 6.3251843228560345, "step": 8150}, {"loss": 0.895, "grad_norm": 1.5777133703231812, "learning_rate": 0.0002, "epoch": 6.3329452852153665, "step": 8160}, {"loss": 0.9247, "grad_norm": 1.7653207778930664, "learning_rate": 0.0002, "epoch": 6.340706247574699, "step": 8170}, {"loss": 0.8003, "grad_norm": 1.6861237287521362, "learning_rate": 0.0002, "epoch": 6.348467209934032, "step": 8180}, {"loss": 0.884, "grad_norm": 1.6318124532699585, "learning_rate": 0.0002, "epoch": 6.356228172293364, "step": 8190}, {"loss": 0.8341, "grad_norm": 1.6192939281463623, "learning_rate": 0.0002, "epoch": 6.363989134652697, "step": 8200}, {"loss": 0.8939, "grad_norm": 1.7641773223876953, "learning_rate": 0.0002, "epoch": 6.371750097012029, "step": 8210}, {"loss": 0.8582, "grad_norm": 1.6470493078231812, "learning_rate": 0.0002, "epoch": 6.379511059371362, "step": 8220}, {"loss": 0.9351, "grad_norm": 1.5898468494415283, "learning_rate": 0.0002, "epoch": 6.387272021730695, "step": 8230}, {"loss": 0.9658, "grad_norm": 1.8025981187820435, "learning_rate": 0.0002, "epoch": 6.395032984090027, "step": 8240}, {"loss": 0.8953, "grad_norm": 1.7035106420516968, "learning_rate": 0.0002, "epoch": 6.40279394644936, "step": 8250}, {"loss": 0.9193, "grad_norm": 1.5968799591064453, "learning_rate": 0.0002, "epoch": 6.410554908808693, "step": 8260}, {"loss": 0.929, "grad_norm": 1.7492800951004028, "learning_rate": 0.0002, "epoch": 6.418315871168025, "step": 8270}, {"loss": 0.9297, "grad_norm": 1.6914138793945312, "learning_rate": 0.0002, "epoch": 6.4260768335273575, "step": 8280}, {"loss": 0.8878, "grad_norm": 1.5761380195617676, "learning_rate": 0.0002, "epoch": 6.43383779588669, "step": 8290}, {"loss": 0.8761, "grad_norm": 1.5164411067962646, "learning_rate": 0.0002, "epoch": 6.441598758246022, "step": 8300}, {"loss": 0.88, "grad_norm": 1.6600215435028076, "learning_rate": 0.0002, "epoch": 6.449359720605355, "step": 8310}, {"loss": 0.9113, "grad_norm": 1.2477679252624512, "learning_rate": 0.0002, "epoch": 6.457120682964687, "step": 8320}, {"loss": 0.8822, "grad_norm": 1.3698599338531494, "learning_rate": 0.0002, "epoch": 6.46488164532402, "step": 8330}, {"loss": 0.9295, "grad_norm": 1.4847341775894165, "learning_rate": 0.0002, "epoch": 6.472642607683353, "step": 8340}, {"loss": 0.9243, "grad_norm": 1.4713412523269653, "learning_rate": 0.0002, "epoch": 6.480403570042685, "step": 8350}, {"loss": 0.9102, "grad_norm": 1.334523320198059, "learning_rate": 0.0002, "epoch": 6.488164532402018, "step": 8360}, {"loss": 0.8563, "grad_norm": 2.0054359436035156, "learning_rate": 0.0002, "epoch": 6.495925494761351, "step": 8370}, {"loss": 0.9759, "grad_norm": 1.560014247894287, "learning_rate": 0.0002, "epoch": 6.503686457120683, "step": 8380}, {"loss": 0.8542, "grad_norm": 1.518526554107666, "learning_rate": 0.0002, "epoch": 6.511447419480016, "step": 8390}, {"loss": 0.937, "grad_norm": 1.3841272592544556, "learning_rate": 0.0002, "epoch": 6.5192083818393485, "step": 8400}, {"loss": 0.9576, "grad_norm": 1.5191527605056763, "learning_rate": 0.0002, "epoch": 6.5269693441986805, "step": 8410}, {"loss": 0.8899, "grad_norm": 1.5275579690933228, "learning_rate": 0.0002, "epoch": 6.534730306558013, "step": 8420}, {"loss": 0.9291, "grad_norm": 1.621590256690979, "learning_rate": 0.0002, "epoch": 6.542491268917345, "step": 8430}, {"loss": 0.9011, "grad_norm": 1.7939082384109497, "learning_rate": 0.0002, "epoch": 6.550252231276678, "step": 8440}, {"loss": 0.8896, "grad_norm": 1.4542964696884155, "learning_rate": 0.0002, "epoch": 6.558013193636011, "step": 8450}, {"loss": 0.9393, "grad_norm": 1.5458455085754395, "learning_rate": 0.0002, "epoch": 6.565774155995343, "step": 8460}, {"loss": 0.9028, "grad_norm": 1.550359845161438, "learning_rate": 0.0002, "epoch": 6.573535118354676, "step": 8470}, {"loss": 0.9271, "grad_norm": 1.527757167816162, "learning_rate": 0.0002, "epoch": 6.581296080714009, "step": 8480}, {"loss": 0.966, "grad_norm": 1.4683486223220825, "learning_rate": 0.0002, "epoch": 6.589057043073341, "step": 8490}, {"loss": 0.9079, "grad_norm": 1.5057084560394287, "learning_rate": 0.0002, "epoch": 6.596818005432674, "step": 8500}, {"loss": 0.9235, "grad_norm": 1.648289442062378, "learning_rate": 0.0002, "epoch": 6.604578967792007, "step": 8510}, {"loss": 0.9113, "grad_norm": 1.578914761543274, "learning_rate": 0.0002, "epoch": 6.612339930151339, "step": 8520}, {"loss": 0.8894, "grad_norm": 1.5064080953598022, "learning_rate": 0.0002, "epoch": 6.6201008925106715, "step": 8530}, {"loss": 0.8981, "grad_norm": 1.5717744827270508, "learning_rate": 0.0002, "epoch": 6.6278618548700035, "step": 8540}, {"loss": 0.887, "grad_norm": 1.7954767942428589, "learning_rate": 0.0002, "epoch": 6.635622817229336, "step": 8550}, {"loss": 0.927, "grad_norm": 1.6172343492507935, "learning_rate": 0.0002, "epoch": 6.643383779588669, "step": 8560}, {"loss": 0.9384, "grad_norm": 1.6627886295318604, "learning_rate": 0.0002, "epoch": 6.651144741948001, "step": 8570}, {"loss": 0.959, "grad_norm": 1.5264919996261597, "learning_rate": 0.0002, "epoch": 6.658905704307334, "step": 8580}, {"loss": 0.9103, "grad_norm": 1.609248161315918, "learning_rate": 0.0002, "epoch": 6.666666666666667, "step": 8590}, {"loss": 0.9395, "grad_norm": 1.5474581718444824, "learning_rate": 0.0002, "epoch": 6.674427629025999, "step": 8600}, {"loss": 0.9018, "grad_norm": 1.6294898986816406, "learning_rate": 0.0002, "epoch": 6.682188591385332, "step": 8610}, {"loss": 0.9323, "grad_norm": 1.612615942955017, "learning_rate": 0.0002, "epoch": 6.689949553744665, "step": 8620}, {"loss": 0.9218, "grad_norm": 1.741325855255127, "learning_rate": 0.0002, "epoch": 6.697710516103997, "step": 8630}, {"loss": 1.0475, "grad_norm": 1.5089004039764404, "learning_rate": 0.0002, "epoch": 6.70547147846333, "step": 8640}, {"loss": 1.0009, "grad_norm": 1.4725582599639893, "learning_rate": 0.0002, "epoch": 6.713232440822662, "step": 8650}, {"loss": 0.9818, "grad_norm": 1.6992095708847046, "learning_rate": 0.0002, "epoch": 6.7209934031819945, "step": 8660}, {"loss": 0.9229, "grad_norm": 1.5938470363616943, "learning_rate": 0.0002, "epoch": 6.728754365541327, "step": 8670}, {"loss": 0.9411, "grad_norm": 1.58723783493042, "learning_rate": 0.0002, "epoch": 6.736515327900659, "step": 8680}, {"loss": 0.9738, "grad_norm": 1.514389991760254, "learning_rate": 0.0002, "epoch": 6.744276290259992, "step": 8690}, {"loss": 0.9283, "grad_norm": 1.6799157857894897, "learning_rate": 0.0002, "epoch": 6.752037252619325, "step": 8700}, {"loss": 0.9138, "grad_norm": 1.5436359643936157, "learning_rate": 0.0002, "epoch": 6.759798214978657, "step": 8710}, {"loss": 0.9678, "grad_norm": 1.477137565612793, "learning_rate": 0.0002, "epoch": 6.76755917733799, "step": 8720}, {"loss": 1.0044, "grad_norm": 1.7383503913879395, "learning_rate": 0.0002, "epoch": 6.775320139697323, "step": 8730}, {"loss": 0.9492, "grad_norm": 1.8000324964523315, "learning_rate": 0.0002, "epoch": 6.783081102056655, "step": 8740}, {"loss": 0.8943, "grad_norm": 1.3099453449249268, "learning_rate": 0.0002, "epoch": 6.790842064415988, "step": 8750}, {"loss": 0.9709, "grad_norm": 1.8775172233581543, "learning_rate": 0.0002, "epoch": 6.79860302677532, "step": 8760}, {"loss": 0.9356, "grad_norm": 1.5832085609436035, "learning_rate": 0.0002, "epoch": 6.806363989134653, "step": 8770}, {"loss": 0.9397, "grad_norm": 1.4903252124786377, "learning_rate": 0.0002, "epoch": 6.8141249514939854, "step": 8780}, {"loss": 0.9602, "grad_norm": 1.6360470056533813, "learning_rate": 0.0002, "epoch": 6.821885913853317, "step": 8790}, {"loss": 0.957, "grad_norm": 1.5457707643508911, "learning_rate": 0.0002, "epoch": 6.82964687621265, "step": 8800}, {"loss": 0.943, "grad_norm": 1.5449066162109375, "learning_rate": 0.0002, "epoch": 6.837407838571983, "step": 8810}, {"loss": 1.0007, "grad_norm": 1.618337631225586, "learning_rate": 0.0002, "epoch": 6.845168800931315, "step": 8820}, {"loss": 0.9314, "grad_norm": 1.38296639919281, "learning_rate": 0.0002, "epoch": 6.852929763290648, "step": 8830}, {"loss": 0.9349, "grad_norm": 1.6427991390228271, "learning_rate": 0.0002, "epoch": 6.860690725649981, "step": 8840}, {"loss": 1.0194, "grad_norm": 1.4980270862579346, "learning_rate": 0.0002, "epoch": 6.868451688009313, "step": 8850}, {"loss": 0.9541, "grad_norm": 1.3800020217895508, "learning_rate": 0.0002, "epoch": 6.876212650368646, "step": 8860}, {"loss": 1.0102, "grad_norm": 1.5971838235855103, "learning_rate": 0.0002, "epoch": 6.883973612727978, "step": 8870}, {"loss": 1.0105, "grad_norm": 1.4429489374160767, "learning_rate": 0.0002, "epoch": 6.891734575087311, "step": 8880}, {"loss": 0.9143, "grad_norm": 1.4959166049957275, "learning_rate": 0.0002, "epoch": 6.899495537446644, "step": 8890}, {"loss": 0.9403, "grad_norm": 1.5776222944259644, "learning_rate": 0.0002, "epoch": 6.907256499805976, "step": 8900}, {"loss": 0.9256, "grad_norm": 1.510412573814392, "learning_rate": 0.0002, "epoch": 6.915017462165308, "step": 8910}, {"loss": 1.0095, "grad_norm": 1.7216295003890991, "learning_rate": 0.0002, "epoch": 6.922778424524641, "step": 8920}, {"loss": 0.9464, "grad_norm": 1.830762505531311, "learning_rate": 0.0002, "epoch": 6.930539386883973, "step": 8930}, {"loss": 0.9704, "grad_norm": 1.3472434282302856, "learning_rate": 0.0002, "epoch": 6.938300349243306, "step": 8940}, {"loss": 0.9718, "grad_norm": 1.5748040676116943, "learning_rate": 0.0002, "epoch": 6.946061311602639, "step": 8950}, {"loss": 0.9891, "grad_norm": 1.5317506790161133, "learning_rate": 0.0002, "epoch": 6.953822273961971, "step": 8960}, {"loss": 0.9513, "grad_norm": 1.5565721988677979, "learning_rate": 0.0002, "epoch": 6.961583236321304, "step": 8970}, {"loss": 0.9118, "grad_norm": 1.5288970470428467, "learning_rate": 0.0002, "epoch": 6.969344198680636, "step": 8980}, {"loss": 0.9789, "grad_norm": 1.562624454498291, "learning_rate": 0.0002, "epoch": 6.977105161039969, "step": 8990}, {"loss": 0.9929, "grad_norm": 1.3777633905410767, "learning_rate": 0.0002, "epoch": 6.984866123399302, "step": 9000}, {"loss": 0.9713, "grad_norm": 1.5868972539901733, "learning_rate": 0.0002, "epoch": 6.992627085758635, "step": 9010}, {"eval_loss": 2.4372169971466064, "eval_runtime": 113.8966, "eval_samples_per_second": 4.451, "eval_steps_per_second": 0.562, "epoch": 6.9996119518820334, "step": 9019}, {"loss": 0.9911, "grad_norm": 1.3035310506820679, "learning_rate": 0.0002, "epoch": 7.0003880481179666, "step": 9020}, {"loss": 0.6585, "grad_norm": 1.5640218257904053, "learning_rate": 0.0002, "epoch": 7.008149010477299, "step": 9030}, {"loss": 0.6507, "grad_norm": 1.9529098272323608, "learning_rate": 0.0002, "epoch": 7.015909972836631, "step": 9040}, {"loss": 0.6335, "grad_norm": 1.6257457733154297, "learning_rate": 0.0002, "epoch": 7.023670935195964, "step": 9050}, {"loss": 0.6752, "grad_norm": 1.8028602600097656, "learning_rate": 0.0002, "epoch": 7.031431897555297, "step": 9060}, {"loss": 0.66, "grad_norm": 1.4882043600082397, "learning_rate": 0.0002, "epoch": 7.039192859914629, "step": 9070}, {"loss": 0.7117, "grad_norm": 2.10062837600708, "learning_rate": 0.0002, "epoch": 7.046953822273962, "step": 9080}, {"loss": 0.643, "grad_norm": 1.6754050254821777, "learning_rate": 0.0002, "epoch": 7.054714784633295, "step": 9090}, {"loss": 0.6461, "grad_norm": 1.9425220489501953, "learning_rate": 0.0002, "epoch": 7.062475746992627, "step": 9100}, {"loss": 0.6441, "grad_norm": 1.9451842308044434, "learning_rate": 0.0002, "epoch": 7.07023670935196, "step": 9110}, {"loss": 0.6872, "grad_norm": 2.203806161880493, "learning_rate": 0.0002, "epoch": 7.077997671711292, "step": 9120}, {"loss": 0.6768, "grad_norm": 1.7248806953430176, "learning_rate": 0.0002, "epoch": 7.085758634070625, "step": 9130}, {"loss": 0.6911, "grad_norm": 1.5713436603546143, "learning_rate": 0.0002, "epoch": 7.0935195964299576, "step": 9140}, {"loss": 0.6689, "grad_norm": 2.0308637619018555, "learning_rate": 0.0002, "epoch": 7.1012805587892895, "step": 9150}, {"loss": 0.6617, "grad_norm": 1.7522761821746826, "learning_rate": 0.0002, "epoch": 7.109041521148622, "step": 9160}, {"loss": 0.669, "grad_norm": 1.9685192108154297, "learning_rate": 0.0002, "epoch": 7.116802483507955, "step": 9170}, {"loss": 0.6727, "grad_norm": 2.069120407104492, "learning_rate": 0.0002, "epoch": 7.124563445867287, "step": 9180}, {"loss": 0.6553, "grad_norm": 1.7211129665374756, "learning_rate": 0.0002, "epoch": 7.13232440822662, "step": 9190}, {"loss": 0.6827, "grad_norm": 1.7535923719406128, "learning_rate": 0.0002, "epoch": 7.140085370585953, "step": 9200}, {"loss": 0.6698, "grad_norm": 1.7181583642959595, "learning_rate": 0.0002, "epoch": 7.147846332945285, "step": 9210}, {"loss": 0.7056, "grad_norm": 1.7778624296188354, "learning_rate": 0.0002, "epoch": 7.155607295304618, "step": 9220}, {"loss": 0.7326, "grad_norm": 1.8051576614379883, "learning_rate": 0.0002, "epoch": 7.16336825766395, "step": 9230}, {"loss": 0.6858, "grad_norm": 1.9704501628875732, "learning_rate": 0.0002, "epoch": 7.171129220023283, "step": 9240}, {"loss": 0.7029, "grad_norm": 1.8750483989715576, "learning_rate": 0.0002, "epoch": 7.178890182382616, "step": 9250}, {"loss": 0.7073, "grad_norm": 1.790107011795044, "learning_rate": 0.0002, "epoch": 7.186651144741948, "step": 9260}, {"loss": 0.6875, "grad_norm": 1.9681477546691895, "learning_rate": 0.0002, "epoch": 7.1944121071012805, "step": 9270}, {"loss": 0.7074, "grad_norm": 1.7811331748962402, "learning_rate": 0.0002, "epoch": 7.202173069460613, "step": 9280}, {"loss": 0.6959, "grad_norm": 1.9861894845962524, "learning_rate": 0.0002, "epoch": 7.209934031819945, "step": 9290}, {"loss": 0.6463, "grad_norm": 1.8615444898605347, "learning_rate": 0.0002, "epoch": 7.217694994179278, "step": 9300}, {"loss": 0.7071, "grad_norm": 1.899372935295105, "learning_rate": 0.0002, "epoch": 7.225455956538611, "step": 9310}, {"loss": 0.6883, "grad_norm": 1.8131160736083984, "learning_rate": 0.0002, "epoch": 7.233216918897943, "step": 9320}, {"loss": 0.6813, "grad_norm": 1.5020049810409546, "learning_rate": 0.0002, "epoch": 7.240977881257276, "step": 9330}, {"loss": 0.7197, "grad_norm": 1.7210577726364136, "learning_rate": 0.0002, "epoch": 7.248738843616608, "step": 9340}, {"loss": 0.6769, "grad_norm": 2.003021001815796, "learning_rate": 0.0002, "epoch": 7.256499805975941, "step": 9350}, {"loss": 0.6746, "grad_norm": 1.6632959842681885, "learning_rate": 0.0002, "epoch": 7.264260768335274, "step": 9360}, {"loss": 0.7072, "grad_norm": 1.7717185020446777, "learning_rate": 0.0002, "epoch": 7.272021730694606, "step": 9370}, {"loss": 0.7447, "grad_norm": 1.8554900884628296, "learning_rate": 0.0002, "epoch": 7.279782693053939, "step": 9380}, {"loss": 0.6983, "grad_norm": 1.889708399772644, "learning_rate": 0.0002, "epoch": 7.2875436554132715, "step": 9390}, {"loss": 0.7495, "grad_norm": 1.8426263332366943, "learning_rate": 0.0002, "epoch": 7.2953046177726035, "step": 9400}, {"loss": 0.6854, "grad_norm": 1.9665130376815796, "learning_rate": 0.0002, "epoch": 7.303065580131936, "step": 9410}, {"loss": 0.6466, "grad_norm": 1.9337282180786133, "learning_rate": 0.0002, "epoch": 7.310826542491269, "step": 9420}, {"loss": 0.6885, "grad_norm": 1.8582539558410645, "learning_rate": 0.0002, "epoch": 7.318587504850601, "step": 9430}, {"loss": 0.7366, "grad_norm": 1.8865947723388672, "learning_rate": 0.0002, "epoch": 7.326348467209934, "step": 9440}, {"loss": 0.7825, "grad_norm": 1.8144744634628296, "learning_rate": 0.0002, "epoch": 7.334109429569267, "step": 9450}, {"loss": 0.6706, "grad_norm": 1.6930001974105835, "learning_rate": 0.0002, "epoch": 7.341870391928599, "step": 9460}, {"loss": 0.7502, "grad_norm": 1.7389107942581177, "learning_rate": 0.0002, "epoch": 7.349631354287932, "step": 9470}, {"loss": 0.7264, "grad_norm": 1.6860785484313965, "learning_rate": 0.0002, "epoch": 7.357392316647264, "step": 9480}, {"loss": 0.7593, "grad_norm": 2.2142202854156494, "learning_rate": 0.0002, "epoch": 7.365153279006597, "step": 9490}, {"loss": 0.7936, "grad_norm": 1.9988185167312622, "learning_rate": 0.0002, "epoch": 7.37291424136593, "step": 9500}, {"loss": 0.7348, "grad_norm": 1.7517266273498535, "learning_rate": 0.0002, "epoch": 7.380675203725262, "step": 9510}, {"loss": 0.7432, "grad_norm": 1.7426788806915283, "learning_rate": 0.0002, "epoch": 7.3884361660845945, "step": 9520}, {"loss": 0.7649, "grad_norm": 1.8157157897949219, "learning_rate": 0.0002, "epoch": 7.396197128443927, "step": 9530}, {"loss": 0.716, "grad_norm": 1.9826514720916748, "learning_rate": 0.0002, "epoch": 7.403958090803259, "step": 9540}, {"loss": 0.7685, "grad_norm": 1.9057488441467285, "learning_rate": 0.0002, "epoch": 7.411719053162592, "step": 9550}, {"loss": 0.7709, "grad_norm": 2.0701088905334473, "learning_rate": 0.0002, "epoch": 7.419480015521925, "step": 9560}, {"loss": 0.7438, "grad_norm": 1.8623783588409424, "learning_rate": 0.0002, "epoch": 7.427240977881257, "step": 9570}, {"loss": 0.7719, "grad_norm": 1.780672550201416, "learning_rate": 0.0002, "epoch": 7.43500194024059, "step": 9580}, {"loss": 0.7579, "grad_norm": 1.8437316417694092, "learning_rate": 0.0002, "epoch": 7.442762902599922, "step": 9590}, {"loss": 0.8044, "grad_norm": 1.9327329397201538, "learning_rate": 0.0002, "epoch": 7.450523864959255, "step": 9600}, {"loss": 0.7212, "grad_norm": 2.011535167694092, "learning_rate": 0.0002, "epoch": 7.458284827318588, "step": 9610}, {"loss": 0.7586, "grad_norm": 1.7557756900787354, "learning_rate": 0.0002, "epoch": 7.46604578967792, "step": 9620}, {"loss": 0.7461, "grad_norm": 1.8508951663970947, "learning_rate": 0.0002, "epoch": 7.473806752037253, "step": 9630}, {"loss": 0.7425, "grad_norm": 1.8236663341522217, "learning_rate": 0.0002, "epoch": 7.4815677143965855, "step": 9640}, {"loss": 0.7107, "grad_norm": 1.9308991432189941, "learning_rate": 0.0002, "epoch": 7.4893286767559175, "step": 9650}, {"loss": 0.738, "grad_norm": 1.8095420598983765, "learning_rate": 0.0002, "epoch": 7.49708963911525, "step": 9660}, {"loss": 0.7548, "grad_norm": 1.8216804265975952, "learning_rate": 0.0002, "epoch": 7.504850601474583, "step": 9670}, {"loss": 0.7494, "grad_norm": 1.8275913000106812, "learning_rate": 0.0002, "epoch": 7.512611563833915, "step": 9680}, {"loss": 0.7178, "grad_norm": 1.8217864036560059, "learning_rate": 0.0002, "epoch": 7.520372526193248, "step": 9690}, {"loss": 0.7331, "grad_norm": 1.461728572845459, "learning_rate": 0.0002, "epoch": 7.52813348855258, "step": 9700}, {"loss": 0.7597, "grad_norm": 1.6569337844848633, "learning_rate": 0.0002, "epoch": 7.535894450911913, "step": 9710}, {"loss": 0.8117, "grad_norm": 2.332101821899414, "learning_rate": 0.0002, "epoch": 7.543655413271246, "step": 9720}, {"loss": 0.7937, "grad_norm": 1.760307788848877, "learning_rate": 0.0002, "epoch": 7.551416375630578, "step": 9730}, {"loss": 0.7213, "grad_norm": 2.2455198764801025, "learning_rate": 0.0002, "epoch": 7.559177337989911, "step": 9740}, {"loss": 0.8122, "grad_norm": 1.819676399230957, "learning_rate": 0.0002, "epoch": 7.566938300349244, "step": 9750}, {"loss": 0.754, "grad_norm": 1.579603910446167, "learning_rate": 0.0002, "epoch": 7.574699262708576, "step": 9760}, {"loss": 0.7608, "grad_norm": 1.9687446355819702, "learning_rate": 0.0002, "epoch": 7.5824602250679085, "step": 9770}, {"loss": 0.813, "grad_norm": 1.859029769897461, "learning_rate": 0.0002, "epoch": 7.590221187427241, "step": 9780}, {"loss": 0.7353, "grad_norm": 1.652137279510498, "learning_rate": 0.0002, "epoch": 7.597982149786573, "step": 9790}, {"loss": 0.7388, "grad_norm": 1.6381458044052124, "learning_rate": 0.0002, "epoch": 7.605743112145906, "step": 9800}, {"loss": 0.8265, "grad_norm": 1.8706856966018677, "learning_rate": 0.0002, "epoch": 7.613504074505238, "step": 9810}, {"loss": 0.7223, "grad_norm": 1.8709853887557983, "learning_rate": 0.0002, "epoch": 7.621265036864571, "step": 9820}, {"loss": 0.7972, "grad_norm": 2.1183695793151855, "learning_rate": 0.0002, "epoch": 7.629025999223904, "step": 9830}, {"loss": 0.7645, "grad_norm": 1.6770579814910889, "learning_rate": 0.0002, "epoch": 7.636786961583236, "step": 9840}, {"loss": 0.8197, "grad_norm": 2.0952677726745605, "learning_rate": 0.0002, "epoch": 7.644547923942569, "step": 9850}, {"loss": 0.8483, "grad_norm": 1.7378171682357788, "learning_rate": 0.0002, "epoch": 7.652308886301902, "step": 9860}, {"loss": 0.7942, "grad_norm": 2.0860157012939453, "learning_rate": 0.0002, "epoch": 7.660069848661234, "step": 9870}, {"loss": 0.769, "grad_norm": 1.6986967325210571, "learning_rate": 0.0002, "epoch": 7.667830811020567, "step": 9880}, {"loss": 0.7626, "grad_norm": 1.666702151298523, "learning_rate": 0.0002, "epoch": 7.6755917733798995, "step": 9890}, {"loss": 0.7418, "grad_norm": 2.446931838989258, "learning_rate": 0.0002, "epoch": 7.6833527357392315, "step": 9900}, {"loss": 0.8333, "grad_norm": 1.6984577178955078, "learning_rate": 0.0002, "epoch": 7.691113698098564, "step": 9910}, {"loss": 0.7822, "grad_norm": 2.0732316970825195, "learning_rate": 0.0002, "epoch": 7.698874660457896, "step": 9920}, {"loss": 0.7342, "grad_norm": 1.9884073734283447, "learning_rate": 0.0002, "epoch": 7.706635622817229, "step": 9930}, {"loss": 0.7825, "grad_norm": 1.8669427633285522, "learning_rate": 0.0002, "epoch": 7.714396585176562, "step": 9940}, {"loss": 0.7722, "grad_norm": 1.6163996458053589, "learning_rate": 0.0002, "epoch": 7.722157547535894, "step": 9950}, {"loss": 0.8069, "grad_norm": 2.4492127895355225, "learning_rate": 0.0002, "epoch": 7.729918509895227, "step": 9960}, {"loss": 0.7924, "grad_norm": 1.5625537633895874, "learning_rate": 0.0002, "epoch": 7.73767947225456, "step": 9970}, {"loss": 0.8273, "grad_norm": 1.8205251693725586, "learning_rate": 0.0002, "epoch": 7.745440434613892, "step": 9980}, {"loss": 0.7892, "grad_norm": 1.9902361631393433, "learning_rate": 0.0002, "epoch": 7.753201396973225, "step": 9990}, {"loss": 0.7871, "grad_norm": 1.8294041156768799, "learning_rate": 0.0002, "epoch": 7.760962359332558, "step": 10000}, {"loss": 0.771, "grad_norm": 1.8938281536102295, "learning_rate": 0.0002, "epoch": 7.76872332169189, "step": 10010}, {"loss": 0.8304, "grad_norm": 1.9367564916610718, "learning_rate": 0.0002, "epoch": 7.7764842840512225, "step": 10020}, {"loss": 0.7437, "grad_norm": 1.9130750894546509, "learning_rate": 0.0002, "epoch": 7.7842452464105545, "step": 10030}, {"loss": 0.8359, "grad_norm": 1.931505799293518, "learning_rate": 0.0002, "epoch": 7.792006208769887, "step": 10040}, {"loss": 0.7708, "grad_norm": 2.107954740524292, "learning_rate": 0.0002, "epoch": 7.79976717112922, "step": 10050}, {"loss": 0.7922, "grad_norm": 1.8736399412155151, "learning_rate": 0.0002, "epoch": 7.807528133488552, "step": 10060}, {"loss": 0.8143, "grad_norm": 1.6943566799163818, "learning_rate": 0.0002, "epoch": 7.815289095847885, "step": 10070}, {"loss": 0.7645, "grad_norm": 2.365346908569336, "learning_rate": 0.0002, "epoch": 7.823050058207218, "step": 10080}, {"loss": 0.815, "grad_norm": 2.0044665336608887, "learning_rate": 0.0002, "epoch": 7.83081102056655, "step": 10090}, {"loss": 0.8297, "grad_norm": 1.7680124044418335, "learning_rate": 0.0002, "epoch": 7.838571982925883, "step": 10100}, {"loss": 0.7729, "grad_norm": 1.8494547605514526, "learning_rate": 0.0002, "epoch": 7.846332945285216, "step": 10110}, {"loss": 0.7909, "grad_norm": 1.822107195854187, "learning_rate": 0.0002, "epoch": 7.854093907644548, "step": 10120}, {"loss": 0.8149, "grad_norm": 1.8191527128219604, "learning_rate": 0.0002, "epoch": 7.861854870003881, "step": 10130}, {"loss": 0.8045, "grad_norm": 1.8011466264724731, "learning_rate": 0.0002, "epoch": 7.869615832363213, "step": 10140}, {"loss": 0.79, "grad_norm": 1.7087100744247437, "learning_rate": 0.0002, "epoch": 7.8773767947225455, "step": 10150}, {"loss": 0.8081, "grad_norm": 1.7698721885681152, "learning_rate": 0.0002, "epoch": 7.885137757081878, "step": 10160}, {"loss": 0.7365, "grad_norm": 1.9578531980514526, "learning_rate": 0.0002, "epoch": 7.89289871944121, "step": 10170}, {"loss": 0.8497, "grad_norm": 1.7660179138183594, "learning_rate": 0.0002, "epoch": 7.900659681800543, "step": 10180}, {"loss": 0.7858, "grad_norm": 2.0122673511505127, "learning_rate": 0.0002, "epoch": 7.908420644159876, "step": 10190}, {"loss": 0.7607, "grad_norm": 1.737443208694458, "learning_rate": 0.0002, "epoch": 7.916181606519208, "step": 10200}, {"loss": 0.8365, "grad_norm": 1.6381052732467651, "learning_rate": 0.0002, "epoch": 7.923942568878541, "step": 10210}, {"loss": 0.7855, "grad_norm": 1.8845038414001465, "learning_rate": 0.0002, "epoch": 7.931703531237874, "step": 10220}, {"loss": 0.8354, "grad_norm": 1.952194333076477, "learning_rate": 0.0002, "epoch": 7.939464493597206, "step": 10230}, {"loss": 0.8428, "grad_norm": 1.7254410982131958, "learning_rate": 0.0002, "epoch": 7.947225455956539, "step": 10240}, {"loss": 0.834, "grad_norm": 2.14776873588562, "learning_rate": 0.0002, "epoch": 7.954986418315871, "step": 10250}, {"loss": 0.8144, "grad_norm": 1.7655725479125977, "learning_rate": 0.0002, "epoch": 7.962747380675204, "step": 10260}, {"loss": 0.8176, "grad_norm": 1.7337331771850586, "learning_rate": 0.0002, "epoch": 7.9705083430345365, "step": 10270}, {"loss": 0.8652, "grad_norm": 1.7742228507995605, "learning_rate": 0.0002, "epoch": 7.9782693053938685, "step": 10280}, {"loss": 0.7788, "grad_norm": 1.892137050628662, "learning_rate": 0.0002, "epoch": 7.986030267753201, "step": 10290}, {"loss": 0.8243, "grad_norm": 1.8636300563812256, "learning_rate": 0.0002, "epoch": 7.993791230112534, "step": 10300}]}